4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT 3
37 struct ctdb_iface *prev, *next;
/*
  Return the name of the interface that a public address (vnn) is
  currently bound to, read from vnn->iface->name.
  NOTE(review): only the dereferencing return is visible in this listing;
  confirm that a NULL vnn->iface is handled on the lines that are missing.
 */
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
46 return vnn->iface->name;
/*
  Register an interface name in the ctdb->ifaces list.
  The list is first scanned for an entry with the same name. A new
  zero-initialised struct ctdb_iface is then allocated on ctdb, given a
  private copy of the name (allocated as a child of the new entry) and
  linked into ctdb->ifaces.
  NOTE(review): what is done when a matching entry is found, and the
  return values, are on lines missing from this listing.
 */
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
56 /* Verify that we don't have an entry for this interface yet */
57 for (i=ctdb->ifaces;i;i=i->next) {
58 if (strcmp(i->name, iface) == 0) {
63 /* create a new structure for this interface */
64 i = talloc_zero(ctdb, struct ctdb_iface);
65 CTDB_NO_MEMORY_FATAL(ctdb, i);
66 i->name = talloc_strdup(i, iface);
67 CTDB_NO_MEMORY(ctdb, i->name);
70 DLIST_ADD(ctdb->ifaces, i);
/*
  Search ctdb->ifaces for the entry whose name compares equal (strcmp)
  to the requested interface name.
  NOTE(review): the rest of the parameter list and the return statements
  are on lines missing from this listing.
 */
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
80 /* Walk the interface list looking for a matching name */
81 for (i=ctdb->ifaces;i;i=i->next) {
82 if (strcmp(i->name, iface) == 0) {
/*
  Choose an interface for a public address from its NULL-terminated
  vnn->ifaces name list. Every name is resolved with ctdb_find_iface()
  and candidates are compared on their references counter.
  NOTE(review): the treatment of names that do not resolve, of the first
  candidate, and the return statement are on lines missing from this
  listing.
 */
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
94 struct ctdb_iface *cur = NULL;
95 struct ctdb_iface *best = NULL;
97 for (i=0; vnn->ifaces[i]; i++) {
99 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
/* a candidate with fewer references than the current best is compared here */
113 if (cur->references < best->references) {
/*
  Bind a public address (vnn) to one of its candidate interfaces.
  The candidate is obtained from ctdb_vnn_best_iface() and vnn->pnn is set
  to the local node number (ctdb->pnn). Progress is logged at DEBUG_INFO,
  the failure to find an interface at DEBUG_ERR.
  NOTE(review): the conditions guarding the two early log messages, the
  assignment of vnn->iface and the return values are on lines missing
  from this listing.
 */
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123 struct ctdb_vnn *vnn)
125 struct ctdb_iface *best = NULL;
128 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129 "still assigned to iface '%s'\n",
130 ctdb_addr_to_str(&vnn->public_address),
131 ctdb_vnn_iface_string(vnn)));
135 best = ctdb_vnn_best_iface(ctdb, vnn);
137 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138 "cannot assign to iface any iface\n",
139 ctdb_addr_to_str(&vnn->public_address)));
/* record this node as the holder of the address */
145 vnn->pnn = ctdb->pnn;
147 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148 "now assigned to iface '%s' refs[%d]\n",
149 ctdb_addr_to_str(&vnn->public_address),
150 ctdb_vnn_iface_string(vnn),
/*
  Undo the binding between a public address and its interface.
  The old interface name and reference count are logged, the reference
  count of vnn->iface is decremented, and vnn->pnn is compared with the
  local node number.
  NOTE(review): the log statement tolerates a NULL vnn->iface for the
  reference count, but ctdb_vnn_iface_string() and the decrement below
  dereference it. Confirm that a NULL check exists on the lines missing
  from this listing.
 */
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156 struct ctdb_vnn *vnn)
158 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159 "now unassigned (old iface '%s' refs[%d])\n",
160 ctdb_addr_to_str(&vnn->public_address),
161 ctdb_vnn_iface_string(vnn),
162 vnn->iface?vnn->iface->references:0));
164 vnn->iface->references--;
/* only an address recorded as held by this node is affected by the branch below */
167 if (vnn->pnn == ctdb->pnn) {
/*
  Inspect the interfaces of a public address.
  The interface currently bound to the vnn is tested for link_up first;
  after that every name in the NULL-terminated vnn->ifaces list is
  resolved with ctdb_find_iface().
  NOTE(review): the per-interface test and the returned values are on
  lines missing from this listing; presumably true means at least one
  interface has link - confirm.
 */
172 static bool ctdb_vnn_available(struct ctdb_context *ctdb,
173 struct ctdb_vnn *vnn)
177 if (vnn->iface && vnn->iface->link_up) {
181 for (i=0; vnn->ifaces[i]; i++) {
182 struct ctdb_iface *cur;
184 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
/*
  State handed to ctdb_control_send_arp() through its private_data
  pointer. Besides the members shown, the handler also reads arp->addr
  and arp->count, which are declared on lines missing from this listing.
 */
197 struct ctdb_takeover_arp {
198 struct ctdb_context *ctdb;
/* connections to tickle; taken over from vnn->tcp_array by takeover_ip_callback() */
201 struct ctdb_tcp_array *tcparray;
/* public address this state belongs to */
202 struct ctdb_vnn *vnn;
/*
  Doubly linked list element wrapping one struct ctdb_tcp_connection.
  ctdb_control_tcp_client() links these into client->tcp_list.
 */
207 lists of tcp endpoints
209 struct ctdb_tcp_list {
210 struct ctdb_tcp_list *prev, *next;
211 struct ctdb_tcp_connection connection;
/*
  Doubly linked list element kept on ctdb->client_ip_list.
  Other code in this file also uses the members addr and client_id,
  which are declared on lines missing from this listing.
 */
215 list of clients to kill on IP release
217 struct ctdb_client_ip {
218 struct ctdb_client_ip *prev, *next;
/* back pointer used by the destructor to unlink from ctdb->client_ip_list */
219 struct ctdb_context *ctdb;
/*
  Timed event handler. private_data is a struct ctdb_takeover_arp.
  Each invocation sends an ARP for arp->addr on the interface bound to
  arp->vnn, then sends a tcp tickle for every connection in
  arp->tcparray. Once arp->count equals CTDB_ARP_REPEAT the repetition
  ends; otherwise the handler re-arms itself on arp->vnn->takeover_ctx.
  NOTE(review): tcparray->num is dereferenced below; confirm that a NULL
  arp->tcparray is excluded on the lines missing from this listing.
 */
226 send a gratuitous arp
228 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
229 struct timeval t, void *private_data)
231 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
232 struct ctdb_takeover_arp);
234 struct ctdb_tcp_array *tcparray;
235 const char *iface = ctdb_vnn_iface_string(arp->vnn);
237 ret = ctdb_sys_send_arp(&arp->addr, iface);
239 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
240 iface, strerror(errno)));
243 tcparray = arp->tcparray;
245 for (i=0;i<tcparray->num;i++) {
246 struct ctdb_tcp_connection *tcon;
248 tcon = &tcparray->connections[i];
249 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
250 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
251 ctdb_addr_to_str(&tcon->src_addr),
252 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
253 ret = ctdb_sys_send_tcp(
258 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
259 ctdb_addr_to_str(&tcon->src_addr)));
266 if (arp->count == CTDB_ARP_REPEAT) {
/* schedule the next round; presumably CTDB_ARP_INTERVAL seconds plus 100000 microseconds - confirm against timeval_current_ofs() */
271 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
272 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
273 ctdb_control_send_arp, arp);
/*
  Context passed from ctdb_control_takeover_ip() and
  ctdb_control_release_ip() to their event script completion callbacks.
 */
276 struct takeover_callback_state {
/* the control request that is answered once the event script has finished */
277 struct ctdb_req_control *c;
/* copy of the public address the request is about */
278 ctdb_sock_addr *addr;
279 struct ctdb_vnn *vnn;
/*
  Completion callback registered by ctdb_control_takeover_ip().
  private_data is a struct takeover_callback_state.
  On the failure path the control is answered with the script status.
  Otherwise a takeover_ctx is created on the vnn if there is none, a
  struct ctdb_takeover_arp is allocated on it, the vnn tcp_array is moved
  into that structure, ctdb_control_send_arp() is scheduled with a zero
  timeout and the control is answered with 0. The reply with -1 belongs
  to the error path.
  NOTE(review): the exact conditions and the cleanup of state are on
  lines missing from this listing.
 */
283 called when takeip event finishes
285 static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
288 struct takeover_callback_state *state =
289 talloc_get_type(private_data, struct takeover_callback_state);
290 struct ctdb_takeover_arp *arp;
291 struct ctdb_tcp_array *tcparray;
294 if (status == -ETIME) {
297 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
298 ctdb_addr_to_str(state->addr),
299 ctdb_vnn_iface_string(state->vnn)));
300 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/* lazily create the talloc context that owns the arp state and its timers */
305 if (!state->vnn->takeover_ctx) {
306 state->vnn->takeover_ctx = talloc_new(state->vnn);
307 if (!state->vnn->takeover_ctx) {
312 arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
313 if (!arp) goto failed;
316 arp->addr = *state->addr;
317 arp->vnn = state->vnn;
319 tcparray = state->vnn->tcp_array;
321 /* add all of the known tcp connections for this IP to the
322 list of tcp connections to send tickle acks for */
323 arp->tcparray = talloc_steal(arp, tcparray);
/* the array now belongs to arp, so the vnn starts over with an empty one */
325 state->vnn->tcp_array = NULL;
326 state->vnn->tcp_update_needed = true;
329 event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx,
330 timeval_zero(), ctdb_control_send_arp, arp);
332 /* the control succeeded */
333 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
338 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
/*
  Walk the ctdb->vnn list and compare each public_address with addr
  using ctdb_same_ip().
  NOTE(review): the older description below speaks of returning -1, but
  the function returns a struct ctdb_vnn pointer and callers store the
  result in a pointer. Presumably NULL is returned for an unknown
  address; the return statements are on lines missing from this listing.
 */
344 Find the vnn of the node that has a public ip address
345 returns -1 if the address is not known as a public address
347 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
349 struct ctdb_vnn *vnn;
351 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
352 if (ctdb_same_ip(&vnn->public_address, addr)) {
/*
  Control handler that makes this node take over a public address.
  indata.dptr is interpreted as a struct ctdb_public_ip.
  Steps visible here: look up the vnn for the address, check whether the
  kernel already holds the address, bind the vnn to an interface with
  ctdb_vnn_assign_iface(), allocate a takeover_callback_state and start
  the event script through ctdb_event_script_callback() with
  takeover_ip_callback() as completion handler.
  NOTE(review): the remaining parameters, the event type passed to the
  script call and all return values are on lines missing from this
  listing.
 */
361 take over an ip address
363 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
364 struct ctdb_req_control *c,
369 struct takeover_callback_state *state;
370 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
371 struct ctdb_vnn *vnn;
373 /* find the vnn for this address */
374 vnn = find_public_ip_vnn(ctdb, &pip->addr);
376 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
377 ctdb_addr_to_str(&pip->addr)));
382 /* if our kernel already has this IP, do nothing */
383 if (ctdb_sys_have_ip(&pip->addr)) {
387 ret = ctdb_vnn_assign_iface(ctdb, vnn);
389 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
390 "assin a usable interface\n",
391 ctdb_addr_to_str(&pip->addr),
392 vnn->public_netmask_bits));
396 state = talloc(vnn, struct takeover_callback_state);
397 CTDB_NO_MEMORY(ctdb, state);
/* NOTE(review): the request and the address copy are parented to ctdb here, whereas ctdb_control_release_ip() parents both to state. Confirm they are released elsewhere, otherwise they outlive state. */
399 state->c = talloc_steal(ctdb, c);
400 state->addr = talloc(ctdb, ctdb_sock_addr);
401 CTDB_NO_MEMORY(ctdb, state->addr);
403 *state->addr = pip->addr;
406 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
407 ctdb_addr_to_str(&pip->addr),
408 vnn->public_netmask_bits,
409 ctdb_vnn_iface_string(vnn)));
411 ret = ctdb_event_script_callback(ctdb,
412 state, takeover_ip_callback, state,
416 ctdb_vnn_iface_string(vnn),
417 ctdb_addr_to_str(&pip->addr),
418 vnn->public_netmask_bits);
421 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
422 ctdb_addr_to_str(&pip->addr),
423 ctdb_vnn_iface_string(vnn)));
428 /* tell ctdb_control.c that we will be replying asynchronously */
/*
  Compatibility wrapper for the old ipv4-only takeover control.
  A zeroed struct ctdb_public_ip is allocated on the request c, the
  incoming payload is copied over its start, and the result is forwarded
  to ctdb_control_takeover_ip().
  NOTE(review): the copy length is indata.dsize while the destination
  holds sizeof(struct ctdb_public_ip) bytes. Confirm that the caller
  validates indata.dsize before this function runs.
 */
435 takeover an ip address old v4 style
437 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb,
438 struct ctdb_req_control *c,
444 data.dsize = sizeof(struct ctdb_public_ip);
445 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
446 CTDB_NO_MEMORY(ctdb, data.dptr);
448 memcpy(data.dptr, indata.dptr, indata.dsize);
449 return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
/*
  Walk ctdb->client_ip_list and, for every entry matching addr, look up
  the owning client with ctdb_reqid_find() and send SIGKILL to its pid
  when that pid is non-zero.
  NOTE(review): how tmp_addr is filled in and whether the looked-up
  client is checked for NULL are on lines missing from this listing.
 */
453 kill any clients that are registered with a IP that is being released
455 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
457 struct ctdb_client_ip *ip;
459 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
460 ctdb_addr_to_str(addr)));
462 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
463 ctdb_sock_addr tmp_addr;
466 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
468 ctdb_addr_to_str(&ip->addr)));
470 if (ctdb_same_ip(&tmp_addr, addr)) {
471 struct ctdb_client *client = ctdb_reqid_find(ctdb,
474 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
476 ctdb_addr_to_str(&ip->addr),
/* a pid of 0 is never signalled */
479 if (client->pid != 0) {
480 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
481 (unsigned)client->pid,
482 ctdb_addr_to_str(addr),
484 kill(client->pid, SIGKILL);
/*
  Completion callback registered by ctdb_control_release_ip().
  private_data is a struct takeover_callback_state.
  The address is rendered as a string and sent to the local node under
  CTDB_SRVID_RELEASE_IP, clients registered with the address are killed
  via release_kill_clients(), the vnn is unbound from its interface and
  the pending control is answered with 0.
  NOTE(review): the handling of the -ETIME status and the cleanup of
  state are on lines missing from this listing.
 */
491 called when releaseip event finishes
493 static void release_ip_callback(struct ctdb_context *ctdb, int status,
496 struct takeover_callback_state *state =
497 talloc_get_type(private_data, struct takeover_callback_state);
500 if (status == -ETIME) {
504 /* send a message to all clients of this node telling them
505 that the cluster has been reconfigured and they should
506 release any sockets on this IP */
507 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
508 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
/* the terminating NUL is part of the message */
509 data.dsize = strlen((char *)data.dptr)+1;
511 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
513 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
515 /* kill clients that have registered with this IP */
516 release_kill_clients(ctdb, state->addr);
518 ctdb_vnn_unassign_iface(ctdb, state->vnn);
520 /* the control succeeded */
521 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
/*
  Control handler that makes this node give up a public address.
  indata.dptr is interpreted as a struct ctdb_public_ip.
  Steps visible here: look up the vnn, free vnn->takeover_ctx (which
  owns pending arp state and timers), handle the case where the kernel
  does not hold the address by unbinding the interface, and otherwise
  allocate a takeover_callback_state and run the CTDB_EVENT_RELEASE_IP
  script with release_ip_callback() as completion handler.
  NOTE(review): the remaining parameters and all return values are on
  lines missing from this listing.
 */
526 release an ip address
528 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
529 struct ctdb_req_control *c,
534 struct takeover_callback_state *state;
535 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
536 struct ctdb_vnn *vnn;
538 /* update our vnn list */
539 vnn = find_public_ip_vnn(ctdb, &pip->addr);
541 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
542 ctdb_addr_to_str(&pip->addr)));
547 /* stop any previous arps */
548 talloc_free(vnn->takeover_ctx);
549 vnn->takeover_ctx = NULL;
551 if (!ctdb_sys_have_ip(&pip->addr)) {
552 DEBUG(DEBUG_NOTICE,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
553 ctdb_addr_to_str(&pip->addr),
554 vnn->public_netmask_bits,
555 ctdb_vnn_iface_string(vnn)));
556 ctdb_vnn_unassign_iface(ctdb, vnn);
560 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%u\n",
561 ctdb_addr_to_str(&pip->addr),
562 vnn->public_netmask_bits,
563 ctdb_vnn_iface_string(vnn),
566 state = talloc(ctdb, struct takeover_callback_state);
567 CTDB_NO_MEMORY(ctdb, state);
/* both the request and the address copy become children of state */
569 state->c = talloc_steal(state, c);
570 state->addr = talloc(state, ctdb_sock_addr);
571 CTDB_NO_MEMORY(ctdb, state->addr);
572 *state->addr = pip->addr;
575 ret = ctdb_event_script_callback(ctdb,
576 state, release_ip_callback, state,
578 CTDB_EVENT_RELEASE_IP,
580 ctdb_vnn_iface_string(vnn),
581 ctdb_addr_to_str(&pip->addr),
582 vnn->public_netmask_bits);
584 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
585 ctdb_addr_to_str(&pip->addr),
586 ctdb_vnn_iface_string(vnn)));
591 /* tell the control that we will be replying asynchronously */
/*
  Compatibility wrapper for the old ipv4-only release control.
  A zeroed struct ctdb_public_ip is allocated on the request c, the
  incoming payload is copied over its start, and the result is forwarded
  to ctdb_control_release_ip().
  NOTE(review): the copy length is indata.dsize while the destination
  holds sizeof(struct ctdb_public_ip) bytes. Confirm that the caller
  validates indata.dsize before this function runs.
 */
597 release an ip address old v4 style
599 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
600 struct ctdb_req_control *c,
606 data.dsize = sizeof(struct ctdb_public_ip);
607 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
608 CTDB_NO_MEMORY(ctdb, data.dptr);
610 memcpy(data.dptr, indata.dptr, indata.dsize);
611 return ctdb_control_release_ip(ctdb, c, data, async_reply);
/*
  Add one public address to ctdb->vnn.
  addr and mask describe the address; ifaces is a comma separated list
  of interface names. An address already present in ctdb->vnn is
  reported at DEBUG_CRIT. Otherwise a zeroed struct ctdb_vnn is created,
  the interface list is split into a NULL-terminated array of copied
  names, every name is registered with ctdb_add_local_iface() and the
  vnn is linked into ctdb->vnn.
  NOTE(review): the increment of num, the return values and further
  initialisation are on lines missing from this listing.
 */
615 static int ctdb_add_public_address(struct ctdb_context *ctdb,
616 ctdb_sock_addr *addr,
617 unsigned mask, const char *ifaces)
619 struct ctdb_vnn *vnn;
626 /* Verify that we don't have an entry for this ip yet */
627 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
628 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
629 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
630 ctdb_addr_to_str(addr)));
635 /* create a new vnn structure for this ip address */
636 vnn = talloc_zero(ctdb, struct ctdb_vnn);
637 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
/* NOTE(review): no allocation check is visible for this initial array; the realloc inside the loop is checked */
638 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
/* strtok() writes into its argument, so a private copy of the list is tokenised */
639 tmp = talloc_strdup(vnn, ifaces);
640 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
/* NOTE(review): strtok() keeps hidden static state; confirm no caller is itself in the middle of a strtok() sequence */
641 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
642 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
643 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
644 vnn->ifaces[num] = talloc_strdup(vnn, iface);
645 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
/* terminate the name array */
649 vnn->ifaces[num] = NULL;
650 vnn->public_address = *addr;
651 vnn->public_netmask_bits = mask;
654 for (i=0; vnn->ifaces[i]; i++) {
655 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
657 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
658 "for public_address[%s]\n",
659 vnn->ifaces[i], ctdb_addr_to_str(addr)));
665 DLIST_ADD(ctdb->vnn, vnn);
/*
  Store a copy of script_dir, allocated on ctdb, in
  ctdb->event_script_dir. CTDB_NO_MEMORY handles a failed allocation.
  NOTE(review): the return statement is on a line missing from this
  listing.
 */
671 setup the event script directory
673 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
675 ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
676 CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
/*
  Load the public address file named by alist and add every entry with
  ctdb_add_public_address().
  Trailing empty lines are dropped, leading blanks and tabs on each line
  are skipped and empty lines are tested for. Each line is split on
  blanks and tabs; when the interface token is absent,
  ctdb->default_public_interface is used, and it is an error if that is
  not set either. The address token is parsed with parse_ip_mask().
  NOTE(review): return values and the assignment of addrstr are on lines
  missing from this listing.
 */
681 setup the public address lists from a file
683 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
689 lines = file_lines_load(alist, &nlines, ctdb);
691 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
/* ignore empty lines at the end of the file */
694 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
698 for (i=0;i<nlines;i++) {
706 while ((*line == ' ') || (*line == '\t')) {
712 if (strcmp(line, "") == 0) {
715 tok = strtok(line, " \t");
717 tok = strtok(NULL, " \t");
719 if (NULL == ctdb->default_public_interface) {
720 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
725 ifaces = ctdb->default_public_interface;
730 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
731 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
735 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
736 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
/*
  Configure the single public address of this node.
  A zeroed struct ctdb_vnn is allocated on ctdb with a two element
  interface name array (one name plus the NULL terminator), the address
  string is parsed with parse_ip(), the interface is registered with
  ctdb_add_local_iface(), the vnn is bound with ctdb_vnn_assign_iface()
  and finally stored in ctdb->single_ip_vnn.
  NOTE(review): the remaining parameters, the error branches and the
  return values are on lines missing from this listing.
 */
746 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
750 struct ctdb_vnn *svnn;
754 svnn = talloc_zero(ctdb, struct ctdb_vnn);
755 CTDB_NO_MEMORY(ctdb, svnn);
757 svnn->ifaces = talloc_array(svnn, const char *, 2);
758 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
759 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
760 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
761 svnn->ifaces[1] = NULL;
763 ok = parse_ip(ip, iface, 0, &svnn->public_address);
769 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
771 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
772 "for single_ip[%s]\n",
774 ctdb_addr_to_str(&svnn->public_address)));
779 ret = ctdb_vnn_assign_iface(ctdb, svnn);
785 ctdb->single_ip_vnn = svnn;
/*
  Singly linked list element describing one public address during an
  allocation run. Other code in this file also uses the members pnn and
  addr, which are declared on lines missing from this listing.
 */
789 struct ctdb_public_ip_list {
790 struct ctdb_public_ip_list *next;
/*
  Walk the ips list and test each entry for ips->pnn == pnn.
  NOTE(review): the counter and the return statement are on lines
  missing from this listing; presumably the number of matches is
  returned.
 */
796 /* Given a physical node, return the number of
797 public addresses that is currently assigned to this node.
799 static int node_ip_coverage(struct ctdb_context *ctdb,
801 struct ctdb_public_ip_list *ips)
805 for (;ips;ips=ips->next) {
806 if (ips->pnn == pnn) {
// Look for ip->addr in the available_public_ips list of node pnn,
// comparing with ctdb_same_ip(). A node without such a list is handled
// by the NULL test below.
// NOTE(review): the return values are on lines missing from this listing.
// Callers in this file treat a non-zero result as "cannot serve", so
// presumably 0 means the node can serve the address - confirm.
814 /* Check if this is a public ip known to the node, i.e. can that
815 node takeover this ip ?
817 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn,
818 struct ctdb_public_ip_list *ip)
820 struct ctdb_all_public_ips *public_ips;
823 public_ips = ctdb->nodes[pnn]->available_public_ips;
825 if (public_ips == NULL) {
829 for (i=0;i<public_ips->num;i++) {
830 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
831 /* yes, this node can serve this public ip */
/*
  Select a node to host ip.
  Nodes whose flags intersect mask are passed over, as are nodes for
  which can_node_serve_ip() returns non-zero. For the remaining nodes
  the current coverage is obtained from node_ip_coverage() over all_ips.
  A warning is logged when no node could be found.
  NOTE(review): the comparison that picks the least loaded node, the
  assignment of ip->pnn and the return values are on lines missing from
  this listing.
 */
840 /* search the node lists list for a node to takeover this ip.
841 pick the node that currently are serving the least number of ips
842 so that the ips get spread out evenly.
844 static int find_takeover_node(struct ctdb_context *ctdb,
845 struct ctdb_node_map *nodemap, uint32_t mask,
846 struct ctdb_public_ip_list *ip,
847 struct ctdb_public_ip_list *all_ips)
853 for (i=0;i<nodemap->num;i++) {
854 if (nodemap->nodes[i].flags & mask) {
855 /* This node is not healty and can not be used to serve
861 /* verify that this node can serve this ip */
862 if (can_node_serve_ip(ctdb, i, ip)) {
863 /* no it couldn't, so skip to the next node */
867 num = node_ip_coverage(ctdb, i, all_ips);
868 /* was this the first node we checked ? */
880 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
881 ctdb_addr_to_str(&ip->addr)));
/*
  Build a tree key of IP_KEYLEN 32-bit words from an address.
  The key buffer is zeroed first. One branch of the switch fills only
  key[3] from the ipv4 address, another fills key[0] to key[3] from the
  four 32-bit words of the ipv6 address; an unknown family is logged.
  The buffer is a function-local static, so every call overwrites the
  previous key and the function is not reentrant.
  NOTE(review): the case labels and the return statement are on lines
  missing from this listing; presumably a pointer to key is returned.
 */
891 static uint32_t *ip_key(ctdb_sock_addr *ip)
893 static uint32_t key[IP_KEYLEN];
895 bzero(key, sizeof(key));
897 switch (ip->sa.sa_family) {
899 key[3] = htonl(ip->ip.sin_addr.s_addr);
902 key[0] = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
903 key[1] = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
904 key[2] = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
905 key[3] = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
908 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
/*
  NOTE(review): only the signature is visible in this listing.
  Presumably this is the insert callback handed to
  trbt_insertarray32_callback() in create_merged_ip_list() - confirm.
 */
915 static void *add_ip_callback(void *parm, void *data)
/*
  Tree traversal callback used by create_merged_ip_list().
  param points at the head pointer of a struct ctdb_public_ip_list chain
  and data is the element being visited; the element is linked in front
  of the current head.
  NOTE(review): the update of the head pointer itself is on a line
  missing from this listing.
 */
920 void getips_count_callback(void *param, void *data)
922 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
923 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
925 new_ip->next = *ip_list;
/*
  Build one list of all public addresses known to any node.
  A tree is created on tmp_ctx. For every node the known_public_ips
  list is read; the NODE_FLAGS_DELETED flag and a NULL list are tested
  for. Each address becomes a zeroed struct ctdb_public_ip_list
  allocated on tmp_ctx, carrying the pnn and addr of the source entry,
  and is inserted into the tree under the key produced by ip_key().
  Finally the tree is traversed with getips_count_callback() to collect
  the elements into ip_list.
  NOTE(review): the initial value of ip_list, the remaining arguments of
  the insert call and the return statement are on lines missing from
  this listing.
 */
929 struct ctdb_public_ip_list *
930 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
933 struct ctdb_public_ip_list *ip_list;
934 struct ctdb_all_public_ips *public_ips;
935 trbt_tree_t *ip_tree;
937 ip_tree = trbt_create(tmp_ctx, 0);
939 for (i=0;i<ctdb->num_nodes;i++) {
940 public_ips = ctdb->nodes[i]->known_public_ips;
942 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
946 /* there were no public ips for this node */
947 if (public_ips == NULL) {
951 for (j=0;j<public_ips->num;j++) {
952 struct ctdb_public_ip_list *tmp_ip;
954 tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
955 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
956 tmp_ip->pnn = public_ips->ips[j].pnn;
957 tmp_ip->addr = public_ips->ips[j].addr;
960 trbt_insertarray32_callback(ip_tree,
961 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
968 trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
/*
  Recompute which node hosts each public address and push the result to
  the cluster. All temporary data lives on tmp_ctx, which is freed on
  the exits visible below.

  Phases visible in this listing:
   1. Test every node for NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED. If
      num_healthy is positive, both flags are masked out; otherwise only
      NODE_FLAGS_INACTIVE is masked, so disabled nodes become eligible.
   2. Build the merged address list with create_merged_ip_list().
   3. With the deterministic_public_ips tunable set to 1, assign address
      number i to node i modulo nodemap->num.
   4. Test each assigned address against the mask and against
      can_node_serve_ip() for its node.
   5. Call find_takeover_node() for every address whose pnn is -1.
   6. Rebalance by comparing the highest and lowest coverage among the
      eligible nodes. The no_ip_failback and deterministic_public_ips
      tunables are tested around this phase.
   7. Send RELEASE_IP (or RELEASE_IPv4 for AF_INET addresses) to the
      nodes that are not disconnected, skipping addresses the node
      should serve, and wait for completion.
   8. Send TAKEOVER_IP (or TAKEOVER_IPv4) to the assigned node of every
      address whose pnn is not -1 and wait for completion.

  NOTE(review): many statements, including the retry logic and all
  return values, are on lines missing from this listing. Several comment
  openers below have lost their closing line, so no comments have been
  added inside the body.
 */
974 make any IP alias changes for public addresses that are necessary
976 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
978 int i, num_healthy, retries;
979 struct ctdb_public_ip ip;
980 struct ctdb_public_ipv4 ipv4;
982 struct ctdb_public_ip_list *all_ips, *tmp_ip;
983 int maxnode, maxnum=0, minnode, minnum=0, num;
985 struct timeval timeout;
986 struct client_async_data *async_data;
987 struct ctdb_client_control_state *state;
988 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
993 /* Count how many completely healthy nodes we have */
995 for (i=0;i<nodemap->num;i++) {
996 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1001 if (num_healthy > 0) {
1002 /* We have healthy nodes, so only consider them for
1003 serving public addresses
1005 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
1007 /* We didnt have any completely healthy nodes so
1008 use "disabled" nodes as a fallback
1010 mask = NODE_FLAGS_INACTIVE;
1013 /* since nodes only know about those public addresses that
1014 can be served by that particular node, no single node has
1015 a full list of all public addresses that exist in the cluster.
1016 Walk over all node structures and create a merged list of
1017 all public addresses that exist in the cluster.
1019 all_ips = create_merged_ip_list(ctdb, tmp_ctx);
1021 /* If we want deterministic ip allocations, i.e. that the ip addresses
1022 will always be allocated the same way for a specific set of
1023 available/unavailable nodes.
1025 if (1 == ctdb->tunable.deterministic_public_ips) {
1026 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1027 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1028 tmp_ip->pnn = i%nodemap->num;
1033 /* mark all public addresses with a masked node as being served by
1036 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1037 if (tmp_ip->pnn == -1) {
1040 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1045 /* verify that the assigned nodes can serve that public ip
1046 and set it to -1 if not
1048 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1049 if (tmp_ip->pnn == -1) {
1052 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1053 /* this node can not serve this ip. */
1059 /* now we must redistribute all public addresses with takeover node
1060 -1 among the nodes available
1064 /* loop over all ip's and find a physical node to cover for
1067 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1068 if (tmp_ip->pnn == -1) {
1069 if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1070 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1071 ctdb_addr_to_str(&tmp_ip->addr)));
1076 /* If we dont want ips to fail back after a node becomes healthy
1077 again, we wont even try to reallocat the ip addresses so that
1078 they are evenly spread out.
1079 This can NOT be used at the same time as DeterministicIPs !
1081 if (1 == ctdb->tunable.no_ip_failback) {
1082 if (1 == ctdb->tunable.deterministic_public_ips) {
1083 DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1089 /* now, try to make sure the ip adresses are evenly distributed
1091 for each ip address, loop over all nodes that can serve this
1092 ip and make sure that the difference between the node
1093 serving the most and the node serving the least ip's are not greater
1096 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1097 if (tmp_ip->pnn == -1) {
1101 /* Get the highest and lowest number of ips's served by any
1102 valid node which can serve this ip.
1106 for (i=0;i<nodemap->num;i++) {
1107 if (nodemap->nodes[i].flags & mask) {
1111 /* only check nodes that can actually serve this ip */
1112 if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1113 /* no it couldnt so skip to the next node */
1117 num = node_ip_coverage(ctdb, i, all_ips);
1118 if (maxnode == -1) {
1127 if (minnode == -1) {
1137 if (maxnode == -1) {
1138 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1139 ctdb_addr_to_str(&tmp_ip->addr)));
1144 /* If we want deterministic IPs then dont try to reallocate
1145 them to spread out the load.
1147 if (1 == ctdb->tunable.deterministic_public_ips) {
1151 /* if the spread between the smallest and largest coverage by
1152 a node is >=2 we steal one of the ips from the node with
1153 most coverage to even things out a bit.
1154 try to do this at most 5 times since we dont want to spend
1155 too much time balancing the ip coverage.
1157 if ( (maxnum > minnum+1)
1159 struct ctdb_public_ip_list *tmp;
1161 /* mark one of maxnode's vnn's as unassigned and try
1164 for (tmp=all_ips;tmp;tmp=tmp->next) {
1165 if (tmp->pnn == maxnode) {
1175 /* finished distributing the public addresses, now just send the
1176 info out to the nodes
1180 /* at this point ->pnn is the node which will own each IP
1181 or -1 if there is no node that can cover this ip
1184 /* now tell all nodes to delete any alias that they should not
1185 have. This will be a NOOP on nodes that don't currently
1186 hold the given alias */
1187 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1188 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1190 for (i=0;i<nodemap->num;i++) {
1191 /* don't talk to unconnected nodes, but do talk to banned nodes */
1192 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1196 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1197 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1198 /* This node should be serving this
1199 vnn so dont tell it to release the ip
1203 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1204 ipv4.pnn = tmp_ip->pnn;
1205 ipv4.sin = tmp_ip->addr.ip;
1207 timeout = TAKEOVER_TIMEOUT();
1208 data.dsize = sizeof(ipv4);
1209 data.dptr = (uint8_t *)&ipv4;
1210 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1211 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1215 ip.pnn = tmp_ip->pnn;
1216 ip.addr = tmp_ip->addr;
1218 timeout = TAKEOVER_TIMEOUT();
1219 data.dsize = sizeof(ip);
1220 data.dptr = (uint8_t *)&ip;
1221 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1222 0, CTDB_CONTROL_RELEASE_IP, 0,
1227 if (state == NULL) {
1228 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1229 talloc_free(tmp_ctx);
1233 ctdb_client_async_add(async_data, state);
1236 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1237 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1238 talloc_free(tmp_ctx);
1241 talloc_free(async_data);
1244 /* tell all nodes to get their own IPs */
1245 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1246 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1247 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1248 if (tmp_ip->pnn == -1) {
1249 /* this IP won't be taken over */
1253 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1254 ipv4.pnn = tmp_ip->pnn;
1255 ipv4.sin = tmp_ip->addr.ip;
1257 timeout = TAKEOVER_TIMEOUT();
1258 data.dsize = sizeof(ipv4);
1259 data.dptr = (uint8_t *)&ipv4;
1260 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1261 0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1265 ip.pnn = tmp_ip->pnn;
1266 ip.addr = tmp_ip->addr;
1268 timeout = TAKEOVER_TIMEOUT();
1269 data.dsize = sizeof(ip);
1270 data.dptr = (uint8_t *)&ip;
1271 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1272 0, CTDB_CONTROL_TAKEOVER_IP, 0,
1276 if (state == NULL) {
1277 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1278 talloc_free(tmp_ctx);
1282 ctdb_client_async_add(async_data, state);
1284 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1285 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1286 talloc_free(tmp_ctx);
1290 talloc_free(tmp_ctx);
/*
  talloc destructor installed on every struct ctdb_client_ip by
  ctdb_control_tcp_client(). It logs the address and unlinks the element
  from ctdb->client_ip_list, so freeing the element (or its talloc
  parent) keeps the list consistent.
  NOTE(review): the return statement is on a line missing from this
  listing.
 */
1296 destroy a ctdb_client_ip structure
1298 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1300 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1301 ctdb_addr_to_str(&ip->addr),
1302 ntohs(ip->addr.ip.sin_port),
1305 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
/*
  Control handler through which a local client registers a TCP
  connection.
  The payload format is chosen by indata.dsize: a struct
  ctdb_control_tcp (old ipv4 layout) is converted into a zeroed struct
  ctdb_control_tcp_addr, a struct ctdb_control_tcp_addr is used as is,
  and any other size is rejected with an error message.
  Both endpoints are canonicalised, and the destination address is
  looked up as a public address; the registration is refused with a log
  message when vnn->pnn differs from the local node number.
  For an accepted connection a struct ctdb_client_ip (with destructor)
  and a struct ctdb_tcp_list are allocated as children of the client and
  linked into ctdb->client_ip_list and client->tcp_list, and the
  connection is broadcast to all connected nodes with
  CTDB_CONTROL_TCP_ADD.
  NOTE(review): the case labels, several assignments and all return
  values are on lines missing from this listing.
 */
1310 called by a client to inform us of a TCP connection that it is managing
1311 that should tickled with an ACK when IP takeover is done
1312 we handle both the old ipv4 style of packets as well as the new ipv4/6
1315 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1318 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1319 struct ctdb_control_tcp *old_addr = NULL;
1320 struct ctdb_control_tcp_addr new_addr;
1321 struct ctdb_control_tcp_addr *tcp_sock = NULL;
1322 struct ctdb_tcp_list *tcp;
1323 struct ctdb_control_tcp_vnn t;
1326 struct ctdb_client_ip *ip;
1327 struct ctdb_vnn *vnn;
1328 ctdb_sock_addr addr;
1330 switch (indata.dsize) {
1331 case sizeof(struct ctdb_control_tcp):
1332 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1333 ZERO_STRUCT(new_addr);
1334 tcp_sock = &new_addr;
1335 tcp_sock->src.ip = old_addr->src;
1336 tcp_sock->dest.ip = old_addr->dest;
1338 case sizeof(struct ctdb_control_tcp_addr):
1339 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1342 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1343 "to ctdb_control_tcp_client. size was %d but "
1344 "only allowed sizes are %lu and %lu\n",
1346 (long unsigned)sizeof(struct ctdb_control_tcp),
1347 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
/* canonicalise through a temporary copy because source and destination of the call would otherwise be the same object */
1351 addr = tcp_sock->src;
1352 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
1353 addr = tcp_sock->dest;
1354 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1357 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1358 vnn = find_public_ip_vnn(ctdb, &addr);
1360 switch (addr.sa.sa_family) {
/* a loopback destination is not reported as an error */
1362 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1363 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
1364 ctdb_addr_to_str(&addr)));
1368 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
1369 ctdb_addr_to_str(&addr)));
1372 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1378 if (vnn->pnn != ctdb->pnn) {
1379 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1380 ctdb_addr_to_str(&addr),
1381 client_id, client->pid));
1382 /* failing this call will tell smbd to die */
1386 ip = talloc(client, struct ctdb_client_ip);
1387 CTDB_NO_MEMORY(ctdb, ip);
1391 ip->client_id = client_id;
1392 talloc_set_destructor(ip, ctdb_client_ip_destructor);
1393 DLIST_ADD(ctdb->client_ip_list, ip);
1395 tcp = talloc(client, struct ctdb_tcp_list);
1396 CTDB_NO_MEMORY(ctdb, tcp);
1398 tcp->connection.src_addr = tcp_sock->src;
1399 tcp->connection.dst_addr = tcp_sock->dest;
1401 DLIST_ADD(client->tcp_list, tcp);
1403 t.src = tcp_sock->src;
1404 t.dest = tcp_sock->dest;
1406 data.dptr = (uint8_t *)&t;
1407 data.dsize = sizeof(t);
1409 switch (addr.sa.sa_family) {
1411 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1412 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
1413 ctdb_addr_to_str(&tcp_sock->src),
1414 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1417 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1418 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
1419 ctdb_addr_to_str(&tcp_sock->src),
1420 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1423 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1427 /* tell all nodes about this tcp connection */
1428 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
1429 CTDB_CONTROL_TCP_ADD,
1430 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1432 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
/*
  Look for a connection in array whose source and destination socket
  addresses both equal those of tcp (compared with ctdb_same_sockaddr)
  and return a pointer to the matching element inside the array.
  A NULL array is tested for up front.
  NOTE(review): the values returned for a NULL array and for no match
  are on lines missing from this listing; presumably NULL.
 */
1440 find a tcp address on a list
1442 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
1443 struct ctdb_tcp_connection *tcp)
1447 if (array == NULL) {
1451 for (i=0;i<array->num;i++) {
1452 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1453 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1454 return &array->connections[i];
/*
  Control handler that records a TCP connection for a public address.
  indata.dptr is interpreted as a struct ctdb_control_tcp_vnn.
  The destination address is looked up as a public address; a message is
  logged when it is not one. When the vnn has no tcp_array yet, one is
  created with room for a single connection and the connection is
  stored. Otherwise a connection that ctdb_tcp_find() already locates is
  only logged, and a new one is appended after growing the connections
  array with talloc_realloc().
  NOTE(review): the new element count passed to talloc_realloc(), the
  updates of tcparray->num and all return values are on lines missing
  from this listing.
 */
1461 called by a daemon to inform us of a TCP connection that one of its
1462 clients managing that should tickled with an ACK when IP takeover is
1465 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1467 struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1468 struct ctdb_tcp_array *tcparray;
1469 struct ctdb_tcp_connection tcp;
1470 struct ctdb_vnn *vnn;
1472 vnn = find_public_ip_vnn(ctdb, &p->dest);
1474 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1475 ctdb_addr_to_str(&p->dest)));
1481 tcparray = vnn->tcp_array;
1483 /* If this is the first tickle */
1484 if (tcparray == NULL) {
/* NOTE(review): talloc_size() does not zero the memory; confirm that tcparray->num is initialised on a missing line before it is used as an index below */
1485 tcparray = talloc_size(ctdb->nodes,
1486 offsetof(struct ctdb_tcp_array, connections) +
1487 sizeof(struct ctdb_tcp_connection) * 1);
1488 CTDB_NO_MEMORY(ctdb, tcparray);
1489 vnn->tcp_array = tcparray;
1492 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1493 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1495 tcparray->connections[tcparray->num].src_addr = p->src;
1496 tcparray->connections[tcparray->num].dst_addr = p->dest;
1502 /* Do we already have this tickle ?*/
1503 tcp.src_addr = p->src;
1504 tcp.dst_addr = p->dest;
1505 if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1506 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1507 ctdb_addr_to_str(&tcp.dst_addr),
1508 ntohs(tcp.dst_addr.ip.sin_port),
1513 /* A new tickle, we must add it to the array */
1514 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1515 struct ctdb_tcp_connection,
1517 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1519 vnn->tcp_array = tcparray;
1520 tcparray->connections[tcparray->num].src_addr = p->src;
1521 tcparray->connections[tcparray->num].dst_addr = p->dest;
1524 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1525 ctdb_addr_to_str(&tcp.dst_addr),
1526 ntohs(tcp.dst_addr.ip.sin_port),
1534 called when a client TCP connection goes away: removes the matching
1535 tickle entry from the array kept for the public address it was connected to
// Remove `conn` from the tickle array of the public address conn->dst_addr.
// The entry is located with ctdb_tcp_find, overwritten with the last element
// of the array, and the element count is decremented. When the count reaches
// zero the whole array is freed and vnn->tcp_array is reset to NULL.
// vnn->tcp_update_needed is set so the change can be propagated later.
// NOTE(review): short lines (NULL tests on vnn and tcpp, early returns,
// closing braces) are not visible in this excerpt; each DEBUG message below
// presumably sits in an early-return branch - confirm.
1538 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1540 struct ctdb_tcp_connection *tcpp;
1541 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1544 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1545 ctdb_addr_to_str(&conn->dst_addr)));
1549 /* if the array is empty we cant remove it
1550 and we dont need to do anything
1552 if (vnn->tcp_array == NULL) {
1553 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1554 ctdb_addr_to_str(&conn->dst_addr),
1555 ntohs(conn->dst_addr.ip.sin_port)));
1560 /* See if we know this connection
1561 if we dont know this connection then we dont need to do anything
1563 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1565 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1566 ctdb_addr_to_str(&conn->dst_addr),
1567 ntohs(conn->dst_addr.ip.sin_port)));
1572 /* We need to remove this entry from the array.
1573 Instead of allocating a new array and copying data to it
1574 we cheat and just copy the last entry in the existing array
1575 to the entry that is to be removed and just shring the
1578 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1579 vnn->tcp_array->num--;
1581 /* If we deleted the last entry we also need to remove the entire array
1583 if (vnn->tcp_array->num == 0) {
1584 talloc_free(vnn->tcp_array);
1585 vnn->tcp_array = NULL;
1588 vnn->tcp_update_needed = true;
// Unlike the earlier messages, this one reports the source address and port.
1590 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1591 ctdb_addr_to_str(&conn->src_addr),
1592 ntohs(conn->src_addr.ip.sin_port)));
1597 called when a daemon restarts - send all tickles for all public addresses
1598 we are serving immediately to the new node.
// Control handler for the startup notification of node `vnn`.
// The only visible body line is a to-do comment: sending the tickle lists to
// the restarted node is not implemented in the lines shown here.
// NOTE(review): the return statement is not visible in this excerpt.
1600 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
1602 /*XXX here we should send all tickles we are serving to the new node */
1608 called when a client structure goes away - hook to remove
1609 elements from the tcp_list in all daemons
// Drain client->tcp_list when a client structure is being destroyed.
// Each entry is unlinked from the list and its connection is handed to
// ctdb_remove_tcp_connection so the matching tickle is dropped.
// No explicit free of the unlinked entry is visible here; the entries are
// allocated with `client` as talloc parent where they are created.
1611 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
// Always take the current list head; DLIST_REMOVE advances the list.
1613 while (client->tcp_list) {
1614 struct ctdb_tcp_list *tcp = client->tcp_list;
1615 DLIST_REMOVE(client->tcp_list, tcp);
1616 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1622 release all IPs on shutdown
// Walk every public address in ctdb->vnn and release it.
// For an address that ctdb_sys_have_ip reports as not held, only the
// interface assignment is dropped. Otherwise the CTDB_EVENT_RELEASE_IP event
// script is run with "<iface> <address> <netmask bits>", clients using the
// address are killed via release_kill_clients, and the interface is unassigned.
// NOTE(review): the lines between the two branches are not visible in this
// excerpt (presumably a continue and possibly further guards) - confirm.
1624 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1626 struct ctdb_vnn *vnn;
1628 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1629 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1630 ctdb_vnn_unassign_iface(ctdb, vnn);
1636 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1637 ctdb_vnn_iface_string(vnn),
1638 ctdb_addr_to_str(&vnn->public_address),
1639 vnn->public_netmask_bits);
1640 release_kill_clients(ctdb, &vnn->public_address);
1641 ctdb_vnn_unassign_iface(ctdb, vnn);
1647 get list of public IPs
// Control handler: return the list of public addresses in outdata as a
// struct ctdb_all_public_ips allocated with `outdata` as talloc parent.
// When c->flags contains CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, addresses for
// which ctdb_vnn_available is false are left out.
// The buffer is sized from a first pass over ctdb->vnn; the reported length is
// then recomputed from `i`, the index used while filling entries.
// NOTE(review): declarations of i, num and len, the counter updates and the
// return statement are not visible in this excerpt.
1649 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
1650 struct ctdb_req_control *c, TDB_DATA *outdata)
1653 struct ctdb_all_public_ips *ips;
1654 struct ctdb_vnn *vnn;
1655 bool only_available = false;
1657 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
1658 only_available = true;
1661 /* count how many public ip structures we have */
1663 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
// Header up to the flexible ips member plus one ctdb_public_ip per counted entry.
1667 len = offsetof(struct ctdb_all_public_ips, ips) +
1668 num*sizeof(struct ctdb_public_ip);
1669 ips = talloc_zero_size(outdata, len);
1670 CTDB_NO_MEMORY(ctdb, ips);
1673 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1674 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
1677 ips->ips[i].pnn = vnn->pnn;
1678 ips->ips[i].addr = vnn->public_address;
// Length is recomputed from i rather than num, so skipped entries are not reported.
1682 len = offsetof(struct ctdb_all_public_ips, ips) +
1683 i*sizeof(struct ctdb_public_ip);
1685 outdata->dsize = len;
1686 outdata->dptr = (uint8_t *)ips;
1693 get list of public IPs, old ipv4 style. only returns ipv4 addresses
// Control handler: return the public addresses in the older IPv4-only layout
// (struct ctdb_all_public_ipsv4), allocated with `outdata` as talloc parent.
// Both passes over ctdb->vnn test the address family against AF_INET.
// NOTE(review): the bodies of the family tests (presumably continue), the
// counter updates, the local declarations and the return statement are not
// visible in this excerpt.
1695 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
1696 struct ctdb_req_control *c, TDB_DATA *outdata)
1699 struct ctdb_all_public_ipsv4 *ips;
1700 struct ctdb_vnn *vnn;
1702 /* count how many public ip structures we have */
1704 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1705 if (vnn->public_address.sa.sa_family != AF_INET) {
1711 len = offsetof(struct ctdb_all_public_ipsv4, ips) +
1712 num*sizeof(struct ctdb_public_ipv4);
1713 ips = talloc_zero_size(outdata, len);
1714 CTDB_NO_MEMORY(ctdb, ips);
// The reply size is fixed here, before the entries are filled in.
1716 outdata->dsize = len;
1717 outdata->dptr = (uint8_t *)ips;
1721 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1722 if (vnn->public_address.sa.sa_family != AF_INET) {
1725 ips->ips[i].pnn = vnn->pnn;
// Only the sockaddr_in view of the address union is copied.
1726 ips->ips[i].sin = vnn->public_address.ip;
// Control handler: describe one public address and the interfaces it may use.
// indata.dptr is read as a ctdb_sock_addr. The address is resolved with
// find_public_ip_vnn, with ctdb->single_ip_vnn considered as an alternative
// when it carries the same IP. The reply is a struct
// ctdb_control_public_ip_info allocated with `outdata` as talloc parent,
// holding one ctdb_control_iface_info per entry of vnn->ifaces (the loops stop
// at the first NULL entry). info->active_idx starts as 0xFFFFFFFF and is set
// to the index of the interface equal to vnn->iface.
// NOTE(review): part of the parameter list, the NULL tests on vnn and cur, the
// counter updates and the returns are not visible in this excerpt. No check of
// indata.dsize is visible before the cast - confirm.
1733 int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
1734 struct ctdb_req_control *c,
1739 ctdb_sock_addr *addr;
1740 struct ctdb_control_public_ip_info *info;
1741 struct ctdb_vnn *vnn;
1743 addr = (ctdb_sock_addr *)indata.dptr;
1745 vnn = find_public_ip_vnn(ctdb, addr);
1747 /* if it is not a public ip it could be our 'single ip' */
1748 if (ctdb->single_ip_vnn) {
1749 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
1750 vnn = ctdb->single_ip_vnn;
1755 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
1756 "'%s'not a public address\n",
1757 ctdb_addr_to_str(addr)));
1761 /* count how many public ip structures we have */
1763 for (;vnn->ifaces[num];) {
1767 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
1768 num*sizeof(struct ctdb_control_iface_info);
1769 info = talloc_zero_size(outdata, len);
1770 CTDB_NO_MEMORY(ctdb, info);
1772 info->ip.addr = vnn->public_address;
1773 info->ip.pnn = vnn->pnn;
1774 info->active_idx = 0xFFFFFFFF;
1776 for (i=0; vnn->ifaces[i]; i++) {
1777 struct ctdb_iface *cur;
// vnn->ifaces holds interface names; resolve each one to its ctdb_iface record.
1779 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
1781 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
1785 if (vnn->iface == cur) {
1786 info->active_idx = i;
// NOTE(review): strcpy has no length bound; confirm cur->name always fits in
// the destination name field.
1788 strcpy(info->ifaces[i].name, cur->name);
1789 info->ifaces[i].link_state = cur->link_up;
1790 info->ifaces[i].references = cur->references;
1793 len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
1794 i*sizeof(struct ctdb_control_iface_info);
1796 outdata->dsize = len;
1797 outdata->dptr = (uint8_t *)info;
// Control handler: return one ctdb_control_iface_info (name, link state,
// reference count) for every interface in ctdb->ifaces. The reply is a struct
// ctdb_control_get_ifaces allocated with `outdata` as talloc parent.
// NOTE(review): the rest of the parameter list, the local declarations, the
// counter updates and the return are not visible in this excerpt. The
// existing "count" comment below mentions public ip structures although the
// loop that follows walks ctdb->ifaces.
1802 int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
1803 struct ctdb_req_control *c,
1807 struct ctdb_control_get_ifaces *ifaces;
1808 struct ctdb_iface *cur;
1810 /* count how many public ip structures we have */
1812 for (cur=ctdb->ifaces;cur;cur=cur->next) {
1816 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
1817 num*sizeof(struct ctdb_control_iface_info);
1818 ifaces = talloc_zero_size(outdata, len);
1819 CTDB_NO_MEMORY(ctdb, ifaces);
1822 for (cur=ctdb->ifaces;cur;cur=cur->next) {
// NOTE(review): strcpy has no length bound; confirm cur->name always fits in
// the destination name field.
1823 strcpy(ifaces->ifaces[i].name, cur->name);
1824 ifaces->ifaces[i].link_state = cur->link_up;
1825 ifaces->ifaces[i].references = cur->references;
1829 len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
1830 i*sizeof(struct ctdb_control_iface_info);
1832 outdata->dsize = len;
1833 outdata->dptr = (uint8_t *)ifaces;
// Control entry point for setting the link state of an interface.
// NOTE(review): only the first two lines of the signature are visible in this
// excerpt; the remaining parameters and the whole body are not shown, so
// nothing about its behaviour can be stated from here.
1838 int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
1839 struct ctdb_req_control *c,
1847 structure containing the listening socket and the list of tcp connections
1848 that the ctdb daemon is to kill
// Per-public-address state used while TCP connections are being killed.
// NOTE(review): further members are referenced elsewhere in this file
// (capture_fd, private_data) but their declarations are not visible here.
1850 struct ctdb_kill_tcp {
// Public address this state belongs to; vnn->killtcp points back at it.
1851 struct ctdb_vnn *vnn;
1852 struct ctdb_context *ctdb;
// Read event registered on the capture socket.
1854 struct fd_event *fde;
// Tree of ctdb_killtcp_con entries, keyed by killtcp_key.
1855 trbt_tree_t *connections;
1860 a tcp connection that is to be killed
// One TCP connection queued for killing, stored in ctdb_kill_tcp.connections.
// NOTE(review): a count member is used elsewhere in this file but its
// declaration is not visible in this excerpt.
1862 struct ctdb_killtcp_con {
1863 ctdb_sock_addr src_addr;
1864 ctdb_sock_addr dst_addr;
// Back-pointer to the owning kill state.
1866 struct ctdb_kill_tcp *killtcp;
1869 /* this function is used to create a key to represent this socketpair
1870 in the killtcp tree.
1871 this key is used to insert and lookup matching socketpairs that are
1872 to be tickled and RST
// Number of 32-bit words in a killtcp tree key: enough for two IPv6 addresses
// (4 words each) plus two port slots.
1874 #define KILLTCP_KEYLEN 10
// Build the tree key for a source/destination socket pair.
// The key lives in a function-local static array that is zeroed on every
// call, so each call overwrites the previous key and the function is not
// reentrant. For IPv4 only the first four words are used; the rest stay zero.
// Ports are stored as they appear in the sockaddr, without byte-order conversion.
// NOTE(review): the case labels, break statements and return statements are
// not visible in this excerpt; presumably the static array is returned in
// every path - confirm.
1875 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1877 static uint32_t key[KILLTCP_KEYLEN];
1879 bzero(key, sizeof(key));
1881 if (src->sa.sa_family != dst->sa.sa_family) {
1882 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1886 switch (src->sa.sa_family) {
1888 key[0] = dst->ip.sin_addr.s_addr;
1889 key[1] = src->ip.sin_addr.s_addr;
1890 key[2] = dst->ip.sin_port;
1891 key[3] = src->ip.sin_port;
// IPv6: address words are interleaved dst/src, lowest-indexed word last.
1894 key[0] = dst->ip6.sin6_addr.s6_addr32[3];
1895 key[1] = src->ip6.sin6_addr.s6_addr32[3];
1896 key[2] = dst->ip6.sin6_addr.s6_addr32[2];
1897 key[3] = src->ip6.sin6_addr.s6_addr32[2];
1898 key[4] = dst->ip6.sin6_addr.s6_addr32[1];
1899 key[5] = src->ip6.sin6_addr.s6_addr32[1];
1900 key[6] = dst->ip6.sin6_addr.s6_addr32[0];
1901 key[7] = src->ip6.sin6_addr.s6_addr32[0];
1902 key[8] = dst->ip6.sin6_port;
1903 key[9] = src->ip6.sin6_port;
1906 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1914 called when we get a read event on the raw socket
// fd-event callback for the capture socket of a ctdb_kill_tcp.
// Reads one TCP packet with ctdb_sys_read_tcp_packet, looks the captured
// address pair up in killtcp->connections, and on a match sends a TCP packet
// with the last argument set to 1 (the reset, per the log message) using the
// captured sequence numbers.
// The lookup key is built as killtcp_key(&src, &dst) while entries are
// inserted with killtcp_key(&con->dst_addr, &con->src_addr); presumably the
// captured packet travels in the reverse direction - confirm.
// NOTE(review): the early returns, the NULL test on con and the removal of
// the entry from the tree are not visible in this excerpt.
1916 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
1917 uint16_t flags, void *private_data)
1919 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1920 struct ctdb_killtcp_con *con;
1921 ctdb_sock_addr src, dst;
1922 uint32_t ack_seq, seq;
1924 if (!(flags & EVENT_FD_READ)) {
1928 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1929 killtcp->private_data,
1931 &ack_seq, &seq) != 0) {
1932 /* probably a non-tcp ACK packet */
1936 /* check if we have this guy in our list of connections
1939 con = trbt_lookuparray32(killtcp->connections,
1940 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1942 /* no this was some other packet we can just ignore */
1946 /* This one has been tickled !
1947 now reset him and remove him from the list.
1949 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1950 ntohs(con->dst_addr.ip.sin_port),
1951 ctdb_addr_to_str(&con->src_addr),
1952 ntohs(con->src_addr.ip.sin_port)));
1954 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1959 /* when traversing the list of all tcp connections to send tickle acks to
1960 (so that we can capture the ack coming back and kill the connection
1962 this callback is called for each connection we are currently trying to kill
// Tree-traversal callback invoked once per connection awaiting a kill.
// `data` is the ctdb_killtcp_con stored in the tree; `param` is not used by
// any visible line. Connections whose count has reached 5 are given up on;
// the others are tickled again using their dst_addr and src_addr.
// NOTE(review): the body of the give-up branch, the counter update and the
// name of the send call are not visible in this excerpt.
1964 static void tickle_connection_traverse(void *param, void *data)
1966 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1968 /* have tried too many times, just give up */
1969 if (con->count >= 5) {
1974 /* otherwise, try tickling it again */
1977 (ctdb_sock_addr *)&con->dst_addr,
1978 (ctdb_sock_addr *)&con->src_addr,
1984 called every second until all sentenced connections have been reset
// Timed-event callback that drives the kill loop for one ctdb_kill_tcp.
// It traverses killtcp->connections with tickle_connection_traverse, frees the
// whole killtcp structure once the tree is NULL or has no root, and otherwise
// re-arms itself one second later on killtcp->ctdb->ev with killtcp as the
// talloc context of the event.
// NOTE(review): the return after talloc_free is not visible in this excerpt;
// presumably the function returns there so the freed structure is not used
// by the re-arm below - confirm.
1986 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
1987 struct timeval t, void *private_data)
1989 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1992 /* loop over all connections sending tickle ACKs */
1993 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
1996 /* If there are no more connections to kill we can remove the
1997 entire killtcp structure
1999 if ( (killtcp->connections == NULL) ||
2000 (killtcp->connections->root == NULL) ) {
2001 talloc_free(killtcp);
2005 /* try tickling them again in a seconds time
2007 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
2008 ctdb_tickle_sentenced_connections, killtcp);
2012 destroy the killtcp structure
// talloc destructor for a ctdb_kill_tcp: clears the back-pointer held by the
// owning vnn so it does not keep referring to the freed structure.
// NOTE(review): the return statement is not visible in this excerpt.
2014 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2016 killtcp->vnn->killtcp = NULL;
2021 /* nothing fancy here, just unconditionally replace any existing
2022 connection structure with the new one.
2024 dont even free the old one if it did exist, that one is talloc_stolen
2025 by the same node in the tree anyway and will be deleted when the new data
// Insert callback handed to trbt_insertarray32_callback when a connection is
// queued for killing.
// NOTE(review): the body is not visible in this excerpt; going by the comment
// that precedes this function it replaces any existing entry with the new
// one - confirm against the full file.
2028 static void *add_killtcp_callback(void *parm, void *data)
2034 add a tcp socket to the list of connections we want to RST
// Queue a TCP connection to be reset.
// Both addresses are canonicalized, then the owning public address is looked
// up by destination, by source, and finally against ctdb->single_ip_vnn.
// On first use for a vnn a ctdb_kill_tcp is allocated under ctdb, given a
// connection tree and a destructor, and linked to the vnn. The connection is
// inserted into the tree, a capture socket is opened on the interface of the
// vnn if none is open yet, and, when no fd event exists, a read event plus a
// one-second timed event (ctdb_tickle_sentenced_connections) are registered.
// The closing lines free vnn->killtcp and reset it to NULL.
// NOTE(review): the remaining parameters (used below as s and d), the NULL
// tests between lookups, the immediate tickle call, the returns and the label
// of the cleanup path are not visible in this excerpt; the final two lines are
// presumably the failure path only - confirm.
2036 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
2040 ctdb_sock_addr src, dst;
2041 struct ctdb_kill_tcp *killtcp;
2042 struct ctdb_killtcp_con *con;
2043 struct ctdb_vnn *vnn;
2045 ctdb_canonicalize_ip(s, &src);
2046 ctdb_canonicalize_ip(d, &dst);
2048 vnn = find_public_ip_vnn(ctdb, &dst);
2050 vnn = find_public_ip_vnn(ctdb, &src);
2053 /* if it is not a public ip it could be our 'single ip' */
2054 if (ctdb->single_ip_vnn) {
2055 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2056 vnn = ctdb->single_ip_vnn;
2061 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
2065 killtcp = vnn->killtcp;
2067 /* If this is the first connection to kill we must allocate
2070 if (killtcp == NULL) {
2071 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
2072 CTDB_NO_MEMORY(ctdb, killtcp);
2075 killtcp->ctdb = ctdb;
2076 killtcp->capture_fd = -1;
2077 killtcp->connections = trbt_create(killtcp, 0);
2079 vnn->killtcp = killtcp;
2080 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2085 /* create a structure that describes this connection we want to
2086 RST and store it in killtcp->connections
2088 con = talloc(killtcp, struct ctdb_killtcp_con);
2089 CTDB_NO_MEMORY(ctdb, con);
2090 con->src_addr = src;
2091 con->dst_addr = dst;
2093 con->killtcp = killtcp;
2096 trbt_insertarray32_callback(killtcp->connections,
2097 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2098 add_killtcp_callback, con);
2101 If we dont have a socket to listen on yet we must create it
2103 if (killtcp->capture_fd == -1) {
2104 const char *iface = ctdb_vnn_iface_string(vnn);
2105 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2106 if (killtcp->capture_fd == -1) {
2107 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2108 "socket on iface '%s' for killtcp (%s)\n",
2109 iface, strerror(errno)));
2115 if (killtcp->fde == NULL) {
2116 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
2117 EVENT_FD_READ | EVENT_FD_AUTOCLOSE,
2118 capture_tcp_handler, killtcp);
2120 /* We also need to set up some events to tickle all these connections
2121 until they are all reset
2123 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
2124 ctdb_tickle_sentenced_connections, killtcp);
2127 /* tickle him once now */
2136 talloc_free(vnn->killtcp);
2137 vnn->killtcp = NULL;
2142 kill a TCP connection.
// Control handler: interpret indata.dptr as a struct ctdb_control_killtcp and
// queue its src_addr/dst_addr pair via ctdb_killtcp_add_connection, returning
// that call's result.
// NOTE(review): no check of indata.dsize is visible before the cast - confirm
// the payload size is validated somewhere.
2144 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2146 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2148 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2152 called by a daemon to inform us of the entire list of TCP tickles for
2153 a particular public address.
2154 this control should only be sent by the node that is currently serving
2155 that public address.
// Control handler: replace the whole tickle list of one public address.
// indata.dptr is read as a struct ctdb_control_tcp_tickle_list. The size is
// checked twice: first that the fixed header up to tickles.connections is
// present, then that the blob is large enough for tickles.num connections.
// Any existing vnn->tcp_array is freed, a new array is allocated under
// ctdb->nodes, filled by memcpy from the payload, and finally moved to the
// vnn with talloc_steal.
// NOTE(review): the NULL test on vnn and the return statements are not
// visible in this excerpt. The second size check multiplies by a count taken
// from the payload; confirm the arithmetic cannot wrap for the types used.
2157 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2159 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2160 struct ctdb_tcp_array *tcparray;
2161 struct ctdb_vnn *vnn;
2163 /* We must at least have tickles.num or else we cant verify the size
2164 of the received data blob
2166 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
2167 tickles.connections)) {
2168 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2172 /* verify that the size of data matches what we expect */
2173 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
2174 tickles.connections)
2175 + sizeof(struct ctdb_tcp_connection)
2176 * list->tickles.num) {
2177 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2181 vnn = find_public_ip_vnn(ctdb, &list->addr);
2183 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
2184 ctdb_addr_to_str(&list->addr)));
2189 /* remove any old ticklelist we might have */
2190 talloc_free(vnn->tcp_array);
2191 vnn->tcp_array = NULL;
// vnn->tcp_array stays NULL until the new array is complete, so an allocation
// failure below leaves the address with no tickle list.
2193 tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2194 CTDB_NO_MEMORY(ctdb, tcparray);
2196 tcparray->num = list->tickles.num;
2198 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2199 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2201 memcpy(tcparray->connections, &list->tickles.connections[0],
2202 sizeof(struct ctdb_tcp_connection)*tcparray->num);
2204 /* We now have a new fresh tickle list array for this vnn */
2205 vnn->tcp_array = talloc_steal(vnn, tcparray);
2211 called to return the full list of tickles for the public address associated
2212 with the provided vnn
// Control handler: return the tickle list stored for one public address.
// indata.dptr is read as a ctdb_sock_addr. The reply is a struct
// ctdb_control_tcp_tickle_list allocated with `outdata` as talloc parent and
// sized for `num` connections, which are copied from vnn->tcp_array.
// NOTE(review): the NULL test on vnn, the declaration of num, any guard
// around the read of tcparray->num, the assignment of the address field in
// the reply and the returns are not visible in this excerpt. The
// removal path in this file sets vnn->tcp_array to NULL when empty, so a NULL
// guard is presumably among the missing lines - confirm.
2214 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2216 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2217 struct ctdb_control_tcp_tickle_list *list;
2218 struct ctdb_tcp_array *tcparray;
2220 struct ctdb_vnn *vnn;
2222 vnn = find_public_ip_vnn(ctdb, addr);
2224 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
2225 ctdb_addr_to_str(addr)));
2230 tcparray = vnn->tcp_array;
2232 num = tcparray->num;
2237 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
2238 tickles.connections)
2239 + sizeof(struct ctdb_tcp_connection) * num;
2241 outdata->dptr = talloc_size(outdata, outdata->dsize);
2242 CTDB_NO_MEMORY(ctdb, outdata->dptr);
2243 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2246 list->tickles.num = num;
2248 memcpy(&list->tickles.connections[0], tcparray->connections,
2249 sizeof(struct ctdb_tcp_connection) * num);
2257 set the list of all tcp tickles for a public address
// Send the tickle list of one public address to other nodes.
// A struct ctdb_control_tcp_tickle_list sized for `num` connections is built
// in a temporary buffer allocated under ctdb, sent as
// CTDB_CONTROL_SET_TCP_TICKLE_LIST with CTDB_CTRL_FLAG_NOREPLY, and the
// buffer is freed afterwards.
// In the visible lines the control is always addressed to
// CTDB_BROADCAST_CONNECTED; the timeout and destnode parameters are not
// referenced by any visible line.
// NOTE(review): the declarations of ret, data and num, any guard around the
// read of tcparray->num, the assignment of the address field, the test on ret
// and the returns are not visible in this excerpt.
2259 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
2260 struct timeval timeout, uint32_t destnode,
2261 ctdb_sock_addr *addr,
2262 struct ctdb_tcp_array *tcparray)
2266 struct ctdb_control_tcp_tickle_list *list;
2269 num = tcparray->num;
2274 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
2275 tickles.connections) +
2276 sizeof(struct ctdb_tcp_connection) * num;
2277 data.dptr = talloc_size(ctdb, data.dsize);
2278 CTDB_NO_MEMORY(ctdb, data.dptr);
2280 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2282 list->tickles.num = num;
2284 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2287 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2288 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2289 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2291 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2295 talloc_free(data.dptr);
2302 perform tickle updates if required
// Periodic timed-event callback that pushes pending tickle-list updates.
// It walks ctdb->vnn, considers only addresses whose pnn equals ctdb->pnn and
// whose tcp_update_needed flag is set, and calls ctdb_ctrl_set_tcp_tickles for
// them with CTDB_BROADCAST_CONNECTED as destination. It then re-arms itself on
// ctdb->tickle_update_context after tunable.tickle_update_interval seconds.
// NOTE(review): the bodies of the two filters (presumably continue), the
// remaining call arguments, the test on ret and any reset of
// tcp_update_needed are not visible in this excerpt.
2304 static void ctdb_update_tcp_tickles(struct event_context *ev,
2305 struct timed_event *te,
2306 struct timeval t, void *private_data)
2308 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2310 struct ctdb_vnn *vnn;
2312 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2313 /* we only send out updates for public addresses that
2316 if (ctdb->pnn != vnn->pnn) {
2319 /* We only send out the updates if we need to */
2320 if (!vnn->tcp_update_needed) {
2323 ret = ctdb_ctrl_set_tcp_tickles(ctdb,
2325 CTDB_BROADCAST_CONNECTED,
2326 &vnn->public_address,
2329 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2330 ctdb_addr_to_str(&vnn->public_address)));
2334 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2335 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2336 ctdb_update_tcp_tickles, ctdb);
2341 start periodic update of tcp tickles
// Start the periodic tickle update.
// Creates a fresh talloc context under ctdb, stores it in
// ctdb->tickle_update_context, and schedules the first run of
// ctdb_update_tcp_tickles after tunable.tickle_update_interval seconds with
// that context as the owner of the timed event.
2343 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2345 ctdb->tickle_update_context = talloc_new(ctdb);
2347 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2348 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2349 ctdb_update_tcp_tickles, ctdb);
// State carried across the repeated sends of one gratuitous ARP request.
// NOTE(review): the iface and count members used by send_gratious_arp are not
// visible in this excerpt.
2355 struct control_gratious_arp {
2356 struct ctdb_context *ctdb;
// Address announced by the ARP.
2357 ctdb_sock_addr addr;
2363 send a control_gratuitous arp
// Timed-event callback that sends one gratuitous ARP for arp->addr on
// arp->iface via ctdb_sys_send_arp and logs a failure with strerror(errno).
// The send count is compared against CTDB_ARP_REPEAT; otherwise the callback
// re-arms itself after CTDB_ARP_INTERVAL seconds, with `arp` as the talloc
// context of the new event.
// NOTE(review): the test on ret, the counter update and the body of the
// repeat-limit branch (presumably freeing arp and returning) are not visible
// in this excerpt.
2365 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
2366 struct timeval t, void *private_data)
2369 struct control_gratious_arp *arp = talloc_get_type(private_data,
2370 struct control_gratious_arp);
2372 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2374 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2375 arp->iface, strerror(errno)));
2380 if (arp->count == CTDB_ARP_REPEAT) {
2385 event_add_timed(arp->ctdb->ev, arp,
2386 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
2387 send_gratious_arp, arp);
// Control handler: start a series of gratuitous ARPs for the address and
// interface named in the payload (struct ctdb_control_gratious_arp).
// The payload size is validated against the fixed header and then against
// header + gratious_arp->len. A control_gratious_arp state object is allocated
// under ctdb, the address and a talloc_strdup copy of the interface name are
// stored in it, and send_gratious_arp is scheduled to run immediately.
// NOTE(review): the first line of the second size comparison, the
// initialisation of arp->ctdb and arp->count, and the returns are not visible
// in this excerpt. talloc_strdup reads gratious_arp->iface up to a NUL byte;
// confirm the payload is guaranteed to be NUL-terminated within len.
2394 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2396 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2397 struct control_gratious_arp *arp;
2399 /* verify the size of indata */
2400 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2401 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
2402 (unsigned)indata.dsize,
2403 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2407 ( offsetof(struct ctdb_control_gratious_arp, iface)
2408 + gratious_arp->len ) ){
2410 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2411 "but should be %u bytes\n",
2412 (unsigned)indata.dsize,
2413 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2418 arp = talloc(ctdb, struct control_gratious_arp);
2419 CTDB_NO_MEMORY(ctdb, arp);
2422 arp->addr = gratious_arp->addr;
2423 arp->iface = talloc_strdup(arp, gratious_arp->iface);
2424 CTDB_NO_MEMORY(ctdb, arp->iface);
// timeval_zero makes the first send fire on the next event-loop iteration.
2427 event_add_timed(arp->ctdb->ev, arp,
2428 timeval_zero(), send_gratious_arp, arp);
// Control handler: add a public address described by a
// struct ctdb_control_ip_iface payload.
// After the payload size checks, the address, mask and interface string are
// passed to ctdb_add_public_address; a failure is logged.
// NOTE(review): the declaration of ret, parts of the second size comparison,
// the test on ret and the returns are not visible in this excerpt.
2433 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2435 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2438 /* verify the size of indata */
2439 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2440 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2444 ( offsetof(struct ctdb_control_ip_iface, iface)
2447 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2448 "but should be %u bytes\n",
2449 (unsigned)indata.dsize,
2450 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2454 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2457 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
2465 called when releaseip event finishes for del_public_address
// Completion callback for the release-ip event started when a public address
// is deleted. It frees private_data; the caller in this file passes the
// temporary talloc context it created as that pointer.
// NOTE(review): the last parameter line of the signature is not visible in
// this excerpt; `status` is not used by any visible line.
2467 static void delete_ip_callback(struct ctdb_context *ctdb, int status,
2470 talloc_free(private_data);
2473 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2475 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2476 struct ctdb_vnn *vnn;
2479 /* verify the size of indata */
2480 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2481 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2485 ( offsetof(struct ctdb_control_ip_iface, iface)
2488 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2489 "but should be %u bytes\n",
2490 (unsigned)indata.dsize,
2491 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2495 /* walk over all public addresses until we find a match */
2496 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2497 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2498 TALLOC_CTX *mem_ctx;
2500 DLIST_REMOVE(ctdb->vnn, vnn);
2501 if (vnn->iface == NULL) {
2506 mem_ctx = talloc_new(ctdb);
2507 ret = ctdb_event_script_callback(ctdb,
2508 mem_ctx, delete_ip_callback, mem_ctx,
2510 CTDB_EVENT_RELEASE_IP,
2512 ctdb_vnn_iface_string(vnn),
2513 ctdb_addr_to_str(&vnn->public_address),
2514 vnn->public_netmask_bits);
2515 ctdb_vnn_unassign_iface(ctdb, vnn);