4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
/* Timeout passed with the takeover/release controls: built with
   timeval_current_ofs() from the takeover_timeout tunable. Expands to an
   expression that needs a variable named ctdb in scope. */
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
/* First argument of timeval_current_ofs() when ctdb_control_send_arp
   re-arms its timer (presumably seconds - confirm). */
33 #define CTDB_ARP_INTERVAL 1
/* ctdb_control_send_arp compares arp->count against this value. */
34 #define CTDB_ARP_REPEAT 3
37 struct ctdb_iface *prev, *next;
/*
  Return the name of the interface referenced by vnn->iface.
  NOTE(review): ctdb_vnn_unassign_iface calls this and afterwards tests
  vnn->iface for NULL, so a NULL guard presumably sits in lines not shown
  here - confirm.
*/
43 static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
46 return vnn->iface->name;
/*
  Register the interface name iface in the list ctdb->ifaces.
  The list is first scanned for an entry with the same name; the visible
  remainder allocates a zeroed struct ctdb_iface on ctdb, gives it a
  private copy of the name (owned by the new entry) and links it into
  ctdb->ifaces.
  NOTE(review): the return statements are not visible here; callers store
  the result in ret and log a failure, so it is presumably a status code.
*/
52 static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
56 /* Verify that we do not have an entry for this interface yet */
57 for (i=ctdb->ifaces;i;i=i->next) {
58 if (strcmp(i->name, iface) == 0) {
63 /* create a new structure for this interface */
64 i = talloc_zero(ctdb, struct ctdb_iface);
65 CTDB_NO_MEMORY_FATAL(ctdb, i);
/* the name copy is a talloc child of the entry itself */
66 i->name = talloc_strdup(i, iface);
67 CTDB_NO_MEMORY(ctdb, i->name);
70 DLIST_ADD(ctdb->ifaces, i);
/*
  Search ctdb->ifaces for the entry whose name equals iface (strcmp).
  NOTE(review): the return statements fall outside the visible lines;
  ctdb_vnn_best_iface stores the result in a struct ctdb_iface pointer.
*/
75 static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
80 /* Walk the known interfaces looking for a matching name */
81 for (i=ctdb->ifaces;i;i=i->next) {
82 if (strcmp(i->name, iface) == 0) {
/*
  Choose an interface for vnn from the NULL-terminated name array
  vnn->ifaces. Each name is resolved through ctdb_find_iface(); the
  visible comparison tests whether the candidate has fewer references
  than the best one seen so far.
  NOTE(review): handling of names that are not found and the return
  statement are in lines not shown here - confirm.
*/
90 static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
94 struct ctdb_iface *cur = NULL;
95 struct ctdb_iface *best = NULL;
97 for (i=0; vnn->ifaces[i]; i++) {
99 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
113 if (cur->references < best->references) {
/*
  Bind the public address vnn to one of its candidate interfaces.
  Visible behaviour: logs when the address is still assigned, asks
  ctdb_vnn_best_iface() for a candidate, logs an error when none can be
  assigned, records this node (ctdb->pnn) in vnn->pnn and logs the new
  assignment with the interface reference count.
  NOTE(review): the conditions guarding each step, the reference count
  update and the return values are in lines not shown here.
*/
122 static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
123 struct ctdb_vnn *vnn)
125 struct ctdb_iface *best = NULL;
128 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
129 "still assigned to iface '%s'\n",
130 ctdb_addr_to_str(&vnn->public_address),
131 ctdb_vnn_iface_string(vnn)));
/* pick a candidate interface for this address */
135 best = ctdb_vnn_best_iface(ctdb, vnn);
137 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
138 "cannot assign to iface any iface\n",
139 ctdb_addr_to_str(&vnn->public_address)));
/* mark this node as the one holding the address */
145 vnn->pnn = ctdb->pnn;
147 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
148 "now assigned to iface '%s' refs[%d]\n",
149 ctdb_addr_to_str(&vnn->public_address),
150 ctdb_vnn_iface_string(vnn),
/*
  Drop the interface binding of the public address vnn.
  Visible behaviour: logs the old interface name and its reference count
  (0 when vnn->iface is NULL), decrements vnn->iface->references and then
  tests whether vnn->pnn is this node (ctdb->pnn).
  NOTE(review): the guard around the decrement and the body of the final
  test are in lines not shown here - confirm vnn->iface cannot be NULL at
  the decrement.
*/
155 static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
156 struct ctdb_vnn *vnn)
158 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
159 "now unassigned (old iface '%s' refs[%d])\n",
160 ctdb_addr_to_str(&vnn->public_address),
161 ctdb_vnn_iface_string(vnn),
162 vnn->iface?vnn->iface->references:0));
164 vnn->iface->references--;
167 if (vnn->pnn == ctdb->pnn) {
/*
  Private state handed to ctdb_control_send_arp through the timed event.
  NOTE(review): ctdb_control_send_arp also reads ->addr and ->count, which
  are declared in lines not shown here.
*/
172 struct ctdb_takeover_arp {
173 struct ctdb_context *ctdb;
/* connections that receive a tcp tickle ack after each arp */
176 struct ctdb_tcp_array *tcparray;
/* public address being announced; supplies the interface name */
177 struct ctdb_vnn *vnn;
182 lists of tcp endpoints
/* Doubly linked list node (prev/next as used by the DLIST macros) holding
   one tcp connection; ctdb_control_tcp_client links these into
   client->tcp_list. */
184 struct ctdb_tcp_list {
185 struct ctdb_tcp_list *prev, *next;
186 struct ctdb_tcp_connection connection;
190 list of clients to kill on IP release
/*
  Doubly linked list node kept in ctdb->client_ip_list.
  NOTE(review): other code in this file reads ->addr and ->client_id,
  which are declared in lines not shown here.
*/
192 struct ctdb_client_ip {
193 struct ctdb_client_ip *prev, *next;
/* back pointer used by the destructor to unlink the node */
194 struct ctdb_context *ctdb;
201 send a gratuitous arp
/*
  Timed event handler: private_data is a struct ctdb_takeover_arp.
  Sends an arp for arp->addr on the interface currently assigned to
  arp->vnn, then walks arp->tcparray and sends a tcp tickle ack for each
  recorded connection. Failures are logged at DEBUG_CRIT. Finally the
  repeat counter is compared with CTDB_ARP_REPEAT and the handler
  schedules itself again on arp->vnn->takeover_ctx.
  NOTE(review): the counter update, the body of the CTDB_ARP_REPEAT test
  and any NULL check on tcparray are in lines not shown here.
*/
203 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
204 struct timeval t, void *private_data)
206 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
207 struct ctdb_takeover_arp);
209 struct ctdb_tcp_array *tcparray;
210 const char *iface = ctdb_vnn_iface_string(arp->vnn);
/* announce the address on the interface assigned to the vnn */
212 ret = ctdb_sys_send_arp(&arp->addr, iface);
214 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
215 iface, strerror(errno)));
218 tcparray = arp->tcparray;
220 for (i=0;i<tcparray->num;i++) {
221 struct ctdb_tcp_connection *tcon;
223 tcon = &tcparray->connections[i];
224 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
225 (unsigned)ntohs(tcon->dst_addr.ip.sin_port),
226 ctdb_addr_to_str(&tcon->src_addr),
227 (unsigned)ntohs(tcon->src_addr.ip.sin_port)));
228 ret = ctdb_sys_send_tcp(
233 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
234 ctdb_addr_to_str(&tcon->src_addr)));
/* repeat counter check against CTDB_ARP_REPEAT */
241 if (arp->count == CTDB_ARP_REPEAT) {
/* re-arm this handler; the event is parented to the vnn takeover context */
246 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
247 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
248 ctdb_control_send_arp, arp);
/* State carried from the takeover/release control handlers to their
   event script completion callbacks. */
251 struct takeover_callback_state {
/* the pending control request that is answered from the callback */
252 struct ctdb_req_control *c;
/* copy of the public address the control refers to */
253 ctdb_sock_addr *addr;
254 struct ctdb_vnn *vnn;
258 called when takeip event finishes
/*
  Completion callback for the takeip event script; private_data is a
  struct takeover_callback_state.
  Visible behaviour: status is tested against -ETIME, a failure is logged
  and answered with status; otherwise a takeover context is created on
  the vnn when missing, a struct ctdb_takeover_arp is allocated on it and
  filled from the state, the tcp array of the vnn is moved into the arp
  state, an immediate timed event for ctdb_control_send_arp is queued and
  the control is answered with 0. A further reply with -1 is visible,
  presumably the target of goto failed.
  NOTE(review): the branch structure and the freeing of state are in
  lines not shown here.
*/
260 static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
263 struct takeover_callback_state *state =
264 talloc_get_type(private_data, struct takeover_callback_state);
265 struct ctdb_takeover_arp *arp;
266 struct ctdb_tcp_array *tcparray;
269 if (status == -ETIME) {
272 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
273 ctdb_addr_to_str(state->addr),
274 ctdb_vnn_iface_string(state->vnn)));
275 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/* lazily create the talloc context that owns the arp state and timers */
280 if (!state->vnn->takeover_ctx) {
281 state->vnn->takeover_ctx = talloc_new(state->vnn);
282 if (!state->vnn->takeover_ctx) {
287 arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
288 if (!arp) goto failed;
291 arp->addr = *state->addr;
292 arp->vnn = state->vnn;
294 tcparray = state->vnn->tcp_array;
296 /* add all of the known tcp connections for this IP to the
297 list of tcp connections to send tickle acks for */
298 arp->tcparray = talloc_steal(arp, tcparray);
/* ownership moved to arp above, so the vnn no longer references the array */
300 state->vnn->tcp_array = NULL;
301 state->vnn->tcp_update_needed = true;
/* first arp goes out immediately (timeval_zero) */
304 event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx,
305 timeval_zero(), ctdb_control_send_arp, arp);
307 /* the control succeeded */
308 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
313 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
319 Find the vnn of the node that has a public ip address
320 returns NULL if the address is not known as a public address
/*
  Walk the list ctdb->vnn and compare each public_address with addr using
  ctdb_same_ip().
  NOTE(review): the return statements are not visible here; the function
  returns a struct ctdb_vnn pointer and callers log "not a public
  address" right after the lookup, so a miss presumably yields NULL.
*/
322 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
324 struct ctdb_vnn *vnn;
326 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
327 if (ctdb_same_ip(&vnn->public_address, addr)) {
336 take over an ip address
/*
  Control handler: take over the public address carried in indata (read
  as a struct ctdb_public_ip).
  Visible behaviour: looks up the vnn, checks whether the kernel already
  has the address, assigns an interface with ctdb_vnn_assign_iface(),
  allocates a takeover_callback_state on the vnn, copies the address into
  it and starts the event script with takeover_ip_callback as completion
  handler. Failures to assign an interface or to start the script are
  logged.
  NOTE(review): return values, the event type passed to the script and
  the async_reply update are in lines not shown here.
*/
338 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
339 struct ctdb_req_control *c,
344 struct takeover_callback_state *state;
345 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
346 struct ctdb_vnn *vnn;
348 /* update our vnn list */
349 vnn = find_public_ip_vnn(ctdb, &pip->addr);
351 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
352 ctdb_addr_to_str(&pip->addr)));
357 /* if our kernel already has this IP, do nothing */
358 if (ctdb_sys_have_ip(&pip->addr)) {
362 ret = ctdb_vnn_assign_iface(ctdb, vnn);
364 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
365 "assin a usable interface\n",
366 ctdb_addr_to_str(&pip->addr),
367 vnn->public_netmask_bits));
371 state = talloc(vnn, struct takeover_callback_state);
372 CTDB_NO_MEMORY(ctdb, state);
/* NOTE(review): the request and the address copy are parented to ctdb
   here, while ctdb_control_release_ip parents both to state; confirm the
   address copy is released when state is freed. */
374 state->c = talloc_steal(ctdb, c);
375 state->addr = talloc(ctdb, ctdb_sock_addr);
376 CTDB_NO_MEMORY(ctdb, state->addr);
378 *state->addr = pip->addr;
381 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
382 ctdb_addr_to_str(&pip->addr),
383 vnn->public_netmask_bits,
384 ctdb_vnn_iface_string(vnn)));
386 ret = ctdb_event_script_callback(ctdb,
387 state, takeover_ip_callback, state,
391 ctdb_vnn_iface_string(vnn),
392 ctdb_addr_to_str(&pip->addr),
393 vnn->public_netmask_bits);
396 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
397 ctdb_addr_to_str(&pip->addr),
398 ctdb_vnn_iface_string(vnn)));
403 /* tell ctdb_control.c that we will be replying asynchronously */
410 takeover an ip address old v4 style
/*
  Legacy IPv4 entry point: copies the incoming payload into a zeroed
  struct ctdb_public_ip allocated on the request c and forwards it to
  ctdb_control_takeover_ip().
  NOTE(review): memcpy copies indata.dsize bytes into a buffer of
  sizeof(struct ctdb_public_ip); no size check is visible here, so the
  caller presumably validates indata.dsize - confirm.
*/
412 int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb,
413 struct ctdb_req_control *c,
419 data.dsize = sizeof(struct ctdb_public_ip);
420 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
421 CTDB_NO_MEMORY(ctdb, data.dptr);
423 memcpy(data.dptr, indata.dptr, indata.dsize);
424 return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
428 kill any clients that are registered with an IP that is being released
/*
  Walk ctdb->client_ip_list and, for entries whose address matches addr
  (ctdb_same_ip), look up the owning client with ctdb_reqid_find() and
  send SIGKILL to its pid when the pid is non-zero.
  NOTE(review): how tmp_addr is filled, and whether client is checked for
  NULL before client->pid is read, is in lines not shown here - confirm.
*/
430 static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
432 struct ctdb_client_ip *ip;
434 DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
435 ctdb_addr_to_str(addr)));
/* examine every registered client address */
437 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
438 ctdb_sock_addr tmp_addr;
441 DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
443 ctdb_addr_to_str(&ip->addr)));
445 if (ctdb_same_ip(&tmp_addr, addr)) {
446 struct ctdb_client *client = ctdb_reqid_find(ctdb,
449 DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
451 ctdb_addr_to_str(&ip->addr),
/* a pid of 0 is never signalled */
454 if (client->pid != 0) {
455 DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
456 (unsigned)client->pid,
457 ctdb_addr_to_str(addr),
459 kill(client->pid, SIGKILL);
466 called when releaseip event finishes
/*
  Completion callback for the releaseip event script; private_data is a
  struct takeover_callback_state.
  Visible behaviour: status is tested against -ETIME; the address is
  formatted as a string and sent to this node (ctdb->pnn) as a
  CTDB_SRVID_RELEASE_IP message, clients registered on the address are
  killed, the interface binding of the vnn is dropped and the control is
  answered with 0.
  NOTE(review): the body of the -ETIME test and the freeing of state are
  in lines not shown here.
*/
468 static void release_ip_callback(struct ctdb_context *ctdb, int status,
471 struct takeover_callback_state *state =
472 talloc_get_type(private_data, struct takeover_callback_state);
475 if (status == -ETIME) {
479 /* send a message to all clients of this node telling them
480 that the cluster has been reconfigured and they should
481 release any sockets on this IP */
482 data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
483 CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
/* the payload includes the terminating NUL */
484 data.dsize = strlen((char *)data.dptr)+1;
486 DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
488 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
490 /* kill clients that have registered with this IP */
491 release_kill_clients(ctdb, state->addr);
493 ctdb_vnn_unassign_iface(ctdb, state->vnn);
495 /* the control succeeded */
496 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
501 release an ip address
/*
  Control handler: release the public address carried in indata (read as
  a struct ctdb_public_ip).
  Visible behaviour: looks up the vnn, frees vnn->takeover_ctx (which
  stops pending arp timers parented to it), and when the kernel does not
  hold the address logs a redundant release and drops the interface
  binding. Otherwise a takeover_callback_state is allocated on ctdb, the
  request and an address copy are parented to it, and the
  CTDB_EVENT_RELEASE_IP event script is started with release_ip_callback
  as completion handler.
  NOTE(review): return values, the state->vnn assignment and the
  async_reply update are in lines not shown here.
*/
503 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
504 struct ctdb_req_control *c,
509 struct takeover_callback_state *state;
510 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
511 struct ctdb_vnn *vnn;
513 /* update our vnn list */
514 vnn = find_public_ip_vnn(ctdb, &pip->addr);
516 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
517 ctdb_addr_to_str(&pip->addr)));
522 /* stop any previous arps */
523 talloc_free(vnn->takeover_ctx);
524 vnn->takeover_ctx = NULL;
526 if (!ctdb_sys_have_ip(&pip->addr)) {
527 DEBUG(DEBUG_NOTICE,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
528 ctdb_addr_to_str(&pip->addr),
529 vnn->public_netmask_bits,
530 ctdb_vnn_iface_string(vnn)));
531 ctdb_vnn_unassign_iface(ctdb, vnn);
535 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%u\n",
536 ctdb_addr_to_str(&pip->addr),
537 vnn->public_netmask_bits,
538 ctdb_vnn_iface_string(vnn),
541 state = talloc(ctdb, struct takeover_callback_state);
542 CTDB_NO_MEMORY(ctdb, state);
/* request and address copy become talloc children of state */
544 state->c = talloc_steal(state, c);
545 state->addr = talloc(state, ctdb_sock_addr);
546 CTDB_NO_MEMORY(ctdb, state->addr);
547 *state->addr = pip->addr;
550 ret = ctdb_event_script_callback(ctdb,
551 state, release_ip_callback, state,
553 CTDB_EVENT_RELEASE_IP,
555 ctdb_vnn_iface_string(vnn),
556 ctdb_addr_to_str(&pip->addr),
557 vnn->public_netmask_bits);
559 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
560 ctdb_addr_to_str(&pip->addr),
561 ctdb_vnn_iface_string(vnn)));
566 /* tell the control that we will be replying asynchronously */
572 release an ip address old v4 style
/*
  Legacy IPv4 entry point: copies the incoming payload into a zeroed
  struct ctdb_public_ip allocated on the request c and forwards it to
  ctdb_control_release_ip().
  NOTE(review): memcpy copies indata.dsize bytes into a buffer of
  sizeof(struct ctdb_public_ip); no size check is visible here, so the
  caller presumably validates indata.dsize - confirm.
*/
574 int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
575 struct ctdb_req_control *c,
581 data.dsize = sizeof(struct ctdb_public_ip);
582 data.dptr = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
583 CTDB_NO_MEMORY(ctdb, data.dptr);
585 memcpy(data.dptr, indata.dptr, indata.dsize);
586 return ctdb_control_release_ip(ctdb, c, data, async_reply);
/*
  Add one public address to ctdb->vnn.
  addr/mask describe the address; ifaces is a comma separated list of
  candidate interface names. The address is rejected (logged at
  DEBUG_CRIT) when an identical sockaddr is already listed. Otherwise a
  zeroed struct ctdb_vnn is allocated on ctdb, the interface list is
  split into a NULL-terminated array vnn->ifaces, every interface is
  registered with ctdb_add_local_iface() and the vnn is linked into
  ctdb->vnn.
  NOTE(review): the initial value and increment of num, and the return
  statements, are in lines not shown here. strtok() keeps hidden static
  state, so this must not run while a caller is mid-tokenisation.
*/
590 static int ctdb_add_public_address(struct ctdb_context *ctdb,
591 ctdb_sock_addr *addr,
592 unsigned mask, const char *ifaces)
594 struct ctdb_vnn *vnn;
601 /* Verify that we dont have an entry for this ip yet */
602 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
603 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
604 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
605 ctdb_addr_to_str(addr)));
610 /* create a new vnn structure for this ip address */
611 vnn = talloc_zero(ctdb, struct ctdb_vnn);
612 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
/* NOTE(review): this first allocation is not checked before the loop;
   the realloc inside the loop is checked. */
613 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
/* strtok modifies its input, so work on a private copy of the list */
614 tmp = talloc_strdup(vnn, ifaces);
615 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
616 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
617 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
618 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
619 vnn->ifaces[num] = talloc_strdup(vnn, iface);
620 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
/* terminate the array so it can be walked without a length */
624 vnn->ifaces[num] = NULL;
625 vnn->public_address = *addr;
626 vnn->public_netmask_bits = mask;
629 for (i=0; vnn->ifaces[i]; i++) {
630 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
632 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
633 "for public_address[%s]\n",
634 vnn->ifaces[i], ctdb_addr_to_str(addr)));
640 DLIST_ADD(ctdb->vnn, vnn);
646 setup the event script directory
/*
  Store a copy of script_dir (allocated on ctdb) in
  ctdb->event_script_dir.
  NOTE(review): the return statement is not visible here; CTDB_NO_MEMORY
  presumably returns an error when the copy fails.
*/
648 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
650 ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
651 CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
656 setup the public address lists from a file
/*
  Load the public address list from the file alist.
  Visible behaviour: the file is read into lines with file_lines_load();
  trailing empty lines are trimmed; for every line leading blanks and
  tabs are skipped, empty lines are tested for, and the line is split on
  blanks/tabs with strtok(). When no interface is given the default
  ctdb->default_public_interface is used (its absence is logged at
  DEBUG_CRIT). The address is parsed with parse_ip_mask() and registered
  with ctdb_add_public_address(); both failures are logged with the
  1-based line number.
  NOTE(review): comment-line handling, the assignment of addrstr, cleanup
  and return values are in lines not shown here.
*/
658 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
664 lines = file_lines_load(alist, &nlines, ctdb);
666 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
/* ignore empty lines at the end of the file */
669 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
673 for (i=0;i<nlines;i++) {
/* skip leading whitespace */
681 while ((*line == ' ') || (*line == '\t')) {
687 if (strcmp(line, "") == 0) {
690 tok = strtok(line, " \t");
692 tok = strtok(NULL, " \t");
694 if (NULL == ctdb->default_public_interface) {
695 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
700 ifaces = ctdb->default_public_interface;
705 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
706 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
710 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces)) {
711 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
/*
  Configure a single public address: allocates a zeroed struct ctdb_vnn
  on ctdb with a one-entry, NULL-terminated interface array, parses ip
  into its public_address, registers the interface with
  ctdb_add_local_iface(), assigns it with ctdb_vnn_assign_iface() and
  stores the result in ctdb->single_ip_vnn.
  NOTE(review): the remaining parameters, the error paths (including
  whether svnn is freed on failure) and return values are in lines not
  shown here.
*/
721 int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
725 struct ctdb_vnn *svnn;
729 svnn = talloc_zero(ctdb, struct ctdb_vnn);
730 CTDB_NO_MEMORY(ctdb, svnn);
/* room for one interface name plus the NULL terminator */
732 svnn->ifaces = talloc_array(svnn, const char *, 2);
733 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
734 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
735 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
736 svnn->ifaces[1] = NULL;
738 ok = parse_ip(ip, iface, 0, &svnn->public_address);
744 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
746 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
747 "for single_ip[%s]\n",
749 ctdb_addr_to_str(&svnn->public_address)));
754 ret = ctdb_vnn_assign_iface(ctdb, svnn);
760 ctdb->single_ip_vnn = svnn;
/*
  Singly linked list node describing one public address during a
  takeover run.
  NOTE(review): other code in this file reads ->pnn and ->addr, which are
  declared in lines not shown here.
*/
764 struct ctdb_public_ip_list {
765 struct ctdb_public_ip_list *next;
771 /* Given a physical node, return the number of
772 public addresses that are currently assigned to this node.
/*
  Walk the list ips and test each entry for ips->pnn == pnn.
  NOTE(review): the counter and the return statement are in lines not
  shown here; callers store the result in num and compare it between
  nodes.
*/
774 static int node_ip_coverage(struct ctdb_context *ctdb,
776 struct ctdb_public_ip_list *ips)
780 for (;ips;ips=ips->next) {
781 if (ips->pnn == pnn) {
789 /* Check if this is a public ip known to the node, i.e. can that
790 node takeover this ip ?
/*
  Check whether ip->addr appears in the public address list of node pnn
  (ctdb->nodes[pnn]->public_ips), comparing with ctdb_same_ip().
  NOTE(review): the return statements are not visible here. Callers treat
  a non-zero result as "this node cannot serve the ip", so success is
  presumably 0. pnn is used as an array index without a visible range
  check.
*/
792 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn,
793 struct ctdb_public_ip_list *ip)
795 struct ctdb_all_public_ips *public_ips;
798 public_ips = ctdb->nodes[pnn]->public_ips;
/* a node without a public address list cannot match anything */
800 if (public_ips == NULL) {
804 for (i=0;i<public_ips->num;i++) {
805 if (ctdb_same_ip(&ip->addr, &public_ips->ips[i].addr)) {
806 /* yes, this node can serve this public ip */
815 /* search the node list for a node to take over this ip.
816 pick the node that is currently serving the fewest ips
817 so that the ips get spread out evenly.
/*
  Look for a node that can take over ip.
  Visible behaviour: every node in nodemap is examined; nodes whose flags
  intersect mask are tested for, nodes for which can_node_serve_ip()
  returns non-zero are tested for, and the coverage of the remaining
  nodes is measured with node_ip_coverage() over all_ips. A warning is
  logged when no node was found.
  NOTE(review): the selection of the minimum, the update of ip->pnn and
  the return values are in lines not shown here; ctdb_takeover_run treats
  a non-zero result as failure.
*/
819 static int find_takeover_node(struct ctdb_context *ctdb,
820 struct ctdb_node_map *nodemap, uint32_t mask,
821 struct ctdb_public_ip_list *ip,
822 struct ctdb_public_ip_list *all_ips)
828 for (i=0;i<nodemap->num;i++) {
829 if (nodemap->nodes[i].flags & mask) {
830 /* This node is not healty and can not be used to serve
836 /* verify that this node can serve this ip */
837 if (can_node_serve_ip(ctdb, i, ip)) {
838 /* no it couldnt so skip to the next node */
842 num = node_ip_coverage(ctdb, i, all_ips);
843 /* was this the first node we checked ? */
855 DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
856 ctdb_addr_to_str(&ip->addr)));
/*
  Build an IP_KEYLEN-word tree key for ip in a function-static buffer.
  The buffer is zeroed on every call; an IPv4 address fills word 3 only,
  an IPv6 address fills words 0..3, each passed through htonl(). An
  unknown address family is logged at DEBUG_ERR.
  NOTE(review): the case labels and the return statement are in lines not
  shown here. Because the buffer is static, the result of one call is
  overwritten by the next call.
*/
866 static uint32_t *ip_key(ctdb_sock_addr *ip)
868 static uint32_t key[IP_KEYLEN];
870 bzero(key, sizeof(key));
872 switch (ip->sa.sa_family) {
874 key[3] = htonl(ip->ip.sin_addr.s_addr);
877 key[0] = htonl(ip->ip6.sin6_addr.s6_addr32[0]);
878 key[1] = htonl(ip->ip6.sin6_addr.s6_addr32[1]);
879 key[2] = htonl(ip->ip6.sin6_addr.s6_addr32[2]);
880 key[3] = htonl(ip->ip6.sin6_addr.s6_addr32[3]);
883 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
/* NOTE(review): only the signature is visible here; presumably the
   callback handed to trbt_insertarray32_callback() in
   create_merged_ip_list - confirm. */
890 static void *add_ip_callback(void *parm, void *data)
/*
  Tree traversal callback: param is the address of a list head
  (struct ctdb_public_ip_list **) and data is the node being visited.
  The node is linked in front of the current head.
  NOTE(review): the update of *ip_list is in a line not shown here.
*/
895 void getips_count_callback(void *param, void *data)
897 struct ctdb_public_ip_list **ip_list = (struct ctdb_public_ip_list **)param;
898 struct ctdb_public_ip_list *new_ip = (struct ctdb_public_ip_list *)data;
900 new_ip->next = *ip_list;
/*
  Build one list of public addresses from the per-node lists
  ctdb->nodes[i]->public_ips.
  Visible behaviour: a tree is created on tmp_ctx; for every node the
  NODE_FLAGS_DELETED flag and a NULL public_ips are tested for; each
  address is copied into a zeroed struct ctdb_public_ip_list allocated on
  tmp_ctx and inserted into the tree under ip_key(); finally the tree is
  traversed with getips_count_callback, which collects the entries into
  ip_list.
  NOTE(review): the initialisation of ip_list, the insert callback
  arguments and the return statement are in lines not shown here.
*/
904 struct ctdb_public_ip_list *
905 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
908 struct ctdb_public_ip_list *ip_list;
909 struct ctdb_all_public_ips *public_ips;
910 trbt_tree_t *ip_tree;
912 ip_tree = trbt_create(tmp_ctx, 0);
914 for (i=0;i<ctdb->num_nodes;i++) {
915 public_ips = ctdb->nodes[i]->public_ips;
917 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
921 /* there were no public ips for this node */
922 if (public_ips == NULL) {
926 for (j=0;j<public_ips->num;j++) {
927 struct ctdb_public_ip_list *tmp_ip;
929 tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
930 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
931 tmp_ip->pnn = public_ips->ips[j].pnn;
932 tmp_ip->addr = public_ips->ips[j].addr;
/* keyed by address, so the same address reported by several nodes maps
   to one key */
935 trbt_insertarray32_callback(ip_tree,
936 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
/* flatten the tree into a linked list */
943 trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
949 make any IP alias changes for public addresses that are necessary
/*
  Recompute which node serves each public address and push the result to
  the cluster.
  Visible steps:
   - count nodes with neither NODE_FLAGS_INACTIVE nor NODE_FLAGS_DISABLED
     and pick the exclusion mask accordingly (disabled nodes are only
     usable when no fully healthy node exists);
   - build the merged address list with create_merged_ip_list();
   - with the deterministic_public_ips tunable set to 1, preassign
     addresses round-robin (index modulo nodemap->num);
   - test assignments against the mask and against can_node_serve_ip();
   - give unassigned addresses (pnn == -1) a node with
     find_takeover_node();
   - unless failback is disabled, compare the highest and lowest coverage
     among eligible nodes and move addresses when they differ by more
     than one;
   - send RELEASE_IP controls to connected nodes for addresses they must
     not hold, wait, then send TAKEOVER_IP controls to the chosen nodes
     and wait. AF_INET addresses use the IPv4 variants of both controls.
  All temporaries hang off tmp_ctx, which is freed on the visible exit
  paths.
  NOTE(review): return values, the retry bookkeeping and several loop
  bodies are in lines not shown here. nodemap->num is used as a modulus
  without a visible zero check.
*/
951 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
953 int i, num_healthy, retries;
954 struct ctdb_public_ip ip;
955 struct ctdb_public_ipv4 ipv4;
957 struct ctdb_public_ip_list *all_ips, *tmp_ip;
958 int maxnode, maxnum=0, minnode, minnum=0, num;
960 struct timeval timeout;
961 struct client_async_data *async_data;
962 struct ctdb_client_control_state *state;
/* parent for every allocation made during this run */
963 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
968 /* Count how many completely healthy nodes we have */
970 for (i=0;i<nodemap->num;i++) {
971 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
976 if (num_healthy > 0) {
977 /* We have healthy nodes, so only consider them for
978 serving public addresses
980 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
982 /* We didnt have any completely healthy nodes so
983 use "disabled" nodes as a fallback
985 mask = NODE_FLAGS_INACTIVE;
988 /* since nodes only know about those public addresses that
989 can be served by that particular node, no single node has
990 a full list of all public addresses that exist in the cluster.
991 Walk over all node structures and create a merged list of
992 all public addresses that exist in the cluster.
994 all_ips = create_merged_ip_list(ctdb, tmp_ctx);
996 /* If we want deterministic ip allocations, i.e. that the ip addresses
997 will always be allocated the same way for a specific set of
998 available/unavailable nodes.
1000 if (1 == ctdb->tunable.deterministic_public_ips) {
1001 DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
1002 for (i=0,tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next,i++) {
1003 tmp_ip->pnn = i%nodemap->num;
1008 /* mark all public addresses with a masked node as being served by
1011 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1012 if (tmp_ip->pnn == -1) {
1015 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
1020 /* verify that the assigned nodes can serve that public ip
1021 and set it to -1 if not
1023 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1024 if (tmp_ip->pnn == -1) {
1027 if (can_node_serve_ip(ctdb, tmp_ip->pnn, tmp_ip) != 0) {
1028 /* this node can not serve this ip. */
1034 /* now we must redistribute all public addresses with takeover node
1035 -1 among the nodes available
1039 /* loop over all ip's and find a physical node to cover for
1042 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1043 if (tmp_ip->pnn == -1) {
1044 if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
1045 DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
1046 ctdb_addr_to_str(&tmp_ip->addr)));
1051 /* If we dont want ips to fail back after a node becomes healthy
1052 again, we wont even try to reallocat the ip addresses so that
1053 they are evenly spread out.
1054 This can NOT be used at the same time as DeterministicIPs !
1056 if (1 == ctdb->tunable.no_ip_failback) {
1057 if (1 == ctdb->tunable.deterministic_public_ips) {
1058 DEBUG(DEBUG_ERR, ("ERROR: You can not use 'DeterministicIPs' and 'NoIPFailback' at the same time\n"));
1064 /* now, try to make sure the ip adresses are evenly distributed
1066 for each ip address, loop over all nodes that can serve this
1067 ip and make sure that the difference between the node
1068 serving the most and the node serving the least ip's are not greater
1071 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1072 if (tmp_ip->pnn == -1) {
1076 /* Get the highest and lowest number of ips's served by any
1077 valid node which can serve this ip.
1081 for (i=0;i<nodemap->num;i++) {
1082 if (nodemap->nodes[i].flags & mask) {
1086 /* only check nodes that can actually serve this ip */
1087 if (can_node_serve_ip(ctdb, i, tmp_ip)) {
1088 /* no it couldnt so skip to the next node */
/* coverage of node i across the whole merged list */
1092 num = node_ip_coverage(ctdb, i, all_ips);
1093 if (maxnode == -1) {
1102 if (minnode == -1) {
1112 if (maxnode == -1) {
1113 DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
1114 ctdb_addr_to_str(&tmp_ip->addr)));
1119 /* If we want deterministic IPs then dont try to reallocate
1120 them to spread out the load.
1122 if (1 == ctdb->tunable.deterministic_public_ips) {
1126 /* if the spread between the smallest and largest coverage by
1127 a node is >=2 we steal one of the ips from the node with
1128 most coverage to even things out a bit.
1129 try to do this at most 5 times since we dont want to spend
1130 too much time balancing the ip coverage.
1132 if ( (maxnum > minnum+1)
1134 struct ctdb_public_ip_list *tmp;
1136 /* mark one of maxnode's vnn's as unassigned and try
1139 for (tmp=all_ips;tmp;tmp=tmp->next) {
1140 if (tmp->pnn == maxnode) {
1150 /* finished distributing the public addresses, now just send the
1151 info out to the nodes
1155 /* at this point ->pnn is the node which will own each IP
1156 or -1 if there is no node that can cover this ip
1159 /* now tell all nodes to delete any alias that they should not
1160 have. This will be a NOOP on nodes that don't currently
1161 hold the given alias */
/* collects the outstanding release controls so they can be awaited together */
1162 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1163 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1165 for (i=0;i<nodemap->num;i++) {
1166 /* don't talk to unconnected nodes, but do talk to banned nodes */
1167 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1171 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1172 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1173 /* This node should be serving this
1174 vnn so dont tell it to release the ip
1178 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1179 ipv4.pnn = tmp_ip->pnn;
1180 ipv4.sin = tmp_ip->addr.ip;
1182 timeout = TAKEOVER_TIMEOUT();
1183 data.dsize = sizeof(ipv4);
1184 data.dptr = (uint8_t *)&ipv4;
1185 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1186 0, CTDB_CONTROL_RELEASE_IPv4, 0,
1190 ip.pnn = tmp_ip->pnn;
1191 ip.addr = tmp_ip->addr;
1193 timeout = TAKEOVER_TIMEOUT();
1194 data.dsize = sizeof(ip);
1195 data.dptr = (uint8_t *)&ip;
1196 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1197 0, CTDB_CONTROL_RELEASE_IP, 0,
1202 if (state == NULL) {
1203 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1204 talloc_free(tmp_ctx);
1208 ctdb_client_async_add(async_data, state);
1211 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1212 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1213 talloc_free(tmp_ctx);
1216 talloc_free(async_data);
1219 /* tell all nodes to get their own IPs */
/* fresh collector for the takeover controls */
1220 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1221 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1222 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1223 if (tmp_ip->pnn == -1) {
1224 /* this IP won't be taken over */
/* IPv4 addresses go out with the legacy IPv4 control and payload */
1228 if (tmp_ip->addr.sa.sa_family == AF_INET) {
1229 ipv4.pnn = tmp_ip->pnn;
1230 ipv4.sin = tmp_ip->addr.ip;
1232 timeout = TAKEOVER_TIMEOUT();
1233 data.dsize = sizeof(ipv4);
1234 data.dptr = (uint8_t *)&ipv4;
1235 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1236 0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
1240 ip.pnn = tmp_ip->pnn;
1241 ip.addr = tmp_ip->addr;
1243 timeout = TAKEOVER_TIMEOUT();
1244 data.dsize = sizeof(ip);
1245 data.dptr = (uint8_t *)&ip;
1246 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1247 0, CTDB_CONTROL_TAKEOVER_IP, 0,
1251 if (state == NULL) {
1252 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1253 talloc_free(tmp_ctx);
1257 ctdb_client_async_add(async_data, state);
/* wait for every takeover control to complete */
1259 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1260 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1261 talloc_free(tmp_ctx);
/* releases the merged list and both async collectors */
1265 talloc_free(tmp_ctx);
1271 destroy a ctdb_client_ip structure
/*
  talloc destructor installed by ctdb_control_tcp_client: logs the
  address and unlinks the entry from ip->ctdb->client_ip_list.
  NOTE(review): the return statement is not visible here. The port is
  read through the IPv4 view of the address for every family.
*/
1273 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
1275 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
1276 ctdb_addr_to_str(&ip->addr),
1277 ntohs(ip->addr.ip.sin_port),
1280 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
1285 called by a client to inform us of a TCP connection that it is managing
1286 that should be tickled with an ACK when IP takeover is done
1287 we handle both the old ipv4 style of packets as well as the new ipv4/6
/*
  Control handler: a local client registers a tcp connection.
  The payload is accepted in two layouts selected by indata.dsize: the
  old struct ctdb_control_tcp (converted into a zeroed
  struct ctdb_control_tcp_addr) or struct ctdb_control_tcp_addr itself;
  any other size is logged as an error.
  Visible behaviour: both endpoints are canonicalised, the destination is
  looked up as a public address, a request for an address held by another
  node is logged as a failure, a struct ctdb_client_ip (with destructor)
  is added to ctdb->client_ip_list and a struct ctdb_tcp_list to
  client->tcp_list, both as talloc children of the client, and a
  CTDB_CONTROL_TCP_ADD control without reply is broadcast to connected
  nodes.
  NOTE(review): the case labels, the handling of a failed vnn lookup, the
  filling of ip->ctdb and ip->addr, and all return values are in lines
  not shown here. client is read (client->pid) without a visible NULL
  check.
*/
1290 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
1293 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
1294 struct ctdb_control_tcp *old_addr = NULL;
1295 struct ctdb_control_tcp_addr new_addr;
1296 struct ctdb_control_tcp_addr *tcp_sock = NULL;
1297 struct ctdb_tcp_list *tcp;
1298 struct ctdb_control_tcp_vnn t;
1301 struct ctdb_client_ip *ip;
1302 struct ctdb_vnn *vnn;
1303 ctdb_sock_addr addr;
/* the payload size selects the wire layout */
1305 switch (indata.dsize) {
1306 case sizeof(struct ctdb_control_tcp):
1307 old_addr = (struct ctdb_control_tcp *)indata.dptr;
1308 ZERO_STRUCT(new_addr);
1309 tcp_sock = &new_addr;
1310 tcp_sock->src.ip = old_addr->src;
1311 tcp_sock->dest.ip = old_addr->dest;
1313 case sizeof(struct ctdb_control_tcp_addr):
1314 tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
1317 DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
1318 "to ctdb_control_tcp_client. size was %d but "
1319 "only allowed sizes are %lu and %lu\n",
1321 (long unsigned)sizeof(struct ctdb_control_tcp),
1322 (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
/* canonicalise through a temporary so source and destination of the
   conversion do not alias */
1326 addr = tcp_sock->src;
1327 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
1328 addr = tcp_sock->dest;
1329 ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
1332 memcpy(&addr, &tcp_sock->dest, sizeof(addr));
1333 vnn = find_public_ip_vnn(ctdb, &addr);
1335 switch (addr.sa.sa_family) {
1337 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
1338 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
1339 ctdb_addr_to_str(&addr)));
1343 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
1344 ctdb_addr_to_str(&addr)));
1347 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
1353 if (vnn->pnn != ctdb->pnn) {
1354 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
1355 ctdb_addr_to_str(&addr),
1356 client_id, client->pid));
1357 /* failing this call will tell smbd to die */
/* child of the client, so it goes away together with the client */
1361 ip = talloc(client, struct ctdb_client_ip);
1362 CTDB_NO_MEMORY(ctdb, ip);
1366 ip->client_id = client_id;
1367 talloc_set_destructor(ip, ctdb_client_ip_destructor);
1368 DLIST_ADD(ctdb->client_ip_list, ip);
1370 tcp = talloc(client, struct ctdb_tcp_list);
1371 CTDB_NO_MEMORY(ctdb, tcp);
1373 tcp->connection.src_addr = tcp_sock->src;
1374 tcp->connection.dst_addr = tcp_sock->dest;
1376 DLIST_ADD(client->tcp_list, tcp);
/* payload of the broadcast below */
1378 t.src = tcp_sock->src;
1379 t.dest = tcp_sock->dest;
1381 data.dptr = (uint8_t *)&t;
1382 data.dsize = sizeof(t);
1384 switch (addr.sa.sa_family) {
1386 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1387 (unsigned)ntohs(tcp_sock->dest.ip.sin_port),
1388 ctdb_addr_to_str(&tcp_sock->src),
1389 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
1392 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
1393 (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port),
1394 ctdb_addr_to_str(&tcp_sock->src),
1395 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
1398 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
1402 /* tell all nodes about this tcp connection */
1403 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
1404 CTDB_CONTROL_TCP_ADD,
1405 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1407 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
1415 find a tcp address on a list
/*
  Search array for a connection whose source and destination sockaddrs
  both equal those of tcp (ctdb_same_sockaddr) and return a pointer to
  the stored entry.
  NOTE(review): the results for a NULL array and for no match are in
  lines not shown here; ctdb_control_tcp_add compares the result with
  NULL.
*/
1417 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
1418 struct ctdb_tcp_connection *tcp)
1422 if (array == NULL) {
1426 for (i=0;i<array->num;i++) {
1427 if (ctdb_same_sockaddr(&array->connections[i].src_addr, &tcp->src_addr) &&
1428 ctdb_same_sockaddr(&array->connections[i].dst_addr, &tcp->dst_addr)) {
1429 return &array->connections[i];
1436 called by a daemon to inform us of a TCP connection that one of its
1437 clients is managing and that should be tickled with an ACK when IP takeover is
/*
  Control handler: record the tcp connection described by indata (read as
  a struct ctdb_control_tcp_vnn) in the tcp array of the matching public
  address.
  Visible behaviour: a destination that is not a public address is logged
  at DEBUG_INFO; when the vnn has no array yet one is created with a
  single connection slot; a connection that is already recorded
  (ctdb_tcp_find) is logged at DEBUG_DEBUG; otherwise the connections
  array is grown with talloc_realloc and the new entry is written at
  index tcparray->num.
  NOTE(review): updates of tcparray->num, the new element count passed to
  talloc_realloc and the return values are in lines not shown here.
*/
1440 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
1442 struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
1443 struct ctdb_tcp_array *tcparray;
1444 struct ctdb_tcp_connection tcp;
1445 struct ctdb_vnn *vnn;
1447 vnn = find_public_ip_vnn(ctdb, &p->dest);
1449 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
1450 ctdb_addr_to_str(&p->dest)));
1456 tcparray = vnn->tcp_array;
1458 /* If this is the first tickle */
1459 if (tcparray == NULL) {
/* NOTE(review): the array is parented to ctdb->nodes rather than to the
   vnn - confirm this is intended. */
1460 tcparray = talloc_size(ctdb->nodes,
1461 offsetof(struct ctdb_tcp_array, connections) +
1462 sizeof(struct ctdb_tcp_connection) * 1);
1463 CTDB_NO_MEMORY(ctdb, tcparray);
1464 vnn->tcp_array = tcparray;
1467 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
1468 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1470 tcparray->connections[tcparray->num].src_addr = p->src;
1471 tcparray->connections[tcparray->num].dst_addr = p->dest;
1477 /* Do we already have this tickle ?*/
1478 tcp.src_addr = p->src;
1479 tcp.dst_addr = p->dest;
1480 if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1481 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
1482 ctdb_addr_to_str(&tcp.dst_addr),
1483 ntohs(tcp.dst_addr.ip.sin_port),
1488 /* A new tickle, we must add it to the array */
/* NOTE(review): the result overwrites tcparray->connections directly, so
   a failed realloc leaves the field NULL - confirm the error path. */
1489 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1490 struct ctdb_tcp_connection,
1492 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1494 vnn->tcp_array = tcparray;
1495 tcparray->connections[tcparray->num].src_addr = p->src;
1496 tcparray->connections[tcparray->num].dst_addr = p->dest;
1499 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
1500 ctdb_addr_to_str(&tcp.dst_addr),
1501 ntohs(tcp.dst_addr.ip.sin_port),
1509 called by a daemon to inform us of a TCP connection that one of its
1510 clients managing that should tickled with an ACK when IP takeover is
1513 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1515 struct ctdb_tcp_connection *tcpp;
1516 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
1519 DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
1520 ctdb_addr_to_str(&conn->dst_addr)));
1524 /* if the array is empty we cant remove it
1525 and we dont need to do anything
1527 if (vnn->tcp_array == NULL) {
1528 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1529 ctdb_addr_to_str(&conn->dst_addr),
1530 ntohs(conn->dst_addr.ip.sin_port)));
1535 /* See if we know this connection
1536 if we dont know this connection then we dont need to do anything
1538 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1540 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
1541 ctdb_addr_to_str(&conn->dst_addr),
1542 ntohs(conn->dst_addr.ip.sin_port)));
1547 /* We need to remove this entry from the array.
1548 Instead of allocating a new array and copying data to it
1549 we cheat and just copy the last entry in the existing array
1550 to the entry that is to be removed and just shring the
1553 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1554 vnn->tcp_array->num--;
1556 /* If we deleted the last entry we also need to remove the entire array
1558 if (vnn->tcp_array->num == 0) {
1559 talloc_free(vnn->tcp_array);
1560 vnn->tcp_array = NULL;
1563 vnn->tcp_update_needed = true;
1565 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
1566 ctdb_addr_to_str(&conn->src_addr),
1567 ntohs(conn->src_addr.ip.sin_port)));
/*
  called when a daemon restarts - send all tickles for all public addresses
  we are serving immediately to the new node.

  Currently a stub: nothing is sent and 0 is always returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
	/*XXX here we should send all tickles we are serving to the new node */
	return 0;
}
1583 called when a client structure goes away - hook to remove
1584 elements from the tcp_list in all daemons
1586 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1588 while (client->tcp_list) {
1589 struct ctdb_tcp_list *tcp = client->tcp_list;
1590 DLIST_REMOVE(client->tcp_list, tcp);
1591 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1597 release all IPs on shutdown
1599 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1601 struct ctdb_vnn *vnn;
1603 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1604 if (!ctdb_sys_have_ip(&vnn->public_address)) {
1605 ctdb_vnn_unassign_iface(ctdb, vnn);
1611 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
1612 ctdb_vnn_iface_string(vnn),
1613 ctdb_addr_to_str(&vnn->public_address),
1614 vnn->public_netmask_bits);
1615 release_kill_clients(ctdb, &vnn->public_address);
1616 ctdb_vnn_unassign_iface(ctdb, vnn);
1622 get list of public IPs
1624 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
1625 struct ctdb_req_control *c, TDB_DATA *outdata)
1628 struct ctdb_all_public_ips *ips;
1629 struct ctdb_vnn *vnn;
1631 /* count how many public ip structures we have */
1633 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1637 len = offsetof(struct ctdb_all_public_ips, ips) +
1638 num*sizeof(struct ctdb_public_ip);
1639 ips = talloc_zero_size(outdata, len);
1640 CTDB_NO_MEMORY(ctdb, ips);
1642 outdata->dsize = len;
1643 outdata->dptr = (uint8_t *)ips;
1647 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1648 ips->ips[i].pnn = vnn->pnn;
1649 ips->ips[i].addr = vnn->public_address;
1658 get list of public IPs, old ipv4 style. only returns ipv4 addresses
1660 int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb,
1661 struct ctdb_req_control *c, TDB_DATA *outdata)
1664 struct ctdb_all_public_ipsv4 *ips;
1665 struct ctdb_vnn *vnn;
1667 /* count how many public ip structures we have */
1669 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1670 if (vnn->public_address.sa.sa_family != AF_INET) {
1676 len = offsetof(struct ctdb_all_public_ipsv4, ips) +
1677 num*sizeof(struct ctdb_public_ipv4);
1678 ips = talloc_zero_size(outdata, len);
1679 CTDB_NO_MEMORY(ctdb, ips);
1681 outdata->dsize = len;
1682 outdata->dptr = (uint8_t *)ips;
1686 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1687 if (vnn->public_address.sa.sa_family != AF_INET) {
1690 ips->ips[i].pnn = vnn->pnn;
1691 ips->ips[i].sin = vnn->public_address.ip;
1700 structure containing the listening socket and the list of tcp connections
1701 that the ctdb daemon is to kill
1703 struct ctdb_kill_tcp {
1704 struct ctdb_vnn *vnn;
1705 struct ctdb_context *ctdb;
1707 struct fd_event *fde;
1708 trbt_tree_t *connections;
1713 a tcp connection that is to be killed
1715 struct ctdb_killtcp_con {
1716 ctdb_sock_addr src_addr;
1717 ctdb_sock_addr dst_addr;
1719 struct ctdb_kill_tcp *killtcp;
1722 /* this function is used to create a key to represent this socketpair
1723 in the killtcp tree.
1724 this key is used to insert and lookup matching socketpairs that are
1725 to be tickled and RST
1727 #define KILLTCP_KEYLEN 10
1728 static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
1730 static uint32_t key[KILLTCP_KEYLEN];
1732 bzero(key, sizeof(key));
1734 if (src->sa.sa_family != dst->sa.sa_family) {
1735 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
1739 switch (src->sa.sa_family) {
1741 key[0] = dst->ip.sin_addr.s_addr;
1742 key[1] = src->ip.sin_addr.s_addr;
1743 key[2] = dst->ip.sin_port;
1744 key[3] = src->ip.sin_port;
1747 key[0] = dst->ip6.sin6_addr.s6_addr32[3];
1748 key[1] = src->ip6.sin6_addr.s6_addr32[3];
1749 key[2] = dst->ip6.sin6_addr.s6_addr32[2];
1750 key[3] = src->ip6.sin6_addr.s6_addr32[2];
1751 key[4] = dst->ip6.sin6_addr.s6_addr32[1];
1752 key[5] = src->ip6.sin6_addr.s6_addr32[1];
1753 key[6] = dst->ip6.sin6_addr.s6_addr32[0];
1754 key[7] = src->ip6.sin6_addr.s6_addr32[0];
1755 key[8] = dst->ip6.sin6_port;
1756 key[9] = src->ip6.sin6_port;
1759 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
1767 called when we get a read event on the raw socket
1769 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
1770 uint16_t flags, void *private_data)
1772 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1773 struct ctdb_killtcp_con *con;
1774 ctdb_sock_addr src, dst;
1775 uint32_t ack_seq, seq;
1777 if (!(flags & EVENT_FD_READ)) {
1781 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1782 killtcp->private_data,
1784 &ack_seq, &seq) != 0) {
1785 /* probably a non-tcp ACK packet */
1789 /* check if we have this guy in our list of connections
1792 con = trbt_lookuparray32(killtcp->connections,
1793 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1795 /* no this was some other packet we can just ignore */
1799 /* This one has been tickled !
1800 now reset him and remove him from the list.
1802 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
1803 ntohs(con->dst_addr.ip.sin_port),
1804 ctdb_addr_to_str(&con->src_addr),
1805 ntohs(con->src_addr.ip.sin_port)));
1807 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
1812 /* when traversing the list of all tcp connections to send tickle acks to
1813 (so that we can capture the ack coming back and kill the connection
1815 this callback is called for each connection we are currently trying to kill
1817 static void tickle_connection_traverse(void *param, void *data)
1819 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1821 /* have tried too many times, just give up */
1822 if (con->count >= 5) {
1827 /* othervise, try tickling it again */
1830 (ctdb_sock_addr *)&con->dst_addr,
1831 (ctdb_sock_addr *)&con->src_addr,
1837 called every second until all sentenced connections have been reset
1839 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
1840 struct timeval t, void *private_data)
1842 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1845 /* loop over all connections sending tickle ACKs */
1846 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, NULL);
1849 /* If there are no more connections to kill we can remove the
1850 entire killtcp structure
1852 if ( (killtcp->connections == NULL) ||
1853 (killtcp->connections->root == NULL) ) {
1854 talloc_free(killtcp);
1858 /* try tickling them again in a seconds time
1860 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
1861 ctdb_tickle_sentenced_connections, killtcp);
1865 destroy the killtcp structure
1867 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1869 killtcp->vnn->killtcp = NULL;
/* nothing fancy here, just unconditionally replace any existing
   connection structure with the new one.

   dont even free the old one if it did exist, that one is talloc_stolen
   by the same node in the tree anyway and will be deleted when the new data
   is deleted

   'parm' is the new connection handed to the insert call, 'data' is the
   entry already stored under the same key (ignored).
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	return parm;
}
1887 add a tcp socket to the list of connections we want to RST
1889 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
1893 ctdb_sock_addr src, dst;
1894 struct ctdb_kill_tcp *killtcp;
1895 struct ctdb_killtcp_con *con;
1896 struct ctdb_vnn *vnn;
1898 ctdb_canonicalize_ip(s, &src);
1899 ctdb_canonicalize_ip(d, &dst);
1901 vnn = find_public_ip_vnn(ctdb, &dst);
1903 vnn = find_public_ip_vnn(ctdb, &src);
1906 /* if it is not a public ip it could be our 'single ip' */
1907 if (ctdb->single_ip_vnn) {
1908 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
1909 vnn = ctdb->single_ip_vnn;
1914 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
1918 killtcp = vnn->killtcp;
1920 /* If this is the first connection to kill we must allocate
1923 if (killtcp == NULL) {
1924 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
1925 CTDB_NO_MEMORY(ctdb, killtcp);
1928 killtcp->ctdb = ctdb;
1929 killtcp->capture_fd = -1;
1930 killtcp->connections = trbt_create(killtcp, 0);
1932 vnn->killtcp = killtcp;
1933 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
1938 /* create a structure that describes this connection we want to
1939 RST and store it in killtcp->connections
1941 con = talloc(killtcp, struct ctdb_killtcp_con);
1942 CTDB_NO_MEMORY(ctdb, con);
1943 con->src_addr = src;
1944 con->dst_addr = dst;
1946 con->killtcp = killtcp;
1949 trbt_insertarray32_callback(killtcp->connections,
1950 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
1951 add_killtcp_callback, con);
1954 If we dont have a socket to listen on yet we must create it
1956 if (killtcp->capture_fd == -1) {
1957 const char *iface = ctdb_vnn_iface_string(vnn);
1958 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
1959 if (killtcp->capture_fd == -1) {
1960 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
1961 "socket on iface '%s' for killtcp (%s)\n",
1962 iface, strerror(errno)));
1968 if (killtcp->fde == NULL) {
1969 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
1970 EVENT_FD_READ | EVENT_FD_AUTOCLOSE,
1971 capture_tcp_handler, killtcp);
1973 /* We also need to set up some events to tickle all these connections
1974 until they are all reset
1976 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
1977 ctdb_tickle_sentenced_connections, killtcp);
1980 /* tickle him once now */
1989 talloc_free(vnn->killtcp);
1990 vnn->killtcp = NULL;
1995 kill a TCP connection.
1997 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
1999 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
2001 return ctdb_killtcp_add_connection(ctdb, &killtcp->src_addr, &killtcp->dst_addr);
2005 called by a daemon to inform us of the entire list of TCP tickles for
2006 a particular public address.
2007 this control should only be sent by the node that is currently serving
2008 that public address.
2010 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2012 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
2013 struct ctdb_tcp_array *tcparray;
2014 struct ctdb_vnn *vnn;
2016 /* We must at least have tickles.num or else we cant verify the size
2017 of the received data blob
2019 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
2020 tickles.connections)) {
2021 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
2025 /* verify that the size of data matches what we expect */
2026 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
2027 tickles.connections)
2028 + sizeof(struct ctdb_tcp_connection)
2029 * list->tickles.num) {
2030 DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
2034 vnn = find_public_ip_vnn(ctdb, &list->addr);
2036 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
2037 ctdb_addr_to_str(&list->addr)));
2042 /* remove any old ticklelist we might have */
2043 talloc_free(vnn->tcp_array);
2044 vnn->tcp_array = NULL;
2046 tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
2047 CTDB_NO_MEMORY(ctdb, tcparray);
2049 tcparray->num = list->tickles.num;
2051 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
2052 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2054 memcpy(tcparray->connections, &list->tickles.connections[0],
2055 sizeof(struct ctdb_tcp_connection)*tcparray->num);
2057 /* We now have a new fresh tickle list array for this vnn */
2058 vnn->tcp_array = talloc_steal(vnn, tcparray);
2064 called to return the full list of tickles for the puclic address associated
2065 with the provided vnn
2067 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
2069 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
2070 struct ctdb_control_tcp_tickle_list *list;
2071 struct ctdb_tcp_array *tcparray;
2073 struct ctdb_vnn *vnn;
2075 vnn = find_public_ip_vnn(ctdb, addr);
2077 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
2078 ctdb_addr_to_str(addr)));
2083 tcparray = vnn->tcp_array;
2085 num = tcparray->num;
2090 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
2091 tickles.connections)
2092 + sizeof(struct ctdb_tcp_connection) * num;
2094 outdata->dptr = talloc_size(outdata, outdata->dsize);
2095 CTDB_NO_MEMORY(ctdb, outdata->dptr);
2096 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
2099 list->tickles.num = num;
2101 memcpy(&list->tickles.connections[0], tcparray->connections,
2102 sizeof(struct ctdb_tcp_connection) * num);
2110 set the list of all tcp tickles for a public address
2112 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
2113 struct timeval timeout, uint32_t destnode,
2114 ctdb_sock_addr *addr,
2115 struct ctdb_tcp_array *tcparray)
2119 struct ctdb_control_tcp_tickle_list *list;
2122 num = tcparray->num;
2127 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
2128 tickles.connections) +
2129 sizeof(struct ctdb_tcp_connection) * num;
2130 data.dptr = talloc_size(ctdb, data.dsize);
2131 CTDB_NO_MEMORY(ctdb, data.dptr);
2133 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
2135 list->tickles.num = num;
2137 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
2140 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2141 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
2142 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2144 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
2148 talloc_free(data.dptr);
2155 perform tickle updates if required
2157 static void ctdb_update_tcp_tickles(struct event_context *ev,
2158 struct timed_event *te,
2159 struct timeval t, void *private_data)
2161 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
2163 struct ctdb_vnn *vnn;
2165 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2166 /* we only send out updates for public addresses that
2169 if (ctdb->pnn != vnn->pnn) {
2172 /* We only send out the updates if we need to */
2173 if (!vnn->tcp_update_needed) {
2176 ret = ctdb_ctrl_set_tcp_tickles(ctdb,
2178 CTDB_BROADCAST_CONNECTED,
2179 &vnn->public_address,
2182 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
2183 ctdb_addr_to_str(&vnn->public_address)));
2187 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2188 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2189 ctdb_update_tcp_tickles, ctdb);
2194 start periodic update of tcp tickles
2196 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
2198 ctdb->tickle_update_context = talloc_new(ctdb);
2200 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
2201 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
2202 ctdb_update_tcp_tickles, ctdb);
2208 struct control_gratious_arp {
2209 struct ctdb_context *ctdb;
2210 ctdb_sock_addr addr;
2216 send a control_gratuitous arp
2218 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
2219 struct timeval t, void *private_data)
2222 struct control_gratious_arp *arp = talloc_get_type(private_data,
2223 struct control_gratious_arp);
2225 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
2227 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
2228 arp->iface, strerror(errno)));
2233 if (arp->count == CTDB_ARP_REPEAT) {
2238 event_add_timed(arp->ctdb->ev, arp,
2239 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
2240 send_gratious_arp, arp);
2247 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
2249 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
2250 struct control_gratious_arp *arp;
2252 /* verify the size of indata */
2253 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
2254 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
2255 (unsigned)indata.dsize,
2256 (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
2260 ( offsetof(struct ctdb_control_gratious_arp, iface)
2261 + gratious_arp->len ) ){
2263 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2264 "but should be %u bytes\n",
2265 (unsigned)indata.dsize,
2266 (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
2271 arp = talloc(ctdb, struct control_gratious_arp);
2272 CTDB_NO_MEMORY(ctdb, arp);
2275 arp->addr = gratious_arp->addr;
2276 arp->iface = talloc_strdup(arp, gratious_arp->iface);
2277 CTDB_NO_MEMORY(ctdb, arp->iface);
2280 event_add_timed(arp->ctdb->ev, arp,
2281 timeval_zero(), send_gratious_arp, arp);
2286 int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2288 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2291 /* verify the size of indata */
2292 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2293 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2297 ( offsetof(struct ctdb_control_ip_iface, iface)
2300 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2301 "but should be %u bytes\n",
2302 (unsigned)indata.dsize,
2303 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2307 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0]);
2310 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
/*
  called when releaseip event finishes for del_public_address

  'private_data' is the talloc context set up by the caller for the
  event; it is released here regardless of 'status'.
 */
static void delete_ip_callback(struct ctdb_context *ctdb, int status,
				void *private_data)
{
	talloc_free(private_data);
}
2326 int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
2328 struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
2329 struct ctdb_vnn *vnn;
2332 /* verify the size of indata */
2333 if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
2334 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
2338 ( offsetof(struct ctdb_control_ip_iface, iface)
2341 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
2342 "but should be %u bytes\n",
2343 (unsigned)indata.dsize,
2344 (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
2348 /* walk over all public addresses until we find a match */
2349 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2350 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
2351 TALLOC_CTX *mem_ctx;
2353 DLIST_REMOVE(ctdb->vnn, vnn);
2354 if (vnn->iface == NULL) {
2359 mem_ctx = talloc_new(ctdb);
2360 ret = ctdb_event_script_callback(ctdb,
2361 mem_ctx, delete_ip_callback, mem_ctx,
2363 CTDB_EVENT_RELEASE_IP,
2365 ctdb_vnn_iface_string(vnn),
2366 ctdb_addr_to_str(&vnn->public_address),
2367 vnn->public_netmask_bits);
2368 ctdb_vnn_unassign_iface(ctdb, vnn);