4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
// Deadline helper: expands to timeval_current_ofs() with the takeover_timeout
// tunable as its first argument. It reads a variable named ctdb, so one must be
// in scope wherever the macro is expanded (ctdb_takeover_run uses it).
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
// First argument to timeval_current_ofs() when ctdb_control_send_arp re-arms itself.
33 #define CTDB_ARP_INTERVAL 1
// Value that arp->count is compared against in ctdb_control_send_arp.
34 #define CTDB_ARP_REPEAT 3
// Per-takeover state handed to the ctdb_control_send_arp timer callback.
// NOTE(review): this listing omits some original lines (see the gaps in the
// embedded numbering); members used elsewhere in this file (arp->vnn,
// arp->count) are not visible here.
36 struct ctdb_takeover_arp {
37 struct ctdb_context *ctdb;
// address passed to ctdb_sys_send_arp()
39 struct sockaddr_in sin;
// connections that get a tickle ack each time the callback runs
40 struct ctdb_tcp_array *tcparray;
// Doubly linked list node (prev/next) holding one tcp connection.
// ctdb_control_tcp_client links these into client->tcp_list and
// ctdb_takeover_client_destructor_hook unlinks them again.
46 lists of tcp endpoints
48 struct ctdb_tcp_list {
49 struct ctdb_tcp_list *prev, *next;
50 struct ctdb_tcp_connection connection;
// Doubly linked list node recording that a client is using a public address.
// Nodes live on ctdb->client_ip_list and are walked by release_kill_clients.
// NOTE(review): this listing omits some original lines; the client_id member
// assigned in ctdb_control_tcp_client is not visible here.
54 list of clients to kill on IP release
56 struct ctdb_client_ip {
57 struct ctdb_client_ip *prev, *next;
// back pointer used by ctdb_client_ip_destructor to reach the list head
58 struct ctdb_context *ctdb;
// address compared with ctdb_same_ip() in release_kill_clients
59 struct sockaddr_in ip;
// Timed-event handler that announces a taken-over address.
// private_data is the struct ctdb_takeover_arp built by takeover_ip_callback.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering), so some declarations, error checks and braces are not visible.
67 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
68 struct timeval t, void *private_data)
70 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
71 struct ctdb_takeover_arp);
73 struct ctdb_tcp_array *tcparray;
// Step 1: send an ARP for the address on the interface of the owning vnn.
76 ret = ctdb_sys_send_arp(&arp->sin, arp->vnn->iface);
78 DEBUG(0,(__location__ " sending of arp failed (%s)\n", strerror(errno)));
// Step 2: open the socket used for the tickle acks below.
81 s = ctdb_sys_open_sending_socket();
83 DEBUG(0,(__location__ " failed to open raw socket for sending tickles\n"));
// Step 3: one tickle ack per recorded connection; saddr is passed as the
// first address argument and daddr as the second.
87 tcparray = arp->tcparray;
89 for (i=0;i<tcparray->num;i++) {
90 DEBUG(2,("sending tcp tickle ack for %u->%s:%u\n",
91 (unsigned)ntohs(tcparray->connections[i].daddr.sin_port),
92 inet_ntoa(tcparray->connections[i].saddr.sin_addr),
93 (unsigned)ntohs(tcparray->connections[i].saddr.sin_port)));
94 ret = ctdb_sys_send_tcp(s, &tcparray->connections[i].saddr,
95 &tcparray->connections[i].daddr, 0, 0, 0);
97 DEBUG(0,(__location__ " Failed to send tcp tickle ack for %s\n",
98 inet_ntoa(tcparray->connections[i].saddr.sin_addr)));
// arp->count is compared with CTDB_ARP_REPEAT; the body of this branch is not
// visible in this listing (presumably it ends the repeats - TODO confirm).
106 if (arp->count == CTDB_ARP_REPEAT) {
// Re-arm this handler CTDB_ARP_INTERVAL from now. The event is owned by
// vnn->takeover_ctx, which ctdb_control_release_ip frees to stop pending arps.
111 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
112 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
113 ctdb_control_send_arp, arp);
// Context carried from ctdb_control_takeover_ip / ctdb_control_release_ip to
// their event-script completion callbacks.
116 struct takeover_callback_state {
// the control request that is answered via ctdb_request_control_reply()
117 struct ctdb_req_control *c;
// copy of the public address being taken or released
118 struct sockaddr_in *sin;
// the vnn found for that address
119 struct ctdb_vnn *vnn;
// Completion callback for the event script started by ctdb_control_takeover_ip.
// private_data is the struct takeover_callback_state allocated there.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the status test, return statements and the label targeted by
// "goto failed" are not visible.
123 called when takeip event finishes
125 static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
128 struct takeover_callback_state *state =
129 talloc_get_type(private_data, struct takeover_callback_state);
130 struct ctdb_takeover_arp *arp;
// NOTE(review): inet_ntoa() returns a pointer to a static buffer, so ip is
// only valid until the next inet_ntoa() call - confirm nothing in between
// calls it before the DEBUG below.
131 char *ip = inet_ntoa(state->sin->sin_addr);
132 struct ctdb_tcp_array *tcparray;
// monitoring was stopped by ctdb_control_takeover_ip before the script ran
134 ctdb_start_monitoring(ctdb);
// Failure path: log and pass the script status back to the requester.
137 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
138 ip, state->vnn->iface));
139 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
// Create the per-vnn takeover context on first use; it owns the arp state
// and the timed events scheduled below.
144 if (!state->vnn->takeover_ctx) {
145 state->vnn->takeover_ctx = talloc_new(ctdb);
146 if (!state->vnn->takeover_ctx) {
151 arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
152 if (!arp) goto failed;
155 arp->sin = *state->sin;
156 arp->vnn = state->vnn;
158 tcparray = state->vnn->tcp_array;
160 /* add all of the known tcp connections for this IP to the
161 list of tcp connections to send tickle acks for */
// Ownership of the array moves from the vnn to arp; the vnn pointer is
// cleared and the vnn is flagged as needing a tcp update.
162 arp->tcparray = talloc_steal(arp, tcparray);
164 state->vnn->tcp_array = NULL;
165 state->vnn->tcp_update_needed = true;
// First ARP/tickle round is scheduled with a zero timeval.
168 event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx,
169 timeval_zero(), ctdb_control_send_arp, arp);
171 /* the control succeeded */
172 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
// Reply with -1; presumably this is the target of "goto failed" - TODO confirm.
177 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
// Linear search of ctdb->vnn for the entry whose public_address matches ip
// according to ctdb_same_ip().
// NOTE(review): the text below says "returns -1", but the declared return type
// is a pointer; the return statements are not visible in this listing, so
// confirm what callers receive when there is no match.
183 Find the vnn of the node that has a public ip address
184 returns -1 if the address is not known as a public address
186 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, struct sockaddr_in ip)
188 struct ctdb_vnn *vnn;
190 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
191 if (ctdb_same_ip(&vnn->public_address, &ip)) {
// Control handler: take over the public address described by indata.
// Runs an event script asynchronously; takeover_ip_callback sends the reply.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); remaining parameters, return statements, result checks and
// some format strings are not visible.
201 take over an ip address
203 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
204 struct ctdb_req_control *c,
// scratch talloc context, freed on every visible exit path below
208 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
210 struct takeover_callback_state *state;
// indata.dptr is interpreted as a struct ctdb_public_ip; no size check on
// indata is visible in this listing.
211 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
212 struct ctdb_vnn *vnn;
213 bool have_ip, is_loopback;
216 /* update out vnn list */
217 vnn = find_public_ip_vnn(ctdb, pip->sin);
// Unknown address: log and release the scratch context.
219 DEBUG(0,("takeoverip called for an ip '%s' that is not a public address\n",
220 inet_ntoa(pip->sin.sin_addr)));
221 talloc_free(tmp_ctx);
226 /* if our kernel already has this IP, do nothing */
227 have_ip = ctdb_sys_have_ip(pip->sin, &is_loopback, tmp_ctx, &ifname);
228 /* if we have the ip and it is not set to a loopback address */
229 if (have_ip && !is_loopback) {
230 talloc_free(tmp_ctx);
234 state = talloc(ctdb, struct takeover_callback_state);
235 CTDB_NO_MEMORY(ctdb, state);
// NOTE(review): c and state->sin are parented to ctdb here, whereas
// ctdb_control_release_ip parents the same two members to state. Confirm
// which lifetime is intended; with ctdb as parent they are not freed
// together with state.
237 state->c = talloc_steal(ctdb, c);
238 state->sin = talloc(ctdb, struct sockaddr_in);
239 CTDB_NO_MEMORY(ctdb, state->sin);
240 *state->sin = pip->sin;
244 DEBUG(0,("Takeover of IP %s/%u on interface %s\n",
245 inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
// monitoring is restarted by takeover_ip_callback
248 ctdb_stop_monitoring(ctdb);
// Start the event script with script_timeout as its deadline; state is passed
// both as the third argument and as the callback private data.
250 ret = ctdb_event_script_callback(ctdb,
251 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
252 state, takeover_ip_callback, state,
255 inet_ntoa(pip->sin.sin_addr),
256 vnn->public_netmask_bits);
// Script could not be started: log and release the scratch context.
258 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
259 inet_ntoa(pip->sin.sin_addr), vnn->iface));
260 talloc_free(tmp_ctx);
265 /* tell ctdb_control.c that we will be replying asynchronously */
268 talloc_free(tmp_ctx);
// Walk ctdb->client_ip_list and send SIGKILL to every client process that
// registered the address given in the in parameter.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the remaining ctdb_reqid_find() arguments are not visible.
273 kill any clients that are registered with a IP that is being released
275 static void release_kill_clients(struct ctdb_context *ctdb, struct sockaddr_in in)
277 struct ctdb_client_ip *ip;
279 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
280 if (ctdb_same_ip(&ip->ip, &in)) {
281 struct ctdb_client *client = ctdb_reqid_find(ctdb,
// Only clients with a non-zero pid are signalled.
// NOTE(review): client is dereferenced here and no NULL check on the lookup
// result is visible in this listing - confirm the lookup cannot fail.
284 if (client->pid != 0) {
285 DEBUG(0,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
286 (unsigned)client->pid, inet_ntoa(in.sin_addr),
288 kill(client->pid, SIGKILL);
// Completion callback for the releaseip event script started by
// ctdb_control_release_ip. private_data is its takeover_callback_state.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the remaining parameters and the declaration of data are not
// visible. The status argument is not consulted on any visible line.
295 called when releaseip event finishes
297 static void release_ip_callback(struct ctdb_context *ctdb, int status,
300 struct takeover_callback_state *state =
301 talloc_get_type(private_data, struct takeover_callback_state);
// NOTE(review): inet_ntoa() returns a pointer to a static buffer; ip is used
// further down, so confirm ctdb_start_monitoring() does not call inet_ntoa().
302 char *ip = inet_ntoa(state->sin->sin_addr);
// monitoring was stopped by ctdb_control_release_ip before the script ran
305 ctdb_start_monitoring(ctdb);
307 /* send a message to all clients of this node telling them
308 that the cluster has been reconfigured and they should
309 release any sockets on this IP */
// The payload is the dotted-quad string including its terminating NUL.
310 data.dptr = (uint8_t *)ip;
311 data.dsize = strlen(ip)+1;
313 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
315 /* kill clients that have registered with this IP */
316 release_kill_clients(ctdb, *state->sin);
318 /* the control succeeded */
319 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
// Control handler: release the public address described by indata.
// Runs the releaseip event script asynchronously; release_ip_callback sends
// the reply.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); remaining parameters, return statements and result checks are
// not visible.
324 release an ip address
326 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
327 struct ctdb_req_control *c,
// scratch talloc context, freed on every visible exit path below
331 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
333 struct takeover_callback_state *state;
// indata.dptr is interpreted as a struct ctdb_public_ip; no size check on
// indata is visible in this listing.
334 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
335 struct ctdb_vnn *vnn;
336 bool have_ip, is_loopback;
339 /* update our vnn list */
340 vnn = find_public_ip_vnn(ctdb, pip->sin);
// Unknown address: log and release the scratch context.
342 DEBUG(0,("releaseip called for an ip '%s' that is not a public address\n",
343 inet_ntoa(pip->sin.sin_addr)));
344 talloc_free(tmp_ctx);
// Nothing to release when the address is not held or only sits on loopback.
349 have_ip = ctdb_sys_have_ip(pip->sin, &is_loopback, tmp_ctx, &ifname);
350 if ( (!have_ip) || is_loopback) {
351 DEBUG(0,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
352 inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
354 talloc_free(tmp_ctx);
358 DEBUG(0,("Release of IP %s/%u on interface %s\n",
359 inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
362 /* stop any previous arps */
// takeover_ctx owns the arp state and timers created by takeover_ip_callback
// and ctdb_control_send_arp.
363 talloc_free(vnn->takeover_ctx);
364 vnn->takeover_ctx = NULL;
366 state = talloc(ctdb, struct takeover_callback_state);
367 CTDB_NO_MEMORY(ctdb, state);
// c and the address copy are both parented to state here.
369 state->c = talloc_steal(state, c);
370 state->sin = talloc(state, struct sockaddr_in);
371 CTDB_NO_MEMORY(ctdb, state->sin);
372 *state->sin = pip->sin;
// monitoring is restarted by release_ip_callback
376 ctdb_stop_monitoring(ctdb);
// Start the event script with script_timeout as its deadline; state is passed
// both as the third argument and as the callback private data.
378 ret = ctdb_event_script_callback(ctdb,
379 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
380 state, release_ip_callback, state,
381 "releaseip %s %s %u",
383 inet_ntoa(pip->sin.sin_addr),
384 vnn->public_netmask_bits);
// Script could not be started: log and release the scratch context.
386 DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
387 inet_ntoa(pip->sin.sin_addr), vnn->iface));
388 talloc_free(tmp_ctx);
393 /* tell the control that we will be reply asynchronously */
396 talloc_free(tmp_ctx);
// Register one public address (addr, netmask bit count, interface name) by
// prepending a new ctdb_vnn to ctdb->vnn. ctdb_set_public_addresses treats a
// non-zero result as failure.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); return statements are not visible.
402 static int add_public_address(struct ctdb_context *ctdb, struct sockaddr_in addr, unsigned mask, const char *iface)
404 struct ctdb_vnn *vnn;
406 /* Verify that we dont have an entry for this ip yet */
407 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
408 if (ctdb_same_sockaddr(&addr, &vnn->public_address)) {
409 DEBUG(0,("Same ip '%s' specified multiple times in the public address list \n",
410 inet_ntoa(addr.sin_addr)));
415 /* create a new vnn structure for this ip address */
// zero-initialised and owned by ctdb
416 vnn = talloc_zero(ctdb, struct ctdb_vnn);
417 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
// The interface name is duplicated under vnn; no check of the talloc_strdup
// result is visible in this listing.
418 vnn->iface = talloc_strdup(vnn, iface);
419 vnn->public_address = addr;
420 vnn->public_netmask_bits = mask;
423 DLIST_ADD(ctdb->vnn, vnn);
// Store a private copy of script_dir (owned by ctdb) in ctdb->event_script_dir.
// CTDB_NO_MEMORY handles a failed duplication; the return statement is not
// visible in this listing.
430 setup the event script directory
432 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
434 ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
435 CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
// Load the public address file named by alist and register every entry via
// add_public_address(). Each line holds an address/mask token optionally
// followed by an interface token, separated by spaces or tabs.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); declarations, loop bodies and return statements are not all
// visible.
440 setup the public address lists from a file
442 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
448 lines = file_lines_load(alist, &nlines, ctdb);
450 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
// Trailing empty lines are skipped; the loop body is not visible here
// (presumably it decrements nlines - TODO confirm).
453 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
457 for (i=0;i<nlines;i++) {
459 struct sockaddr_in addr;
// strtok() writes NUL bytes into lines[i] and keeps hidden static state, so
// the two calls below must stay adjacent with no other strtok user between.
463 tok = strtok(lines[i], " \t");
464 if (!tok || !parse_ip_mask(tok, &addr, &mask)) {
465 DEBUG(0,("Badly formed line %u in public address list\n", i+1));
// second token: optional interface name
469 tok = strtok(NULL, " \t");
// With no interface on the line, the default public interface is required.
471 if (NULL == ctdb->default_public_interface) {
472 DEBUG(0,("No default public interface and no interface specified at line %u of public address list\n",
477 iface = ctdb->default_public_interface;
482 if (add_public_address(ctdb, addr, mask, iface)) {
483 DEBUG(0,("Failed to add line %u to the public address list\n", i+1));
// Singly linked list node for the merged, cluster-wide list of public
// addresses built by create_merged_ip_list().
// NOTE(review): this listing omits some original lines; the pnn member read
// and written elsewhere in this file is not visible here.
496 struct ctdb_public_ip_list {
497 struct ctdb_public_ip_list *next;
// the public address this entry describes
499 struct sockaddr_in sin;
// Walks the ips list and tests each entry for ips->pnn == pnn.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the pnn parameter, the counter and the return statement are not
// visible. Callers store the result as a per-node address count.
503 /* Given a physical node, return the number of
504 public addresses that is currently assigned to this node.
506 static int node_ip_coverage(struct ctdb_context *ctdb,
508 struct ctdb_public_ip_list *ips)
512 for (;ips;ips=ips->next) {
513 if (ips->pnn == pnn) {
// Looks for ip in the public address list recorded for node pnn
// (ctdb->nodes[pnn]->public_ips), comparing the IPv4 address only.
// NOTE(review): this listing omits the return statements. Both callers in
// this file treat a NON-zero result as "this node cannot serve the ip", so
// the result is not a plain boolean "can serve" - confirm before reusing.
521 /* Check if this is a public ip known to the node, i.e. can that
522 node takeover this ip ?
524 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn,
525 struct ctdb_public_ip_list *ip)
527 struct ctdb_all_public_ips *public_ips;
530 public_ips = ctdb->nodes[pnn]->public_ips;
// a node with no recorded public address list is handled separately
532 if (public_ips == NULL) {
536 for (i=0;i<public_ips->num;i++) {
537 if (ip->sin.sin_addr.s_addr == public_ips->ips[i].sin.sin_addr.s_addr) {
538 /* yes, this node can serve this public ip */
// Choose a node for ip among the nodes in nodemap. Nodes whose flags
// intersect mask are skipped, as are nodes for which can_node_serve_ip()
// returns non-zero. ctdb_takeover_run treats a non-zero result as failure.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the bookkeeping of the best candidate, the assignment to the ip
// entry and the return statements are not visible.
547 /* search the node lists list for a node to takeover this ip.
548 pick the node that currently are serving the least number of ips
549 so that the ips get spread out evenly.
551 static int find_takeover_node(struct ctdb_context *ctdb,
552 struct ctdb_node_map *nodemap, uint32_t mask,
553 struct ctdb_public_ip_list *ip,
554 struct ctdb_public_ip_list *all_ips)
560 for (i=0;i<nodemap->num;i++) {
561 if (nodemap->nodes[i].flags & mask) {
562 /* This node is not healty and can not be used to serve
568 /* verify that this node can serve this ip */
// non-zero from can_node_serve_ip() means the node cannot serve the ip
569 if (can_node_serve_ip(ctdb, i, ip)) {
570 /* no it couldnt so skip to the next node */
// number of addresses in all_ips currently assigned to node i
574 num = node_ip_coverage(ctdb, i, all_ips);
575 /* was this the first node we checked ? */
587 DEBUG(0,(__location__ " Could not find node to take over public address '%s'\n", inet_ntoa(ip->sin.sin_addr)));
// Add ip to the merged list ip_list unless an entry with the same IPv4
// address already exists. New entries are allocated on tmp_ctx and linked in
// front of the current head.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the tmp_ctx parameter and the return statements are not
// visible. create_merged_ip_list stores the result as the new list head.
595 struct ctdb_public_ip_list *
596 add_ip_to_merged_list(struct ctdb_context *ctdb,
598 struct ctdb_public_ip_list *ip_list,
599 struct ctdb_public_ip *ip)
601 struct ctdb_public_ip_list *tmp_ip;
603 /* do we already have this ip in our merged list ?*/
604 for (tmp_ip=ip_list;tmp_ip;tmp_ip=tmp_ip->next) {
606 /* we already have this public ip in the list */
607 if (tmp_ip->sin.sin_addr.s_addr == ip->sin.sin_addr.s_addr) {
612 /* this is a new public ip, we must add it to the list */
613 tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
614 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
// copy owner and address, then link in front of the existing head
615 tmp_ip->pnn = ip->pnn;
616 tmp_ip->sin = ip->sin;
617 tmp_ip->next = ip_list;
// Build one list of public addresses from the per-node lists
// ctdb->nodes[i]->public_ips, adding each address through
// add_ip_to_merged_list(). Entries are allocated on tmp_ctx.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); loop-variable declarations and the return statement are not
// visible.
622 struct ctdb_public_ip_list *
623 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
626 struct ctdb_public_ip_list *ip_list = NULL;
627 struct ctdb_all_public_ips *public_ips;
629 for (i=0;i<ctdb->num_nodes;i++) {
630 public_ips = ctdb->nodes[i]->public_ips;
632 /* there were no public ips for this node */
633 if (public_ips == NULL) {
637 for (j=0;j<public_ips->num;j++) {
// the helper result becomes the new list head
638 ip_list = add_ip_to_merged_list(ctdb, tmp_ctx,
639 ip_list, &public_ips->ips[j]);
// Recompute which node serves each public address and push the result to the
// cluster: pick the eligibility mask, build the merged address list, assign
// and rebalance owners, then send release and takeover controls.
// NOTE(review): this listing omits many original lines (gaps in the embedded
// numbering); counters, branch bodies, the retry logic and return statements
// are only partly visible, so the notes below describe visible lines only.
647 make any IP alias changes for public addresses that are necessary
649 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
651 int i, num_healthy, retries;
653 struct ctdb_public_ip ip;
655 struct ctdb_public_ip_list *all_ips, *tmp_ip;
656 int maxnode, maxnum=0, minnode, minnum=0, num;
// scratch context; the merged list lives on it and it is freed on the
// visible exit paths
657 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
662 /* Count how many completely healthy nodes we have */
664 for (i=0;i<nodemap->num;i++) {
665 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
// Phase 1: choose which node flags exclude a node from serving addresses.
670 if (num_healthy > 0) {
671 /* We have healthy nodes, so only consider them for
672 serving public addresses
674 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
676 /* We didnt have any completely healthy nodes so
677 use "disabled" nodes as a fallback
679 mask = NODE_FLAGS_INACTIVE;
682 /* since nodes only know about those public addresses that
683 can be served by that particular node, no single node has
684 a full list of all public addresses that exist in the cluster.
685 Walk over all node structures and create a merged list of
686 all public addresses that exist in the cluster.
// Phase 2: merged list of every known public address.
688 all_ips = create_merged_ip_list(ctdb, tmp_ctx);
691 /* mark all public addresses with a masked node as being served by
// Phase 3: entries whose current owner matches mask are examined here; the
// branch body is not visible in this listing.
694 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
695 if (tmp_ip->pnn == -1) {
698 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
704 /* now we must redistribute all public addresses with takeover node
705 -1 among the nodes available
709 /* loop over all ip's and find a physical node to cover for
// Phase 4: every entry with pnn == -1 is offered to find_takeover_node().
712 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
713 if (tmp_ip->pnn == -1) {
714 if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
715 DEBUG(0,("Failed to find node to cover ip %s\n", inet_ntoa(tmp_ip->sin.sin_addr)));
721 /* now, try to make sure the ip adresses are evenly distributed
723 for each ip address, loop over all nodes that can serve this
724 ip and make sure that the difference between the node
725 serving the most and the node serving the least ip's are not greater
// Phase 5: rebalancing pass over the merged list.
728 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
729 if (tmp_ip->pnn == -1) {
733 /* Get the highest and lowest number of ips's served by any
734 valid node which can serve this ip.
738 for (i=0;i<nodemap->num;i++) {
739 if (nodemap->nodes[i].flags & mask) {
743 /* only check nodes that can actually serve this ip */
// non-zero from can_node_serve_ip() means the node cannot serve the ip
744 if (can_node_serve_ip(ctdb, i, tmp_ip)) {
745 /* no it couldnt so skip to the next node */
749 num = node_ip_coverage(ctdb, i, all_ips);
770 DEBUG(0,(__location__ " Could not find maxnode. May not be able to server ip '%s'\n", inet_ntoa(tmp_ip->sin.sin_addr)));
774 /* if the spread between the smallest and largest coverage by
775 a node is >=2 we steal one of the ips from the node with
776 most coverage to even things out a bit.
777 try to do this at most 5 times since we dont want to spend
778 too much time balancing the ip coverage.
// The visible half of the condition is maxnum > minnum+1; the rest of it
// (presumably the retry limit - TODO confirm) is not visible.
780 if ( (maxnum > minnum+1)
782 struct ctdb_public_ip_list *tmp;
784 /* mark one of maxnode's vnn's as unassigned and try
787 for (tmp=all_ips;tmp;tmp=tmp->next) {
788 if (tmp->pnn == maxnode) {
799 /* at this point ->pnn is the node which will own each IP
800 or -1 if there is no node that can cover this ip
803 /* now tell all nodes to delete any alias that they should not
804 have. This will be a NOOP on nodes that don't currently
805 hold the given alias */
// Phase 6: each connected node is told to release every address it does not
// own in the new assignment.
806 for (i=0;i<nodemap->num;i++) {
807 /* don't talk to unconnected nodes, but do talk to banned nodes */
808 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
812 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
813 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
814 /* This node should be serving this
815 vnn so dont tell it to release the ip
// Only pnn, sin_family and sin_addr of the control payload are set on the
// visible lines.
819 ip.pnn = tmp_ip->pnn;
820 ip.sin.sin_family = AF_INET;
821 ip.sin.sin_addr = tmp_ip->sin.sin_addr;
823 ret = ctdb_ctrl_release_ip(ctdb, TAKEOVER_TIMEOUT(),
824 nodemap->nodes[i].pnn,
// A failed release control is logged and the scratch context is freed.
827 DEBUG(0,("Failed to tell vnn %u to release IP %s\n",
828 nodemap->nodes[i].pnn,
829 inet_ntoa(tmp_ip->sin.sin_addr)));
830 talloc_free(tmp_ctx);
837 /* tell all nodes to get their own IPs */
// Phase 7: a takeover control is sent for every address that has an owner.
838 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
839 if (tmp_ip->pnn == -1) {
840 /* this IP won't be taken over */
843 ip.pnn = tmp_ip->pnn;
844 ip.sin.sin_family = AF_INET;
845 ip.sin.sin_addr = tmp_ip->sin.sin_addr;
847 ret = ctdb_ctrl_takeover_ip(ctdb, TAKEOVER_TIMEOUT(),
// A failed takeover control is logged and the scratch context is freed.
851 DEBUG(0,("Failed asking vnn %u to take over IP %s\n",
853 inet_ntoa(tmp_ip->sin.sin_addr)));
854 talloc_free(tmp_ctx);
859 talloc_free(tmp_ctx);
// talloc destructor installed by ctdb_control_tcp_client: unlinks the entry
// from ctdb->client_ip_list when it is freed. The return statement is not
// visible in this listing.
865 destroy a ctdb_client_ip structure
867 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
869 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
// Control handler: a local client reports a tcp connection (indata is read as
// a struct ctdb_control_tcp with src and dest). The connection is recorded
// against the client and broadcast to connected nodes as CTDB_CONTROL_TCP_ADD.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); remaining parameters, result checks, the filling of t and the
// return statements are not visible.
874 called by a client to inform us of a TCP connection that it is managing
875 that should tickled with an ACK when IP takeover is done
877 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
// NOTE(review): client is later used as a talloc parent and dereferenced; no
// NULL check on this lookup is visible in this listing.
880 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
881 struct ctdb_control_tcp *p = (struct ctdb_control_tcp *)indata.dptr;
882 struct ctdb_tcp_list *tcp;
883 struct ctdb_control_tcp_vnn t;
886 struct ctdb_client_ip *ip;
887 struct ctdb_vnn *vnn;
// the destination must be one of our public addresses
889 vnn = find_public_ip_vnn(ctdb, p->dest);
891 DEBUG(3,("Could not add client IP %s. This is not a public address.\n", inet_ntoa(p->dest.sin_addr)));
// Record client-to-address usage. The entry is a talloc child of client and
// its destructor unlinks it from ctdb->client_ip_list.
895 ip = talloc(client, struct ctdb_client_ip);
896 CTDB_NO_MEMORY(ctdb, ip);
900 ip->client_id = client_id;
901 talloc_set_destructor(ip, ctdb_client_ip_destructor);
902 DLIST_ADD(ctdb->client_ip_list, ip);
// Record the connection itself on the per-client list.
904 tcp = talloc(client, struct ctdb_tcp_list);
905 CTDB_NO_MEMORY(ctdb, tcp);
907 tcp->connection.saddr = p->src;
908 tcp->connection.daddr = p->dest;
910 DLIST_ADD(client->tcp_list, tcp);
// t is the broadcast payload; the lines that fill it are not visible here.
915 data.dptr = (uint8_t *)&t;
916 data.dsize = sizeof(t);
918 DEBUG(2,("registered tcp client for %u->%s:%u\n",
919 (unsigned)ntohs(p->dest.sin_port),
920 inet_ntoa(p->src.sin_addr),
921 (unsigned)ntohs(p->src.sin_port)));
923 /* tell all nodes about this tcp connection */
// sent with CTDB_CTRL_FLAG_NOREPLY, so no reply is requested
924 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
925 CTDB_CONTROL_TCP_ADD,
926 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
928 DEBUG(0,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
/*
  see if two sockaddr_in are the same

  Two addresses are considered equal when address family, port and IPv4
  address all match. Only these three members are compared, so any other
  bytes of the structures never influence the result.

  in1, in2: the addresses to compare; both must be non-NULL.
  Returns true when all three members are equal, false otherwise.
 */
static bool same_sockaddr_in(struct sockaddr_in *in1, struct sockaddr_in *in2)
{
	return in1->sin_family == in2->sin_family &&
		in1->sin_port == in2->sin_port &&
		in1->sin_addr.s_addr == in2->sin_addr.s_addr;
}
// Linear search of array for a connection whose saddr and daddr both match
// tcp according to same_sockaddr_in(). A match is returned as a pointer into
// array->connections, so it is invalidated when that array is reallocated
// or freed.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); any handling of a NULL array and the not-found return are not
// visible.
946 find a tcp address on a list
948 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
949 struct ctdb_tcp_connection *tcp)
957 for (i=0;i<array->num;i++) {
958 if (same_sockaddr_in(&array->connections[i].saddr, &tcp->saddr) &&
959 same_sockaddr_in(&array->connections[i].daddr, &tcp->daddr)) {
960 return &array->connections[i];
// Control handler: another daemon reports a tcp connection (indata is read as
// a struct ctdb_control_tcp_vnn with src and dest). The connection is added
// to the tickle array of the vnn that owns the destination address, unless
// it is already present.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); result checks, updates of tcparray->num, the new size passed
// to talloc_realloc and the return statements are not visible.
967 called by a daemon to inform us of a TCP connection that one of its
968 clients managing that should tickled with an ACK when IP takeover is
971 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
973 struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
974 struct ctdb_tcp_array *tcparray;
975 struct ctdb_tcp_connection tcp;
976 struct ctdb_vnn *vnn;
978 vnn = find_public_ip_vnn(ctdb, p->dest);
980 DEBUG(0,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
981 inet_ntoa(p->dest.sin_addr)));
986 tcparray = vnn->tcp_array;
988 /* If this is the first tickle */
989 if (tcparray == NULL) {
// The array header is allocated as a talloc child of ctdb->nodes.
990 tcparray = talloc_size(ctdb->nodes,
991 offsetof(struct ctdb_tcp_array, connections) +
992 sizeof(struct ctdb_tcp_connection) * 1);
993 CTDB_NO_MEMORY(ctdb, tcparray);
994 vnn->tcp_array = tcparray;
// connections is a separate allocation owned by the array header
997 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
998 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1000 tcparray->connections[tcparray->num].saddr = p->src;
1001 tcparray->connections[tcparray->num].daddr = p->dest;
1007 /* Do we already have this tickle ?*/
1009 tcp.daddr = p->dest;
1010 if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1011 DEBUG(4,("Already had tickle info for %s:%u for vnn:%u\n",
1012 inet_ntoa(tcp.daddr.sin_addr),
1013 ntohs(tcp.daddr.sin_port),
1018 /* A new tickle, we must add it to the array */
// NOTE(review): the talloc_realloc result is stored straight into
// tcparray->connections; if it can return NULL while leaving the old block
// allocated, the old pointer is lost here - confirm against the talloc docs.
1019 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1020 struct ctdb_tcp_connection,
1022 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1024 vnn->tcp_array = tcparray;
1025 tcparray->connections[tcparray->num].saddr = p->src;
1026 tcparray->connections[tcparray->num].daddr = p->dest;
1029 DEBUG(2,("Added tickle info for %s:%u from vnn %u\n",
1030 inet_ntoa(tcp.daddr.sin_addr),
1031 ntohs(tcp.daddr.sin_port),
// Remove conn from the tickle array of the vnn that owns conn->daddr.
// Called from ctdb_takeover_client_destructor_hook for every connection a
// departing client had registered.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the NULL tests and return statements are not visible. The
// descriptive text below is identical to the one above ctdb_control_tcp_add
// and does not describe a removal.
1039 called by a daemon to inform us of a TCP connection that one of its
1040 clients managing that should tickled with an ACK when IP takeover is
1043 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1045 struct ctdb_tcp_connection *tcpp;
1046 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, conn->daddr);
1049 DEBUG(0,(__location__ " unable to find public address %s\n", inet_ntoa(conn->daddr.sin_addr)));
1053 /* if the array is empty we cant remove it
1054 and we dont need to do anything
1056 if (vnn->tcp_array == NULL) {
1057 DEBUG(2,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1058 inet_ntoa(conn->daddr.sin_addr),
1059 ntohs(conn->daddr.sin_port)));
1064 /* See if we know this connection
1065 if we dont know this connection then we dont need to do anything
1067 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1069 DEBUG(2,("Trying to remove tickle that doesnt exist %s:%u\n",
1070 inet_ntoa(conn->daddr.sin_addr),
1071 ntohs(conn->daddr.sin_port)));
1076 /* We need to remove this entry from the array.
1077 Instead of allocating a new array and copying data to it
1078 we cheat and just copy the last entry in the existing array
1079 to the entry that is to be removed and just shring the
// Overwrite the removed slot with the last element, then drop the last
// element. Element order is not preserved.
1082 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1083 vnn->tcp_array->num--;
1085 /* If we deleted the last entry we also need to remove the entire array
1087 if (vnn->tcp_array->num == 0) {
1088 talloc_free(vnn->tcp_array);
1089 vnn->tcp_array = NULL;
1092 vnn->tcp_update_needed = true;
// NOTE(review): this message prints conn->saddr while the messages above
// print conn->daddr - confirm which endpoint is meant to be logged.
1094 DEBUG(2,("Removed tickle info for %s:%u\n",
1095 inet_ntoa(conn->saddr.sin_addr),
1096 ntohs(conn->saddr.sin_port)));
// Control handler for a node announcing that it has started. Per the XXX
// note below, sending the tickle lists to that node is not implemented yet.
// NOTE(review): only the signature and the XXX note are visible in this
// listing; the rest of the body is omitted.
1101 called when a daemon restarts - send all tickes for all public addresses
1102 we are serving immediately to the new node.
1104 int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
1106 /*XXX here we should send all tickes we are serving to the new node */
// Drain client->tcp_list: each entry is unlinked from the list and its
// connection is removed from the owning vnn tickle array via
// ctdb_remove_tcp_connection().
// NOTE(review): no explicit free of the list entry is visible in this
// listing; entries are talloc children of client (see
// ctdb_control_tcp_client), so presumably they go away with it - TODO confirm.
1112 called when a client structure goes away - hook to remove
1113 elements from the tcp_list in all daemons
1115 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1117 while (client->tcp_list) {
1118 struct ctdb_tcp_list *tcp = client->tcp_list;
1119 DLIST_REMOVE(client->tcp_list, tcp);
1120 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
// For every configured public address that this host currently holds on a
// non-loopback interface, run the releaseip event script through
// ctdb_event_script() and then kill the clients registered for that address.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the first script argument is not visible.
1126 release all IPs on shutdown
1128 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1130 struct ctdb_vnn *vnn;
// scratch context handed to ctdb_sys_have_ip(), freed at the end
1131 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1132 bool have_ip, is_loopback;
1133 char *ifname = NULL;
1135 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1136 have_ip = ctdb_sys_have_ip(vnn->public_address, &is_loopback, tmp_ctx, &ifname);
1137 if (have_ip && !is_loopback) {
1138 ctdb_event_script(ctdb, "releaseip %s %s %u",
1140 inet_ntoa(vnn->public_address.sin_addr),
1141 vnn->public_netmask_bits);
1142 release_kill_clients(ctdb, vnn->public_address);
1145 talloc_free(tmp_ctx);
// Control handler: return every entry of ctdb->vnn as a
// struct ctdb_all_public_ips blob in outdata. The blob is zero-initialised
// and allocated as a talloc child of outdata.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the counters, the assignment of ips->num and the return
// statement are not visible.
1150 get list of public IPs
1152 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
1153 struct ctdb_req_control *c, TDB_DATA *outdata)
1156 struct ctdb_all_public_ips *ips;
1157 struct ctdb_vnn *vnn;
1159 /* count how many public ip structures we have */
1161 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
// header up to the ips member plus one ctdb_public_ip per counted vnn
1165 len = offsetof(struct ctdb_all_public_ips, ips) +
1166 num*sizeof(struct ctdb_public_ip);
1167 ips = talloc_zero_size(outdata, len);
1168 CTDB_NO_MEMORY(ctdb, ips);
1170 outdata->dsize = len;
1171 outdata->dptr = (uint8_t *)ips;
// second pass: copy owner and address of each vnn into the blob
1175 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1176 ips->ips[i].pnn = vnn->pnn;
1177 ips->ips[i].sin = vnn->public_address;
// Per-vnn state for resetting tcp connections, created on demand by
// ctdb_killtcp_add_connection and stored in vnn->killtcp.
// NOTE(review): this listing omits some original lines; members used
// elsewhere in this file (capture_fd, sending_fd, private_data) are not
// visible here.
1187 structure containing the listening socket and the list of tcp connections
1188 that the ctdb daemon is to kill
1190 struct ctdb_kill_tcp {
// back pointer; ctdb_killtcp_destructor clears vnn->killtcp through it
1191 struct ctdb_vnn *vnn;
1192 struct ctdb_context *ctdb;
// fd event that feeds capture_tcp_handler
1195 struct fd_event *fde;
// tree of struct ctdb_killtcp_con, keyed by killtcp_key()
1196 trbt_tree_t *connections;
// One connection queued for a reset; stored in ctdb_kill_tcp.connections.
// NOTE(review): this listing omits some original lines; the count member
// tested in tickle_connection_traverse is not visible here.
1201 a tcp connection that is to be killed
1203 struct ctdb_killtcp_con {
// the two endpoints of the connection
1204 struct sockaddr_in src;
1205 struct sockaddr_in dst;
// owning killtcp state
1207 struct ctdb_kill_tcp *killtcp;
/* this function is used to create a key to represent this socketpair
   in the killtcp tree.
   this key is used to insert and lookup matching socketpairs that are
   to be tickled and RST

   Key layout (KILLTCP_KEYLEN 32-bit words):
     key[0] = dst address, key[1] = src address,
     key[2] = dst port,    key[3] = src port
   All values are copied exactly as stored in the sockaddr_in structures.

   The result points at a static buffer that is overwritten by every call,
   so the key must be consumed (passed to the tree function) before
   killtcp_key() is called again. The function is not reentrant.
*/
#define KILLTCP_KEYLEN	4
static uint32_t *killtcp_key(struct sockaddr_in *src, struct sockaddr_in *dst)
{
	static uint32_t key[KILLTCP_KEYLEN];

	key[0]	= dst->sin_addr.s_addr;
	key[1]	= src->sin_addr.s_addr;
	key[2]	= dst->sin_port;
	key[3]	= src->sin_port;

	return key;
}
// fd-event handler for the capture socket. Reads one packet, looks its
// address pair up in killtcp->connections and, on a match, sends a tcp
// segment built from the captured sequence numbers with the last
// ctdb_sys_send_tcp() argument set to 1. The other calls in this file pass 0
// there for plain tickles; presumably 1 requests a RST - TODO confirm.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the src/dst arguments of the read call, the early returns and
// the removal of the entry from the tree are not visible.
1229 called when we get a read event on the raw socket
1231 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
1232 uint16_t flags, void *private_data)
1234 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1235 struct ctdb_killtcp_con *con;
1236 struct sockaddr_in src, dst;
1237 uint32_t ack_seq, seq;
// only read events are of interest
1239 if (!(flags & EVENT_FD_READ)) {
1243 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1244 killtcp->private_data,
1246 &ack_seq, &seq) != 0) {
1247 /* probably a non-tcp ACK packet */
1251 /* check if we have this guy in our list of connections
// The key is built from the captured packet as (src, dst), while
// ctdb_killtcp_add_connection inserts with (con->dst, con->src).
1254 con = trbt_lookuparray32(killtcp->connections,
1255 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1257 /* no this was some other packet we can just ignore */
1261 /* This one has been tickled !
1262 now reset him and remove him from the list.
1264 DEBUG(1, ("sending a tcp reset to kill connection :%d -> %s:%d\n", ntohs(con->dst.sin_port), inet_ntoa(con->src.sin_addr), ntohs(con->src.sin_port)));
1266 ctdb_sys_send_tcp(killtcp->sending_fd, &con->dst,
1267 &con->src, ack_seq, seq, 1);
// Tree-traversal callback used by ctdb_tickle_sentenced_connections.
// param is the struct ctdb_kill_tcp, data is one struct ctdb_killtcp_con.
// Connections whose count has reached 5 are given up on; all others get
// another tickle.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the give-up branch body and any update of con->count are not
// visible.
1272 /* when traversing the list of all tcp connections to send tickle acks to
1273 (so that we can capture the ack coming back and kill the connection
1275 this callback is called for each connection we are currently trying to kill
1277 static void tickle_connection_traverse(void *param, void *data)
1279 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1280 struct ctdb_kill_tcp *killtcp = talloc_get_type(param, struct ctdb_kill_tcp);
1282 /* have tried too many times, just give up */
1283 if (con->count >= 5) {
1288 /* othervise, try tickling it again */
1290 ctdb_sys_send_tcp(killtcp->sending_fd, &con->dst, &con->src, 0, 0, 0);
// Timed-event handler: tickles every connection still in
// killtcp->connections, frees the whole killtcp state once the tree is empty,
// and otherwise re-arms itself one second later.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the return after talloc_free(killtcp) is not visible. The
// re-arm below dereferences killtcp, so it must not run after the free.
1295 called every second until all sentenced connections have been reset
1297 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
1298 struct timeval t, void *private_data)
1300 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1303 /* loop over all connections sending tickle ACKs */
1304 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, killtcp);
1307 /* If there are no more connections to kill we can remove the
1308 entire killtcp structure
1310 if ( (killtcp->connections == NULL) ||
1311 (killtcp->connections->root == NULL) ) {
// the talloc destructor ctdb_killtcp_destructor runs as part of this free
1312 talloc_free(killtcp);
1316 /* try tickling them again in a seconds time
// The event is owned by killtcp, so it goes away with the structure.
1318 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
1319 ctdb_tickle_sentenced_connections, killtcp);
// talloc destructor for struct ctdb_kill_tcp: closes the sending socket if
// one is open and clears the back reference in the vnn.
// The capture fd is not closed here; its fd event is registered with
// EVENT_FD_AUTOCLOSE in ctdb_killtcp_add_connection. The return statement is
// not visible in this listing.
1323 destroy the killtcp structure
1325 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1327 if (killtcp->sending_fd != -1) {
1328 close(killtcp->sending_fd);
1329 killtcp->sending_fd = -1;
1331 killtcp->vnn->killtcp = NULL;
// Callback passed to trbt_insertarray32_callback() by
// ctdb_killtcp_add_connection, with the new connection as the callback
// parameter.
// NOTE(review): only the signature is visible in this listing; per the text
// below the new entry replaces any existing one - confirm against the body.
1336 /* nothing fancy here, just unconditionally replace any existing
1337 connection structure with the new one.
1339 dont even free the old one if it did exist, that one is talloc_stolen
1340 by the same node in the tree anyway and will be deleted when the new data
1343 static void *add_killtcp_callback(void *parm, void *data)
// Queue the connection src -> dst for a reset. Creates the per-vnn killtcp
// state, its sockets, fd event and one-second timer on first use, inserts
// the connection into the tree and sends a first tickle.
// NOTE(review): this listing omits some original lines (gaps in the embedded
// numbering); the NULL tests on vnn, the filling of con->src / con->dst /
// con->count, the failure label and the return statements are not visible.
1349 add a tcp socket to the list of connections we want to RST
1351 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
1352 struct sockaddr_in *src, struct sockaddr_in *dst)
1354 struct ctdb_kill_tcp *killtcp;
1355 struct ctdb_killtcp_con *con;
1356 struct ctdb_vnn *vnn;
// The public address may be either end of the connection: dst is tried
// first, then src (the condition between the two lookups is not visible).
1358 vnn = find_public_ip_vnn(ctdb, *dst);
1360 vnn = find_public_ip_vnn(ctdb, *src);
1363 DEBUG(0,(__location__ " Could not killtcp, not a public address\n"));
1367 killtcp = vnn->killtcp;
1369 /* If this is the first connection to kill we must allocate
1372 if (killtcp == NULL) {
1373 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
1374 CTDB_NO_MEMORY(ctdb, killtcp);
1377 killtcp->ctdb = ctdb;
// -1 marks both sockets as not yet opened
1378 killtcp->capture_fd = -1;
1379 killtcp->sending_fd = -1;
1380 killtcp->connections = trbt_create(killtcp, 0);
1382 vnn->killtcp = killtcp;
1383 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
1388 /* create a structure that describes this connection we want to
1389 RST and store it in killtcp->connections
// con is a talloc child of killtcp
1391 con = talloc(killtcp, struct ctdb_killtcp_con);
1392 CTDB_NO_MEMORY(ctdb, con);
1396 con->killtcp = killtcp;
// NOTE(review): the key is built as (con->dst, con->src) here, whereas
// capture_tcp_handler looks up with (src, dst) of the captured packet;
// presumably because the captured reply travels in the opposite direction.
// Keep the two call sites in sync.
1399 trbt_insertarray32_callback(killtcp->connections,
1400 KILLTCP_KEYLEN, killtcp_key(&con->dst, &con->src),
1401 add_killtcp_callback, con);
1404 If we dont have a socket to send from yet we must create it
1406 if (killtcp->sending_fd == -1) {
1407 killtcp->sending_fd = ctdb_sys_open_sending_socket();
1408 if (killtcp->sending_fd == -1) {
1409 DEBUG(0,(__location__ " Failed to open sending socket for killtcp\n"));
1415 If we dont have a socket to listen on yet we must create it
1417 if (killtcp->capture_fd == -1) {
1418 killtcp->capture_fd = ctdb_sys_open_capture_socket(vnn->iface, &killtcp->private_data);
1419 if (killtcp->capture_fd == -1) {
1420 DEBUG(0,(__location__ " Failed to open capturing socket for killtcp\n"));
// First use: hook the capture socket into the event loop and start the
// periodic tickle timer. Both events are owned by killtcp.
1426 if (killtcp->fde == NULL) {
1427 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
1428 EVENT_FD_READ | EVENT_FD_AUTOCLOSE,
1429 capture_tcp_handler, killtcp);
1431 /* We also need to set up some events to tickle all these connections
1432 until they are all reset
1434 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
1435 ctdb_tickle_sentenced_connections, killtcp);
1438 /* tickle him once now */
1439 ctdb_sys_send_tcp(killtcp->sending_fd, &con->dst, &con->src, 0, 0, 0);
// Tear down the whole killtcp state; presumably this is the failure path
// reached from the socket errors above - TODO confirm.
1444 talloc_free(vnn->killtcp);
1445 vnn->killtcp = NULL;
1450 kill a TCP connection.
1452 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
1454 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
1456 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
1460 called by a daemon to inform us of the entire list of TCP tickles for
1461 a particular public address.
1462 this control should only be sent by the node that is currently serving
1463 that public address.
1465 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
1467 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
1468 struct ctdb_tcp_array *tcparray;
1469 struct ctdb_vnn *vnn;
1471 /* We must at least have tickles.num or else we cant verify the size
1472 of the received data blob
1474 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
1475 tickles.connections)) {
1476 DEBUG(0,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
1480 /* verify that the size of data matches what we expect */
1481 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
1482 tickles.connections)
1483 + sizeof(struct ctdb_tcp_connection)
1484 * list->tickles.num) {
1485 DEBUG(0,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
1489 vnn = find_public_ip_vnn(ctdb, list->ip);
1491 DEBUG(0,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
1492 inet_ntoa(list->ip.sin_addr)));
1496 /* remove any old ticklelist we might have */
1497 talloc_free(vnn->tcp_array);
1498 vnn->tcp_array = NULL;
1500 tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
1501 CTDB_NO_MEMORY(ctdb, tcparray);
1503 tcparray->num = list->tickles.num;
1505 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
1506 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1508 memcpy(tcparray->connections, &list->tickles.connections[0],
1509 sizeof(struct ctdb_tcp_connection)*tcparray->num);
1511 /* We now have a new fresh tickle list array for this vnn */
1512 vnn->tcp_array = talloc_steal(vnn, tcparray);
1518 called to return the full list of tickles for the puclic address associated
1519 with the provided vnn
1521 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1523 struct sockaddr_in *ip = (struct sockaddr_in *)indata.dptr;
1524 struct ctdb_control_tcp_tickle_list *list;
1525 struct ctdb_tcp_array *tcparray;
1527 struct ctdb_vnn *vnn;
1529 vnn = find_public_ip_vnn(ctdb, *ip);
1531 DEBUG(0,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
1532 inet_ntoa(ip->sin_addr)));
1536 tcparray = vnn->tcp_array;
1538 num = tcparray->num;
1543 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
1544 tickles.connections)
1545 + sizeof(struct ctdb_tcp_connection) * num;
1547 outdata->dptr = talloc_size(outdata, outdata->dsize);
1548 CTDB_NO_MEMORY(ctdb, outdata->dptr);
1549 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
1552 list->tickles.num = num;
1554 memcpy(&list->tickles.connections[0], tcparray->connections,
1555 sizeof(struct ctdb_tcp_connection) * num);
1563 set the list of all tcp tickles for a public address
1565 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
1566 struct timeval timeout, uint32_t destnode,
1567 struct sockaddr_in *ip,
1568 struct ctdb_tcp_array *tcparray)
1572 struct ctdb_control_tcp_tickle_list *list;
1575 num = tcparray->num;
1580 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
1581 tickles.connections) +
1582 sizeof(struct ctdb_tcp_connection) * num;
1583 data.dptr = talloc_size(ctdb, data.dsize);
1584 CTDB_NO_MEMORY(ctdb, data.dptr);
1586 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
1588 list->tickles.num = num;
1590 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
1593 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
1594 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
1595 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1597 DEBUG(0,(__location__ " ctdb_control for set tcp tickles failed\n"));
1601 talloc_free(data.dptr);
1608 perform tickle updates if required
1610 static void ctdb_update_tcp_tickles(struct event_context *ev,
1611 struct timed_event *te,
1612 struct timeval t, void *private_data)
1614 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1616 struct ctdb_vnn *vnn;
1618 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1619 /* we only send out updates for public addresses that
1622 if (ctdb->pnn != vnn->pnn) {
1625 /* We only send out the updates if we need to */
1626 if (!vnn->tcp_update_needed) {
1629 ret = ctdb_ctrl_set_tcp_tickles(ctdb,
1631 CTDB_BROADCAST_CONNECTED,
1632 &vnn->public_address,
1635 DEBUG(0,("Failed to send the tickle update for public address %s\n",
1636 inet_ntoa(vnn->public_address.sin_addr)));
1640 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
1641 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
1642 ctdb_update_tcp_tickles, ctdb);
1647 start periodic update of tcp tickles
1649 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
1651 ctdb->tickle_update_context = talloc_new(ctdb);
1653 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
1654 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
1655 ctdb_update_tcp_tickles, ctdb);