4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "lib/util/dlinklist.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "../common/rb_tree.h"
31 #define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
33 #define CTDB_ARP_INTERVAL 1
34 #define CTDB_ARP_REPEAT 3
/*
  state carried between invocations of ctdb_control_send_arp() after an
  IP takeover
 */
struct ctdb_takeover_arp {
	struct ctdb_context *ctdb;
	/* compared against CTDB_ARP_REPEAT by ctdb_control_send_arp().
	   NOTE(review): width inferred from usage - confirm */
	uint32_t count;
	/* address handed to ctdb_sys_send_arp() */
	struct sockaddr_in sin;
	/* connections that get a tickle ack on every pass */
	struct ctdb_tcp_array *tcparray;
	/* vnn whose iface is used and whose takeover_ctx owns the timer */
	struct ctdb_vnn *vnn;
};
46 lists of tcp endpoints
48 struct ctdb_tcp_list {
49 struct ctdb_tcp_list *prev, *next;
50 struct ctdb_tcp_connection connection;
/*
  list of clients to kill on IP release
 */
struct ctdb_client_ip {
	struct ctdb_client_ip *prev, *next;
	struct ctdb_context *ctdb;
	/* address the client registered against */
	struct sockaddr_in ip;
	/* reqid used to look the owning ctdb_client up again */
	uint32_t client_id;
};
67 static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te,
68 struct timeval t, void *private_data)
70 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
71 struct ctdb_takeover_arp);
73 struct ctdb_tcp_array *tcparray;
76 ret = ctdb_sys_send_arp(&arp->sin, arp->vnn->iface);
78 DEBUG(0,(__location__ " sending of arp failed (%s)\n", strerror(errno)));
81 s = ctdb_sys_open_sending_socket();
83 DEBUG(0,(__location__ " failed to open raw socket for sending tickles\n"));
87 tcparray = arp->tcparray;
89 for (i=0;i<tcparray->num;i++) {
90 DEBUG(2,("sending tcp tickle ack for %u->%s:%u\n",
91 (unsigned)ntohs(tcparray->connections[i].daddr.sin_port),
92 inet_ntoa(tcparray->connections[i].saddr.sin_addr),
93 (unsigned)ntohs(tcparray->connections[i].saddr.sin_port)));
94 ret = ctdb_sys_send_tcp(s, &tcparray->connections[i].saddr,
95 &tcparray->connections[i].daddr, 0, 0, 0);
97 DEBUG(0,(__location__ " Failed to send tcp tickle ack for %s\n",
98 inet_ntoa(tcparray->connections[i].saddr.sin_addr)));
106 if (arp->count == CTDB_ARP_REPEAT) {
111 event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx,
112 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
113 ctdb_control_send_arp, arp);
/*
  state handed to the takeip/releaseip event script callbacks
 */
struct takeover_callback_state {
	/* the control request that is answered once the script finishes */
	struct ctdb_req_control *c;
	/* the public address being taken over or released */
	struct sockaddr_in *sin;
	struct ctdb_vnn *vnn;
};
123 called when takeip event finishes
125 static void takeover_ip_callback(struct ctdb_context *ctdb, int status,
128 struct takeover_callback_state *state =
129 talloc_get_type(private_data, struct takeover_callback_state);
130 struct ctdb_takeover_arp *arp;
131 char *ip = inet_ntoa(state->sin->sin_addr);
132 struct ctdb_tcp_array *tcparray;
134 ctdb_start_monitoring(ctdb);
137 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
138 ip, state->vnn->iface));
139 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
144 if (!state->vnn->takeover_ctx) {
145 state->vnn->takeover_ctx = talloc_new(ctdb);
146 if (!state->vnn->takeover_ctx) {
151 arp = talloc_zero(state->vnn->takeover_ctx, struct ctdb_takeover_arp);
152 if (!arp) goto failed;
155 arp->sin = *state->sin;
156 arp->vnn = state->vnn;
158 tcparray = state->vnn->tcp_array;
160 /* add all of the known tcp connections for this IP to the
161 list of tcp connections to send tickle acks for */
162 arp->tcparray = talloc_steal(arp, tcparray);
164 state->vnn->tcp_array = NULL;
165 state->vnn->tcp_update_needed = true;
168 event_add_timed(arp->ctdb->ev, state->vnn->takeover_ctx,
169 timeval_zero(), ctdb_control_send_arp, arp);
171 /* the control succeeded */
172 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
177 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
183 Find the vnn of the node that has a public ip address
184 returns -1 if the address is not known as a public address
186 static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, struct sockaddr_in ip)
188 struct ctdb_vnn *vnn;
190 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
191 if (ctdb_same_ip(&vnn->public_address, &ip)) {
201 take over an ip address
203 int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
204 struct ctdb_req_control *c,
209 struct takeover_callback_state *state;
210 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
211 struct ctdb_vnn *vnn;
213 /* update our vnn list */
214 vnn = find_public_ip_vnn(ctdb, pip->sin);
216 DEBUG(0,("takeoverip called for an ip '%s' that is not a public address\n",
217 inet_ntoa(pip->sin.sin_addr)));
222 /* if our kernel already has this IP, do nothing */
223 if (ctdb_sys_have_ip(pip->sin)) {
227 state = talloc(ctdb, struct takeover_callback_state);
228 CTDB_NO_MEMORY(ctdb, state);
230 state->c = talloc_steal(ctdb, c);
231 state->sin = talloc(ctdb, struct sockaddr_in);
232 CTDB_NO_MEMORY(ctdb, state->sin);
233 *state->sin = pip->sin;
237 DEBUG(0,("Takeover of IP %s/%u on interface %s\n",
238 inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
241 ctdb_stop_monitoring(ctdb);
243 ret = ctdb_event_script_callback(ctdb,
244 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
245 state, takeover_ip_callback, state,
248 inet_ntoa(pip->sin.sin_addr),
249 vnn->public_netmask_bits);
251 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
252 inet_ntoa(pip->sin.sin_addr), vnn->iface));
257 /* tell ctdb_control.c that we will be replying asynchronously */
264 kill any clients that are registered with a IP that is being released
266 static void release_kill_clients(struct ctdb_context *ctdb, struct sockaddr_in in)
268 struct ctdb_client_ip *ip;
270 DEBUG(1,("release_kill_clients for ip %s\n", inet_ntoa(in.sin_addr)));
272 for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
273 DEBUG(2,("checking for client %u with IP %s\n",
274 ip->client_id, inet_ntoa(ip->ip.sin_addr)));
275 if (ctdb_same_ip(&ip->ip, &in)) {
276 struct ctdb_client *client = ctdb_reqid_find(ctdb,
279 DEBUG(1,("matched client %u with IP %s and pid %u\n",
280 ip->client_id, inet_ntoa(ip->ip.sin_addr), client->pid));
281 if (client->pid != 0) {
282 DEBUG(0,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
283 (unsigned)client->pid, inet_ntoa(in.sin_addr),
285 kill(client->pid, SIGKILL);
292 called when releaseip event finishes
294 static void release_ip_callback(struct ctdb_context *ctdb, int status,
297 struct takeover_callback_state *state =
298 talloc_get_type(private_data, struct takeover_callback_state);
299 char *ip = inet_ntoa(state->sin->sin_addr);
302 ctdb_start_monitoring(ctdb);
304 /* send a message to all clients of this node telling them
305 that the cluster has been reconfigured and they should
306 release any sockets on this IP */
307 data.dptr = (uint8_t *)ip;
308 data.dsize = strlen(ip)+1;
310 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
312 /* kill clients that have registered with this IP */
313 release_kill_clients(ctdb, *state->sin);
315 /* the control succeeded */
316 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
321 release an ip address
323 int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
324 struct ctdb_req_control *c,
329 struct takeover_callback_state *state;
330 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
331 struct ctdb_vnn *vnn;
333 /* update our vnn list */
334 vnn = find_public_ip_vnn(ctdb, pip->sin);
336 DEBUG(0,("releaseip called for an ip '%s' that is not a public address\n",
337 inet_ntoa(pip->sin.sin_addr)));
342 /* stop any previous arps */
343 talloc_free(vnn->takeover_ctx);
344 vnn->takeover_ctx = NULL;
346 if (!ctdb_sys_have_ip(pip->sin)) {
347 DEBUG(2,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
348 inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
353 DEBUG(0,("Release of IP %s/%u on interface %s\n",
354 inet_ntoa(pip->sin.sin_addr), vnn->public_netmask_bits,
357 state = talloc(ctdb, struct takeover_callback_state);
358 CTDB_NO_MEMORY(ctdb, state);
360 state->c = talloc_steal(state, c);
361 state->sin = talloc(state, struct sockaddr_in);
362 CTDB_NO_MEMORY(ctdb, state->sin);
363 *state->sin = pip->sin;
367 ctdb_stop_monitoring(ctdb);
369 ret = ctdb_event_script_callback(ctdb,
370 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
371 state, release_ip_callback, state,
372 "releaseip %s %s %u",
374 inet_ntoa(pip->sin.sin_addr),
375 vnn->public_netmask_bits);
377 DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
378 inet_ntoa(pip->sin.sin_addr), vnn->iface));
383 /* tell the control that we will be replying asynchronously */
390 static int add_public_address(struct ctdb_context *ctdb, struct sockaddr_in addr, unsigned mask, const char *iface)
392 struct ctdb_vnn *vnn;
394 /* Verify that we dont have an entry for this ip yet */
395 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
396 if (ctdb_same_sockaddr(&addr, &vnn->public_address)) {
397 DEBUG(0,("Same ip '%s' specified multiple times in the public address list \n",
398 inet_ntoa(addr.sin_addr)));
403 /* create a new vnn structure for this ip address */
404 vnn = talloc_zero(ctdb, struct ctdb_vnn);
405 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
406 vnn->iface = talloc_strdup(vnn, iface);
407 vnn->public_address = addr;
408 vnn->public_netmask_bits = mask;
411 DLIST_ADD(ctdb->vnn, vnn);
418 setup the event script directory
420 int ctdb_set_event_script_dir(struct ctdb_context *ctdb, const char *script_dir)
422 ctdb->event_script_dir = talloc_strdup(ctdb, script_dir);
423 CTDB_NO_MEMORY(ctdb, ctdb->event_script_dir);
428 setup the public address lists from a file
430 int ctdb_set_public_addresses(struct ctdb_context *ctdb, const char *alist)
436 lines = file_lines_load(alist, &nlines, ctdb);
438 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", alist);
441 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
445 for (i=0;i<nlines;i++) {
447 struct sockaddr_in addr;
451 tok = strtok(lines[i], " \t");
452 if (!tok || !parse_ip_mask(tok, &addr, &mask)) {
453 DEBUG(0,("Badly formed line %u in public address list\n", i+1));
457 tok = strtok(NULL, " \t");
459 if (NULL == ctdb->default_public_interface) {
460 DEBUG(0,("No default public interface and no interface specified at line %u of public address list\n",
465 iface = ctdb->default_public_interface;
470 if (add_public_address(ctdb, addr, mask, iface)) {
471 DEBUG(0,("Failed to add line %u to the public address list\n", i+1));
/* one entry of the merged, cluster-wide list of public addresses */
struct ctdb_public_ip_list {
	struct ctdb_public_ip_list *next;
	/* node currently assigned to serve the address; compared against -1
	   for "unassigned".  NOTE(review): type inferred from usage - confirm */
	uint32_t pnn;
	struct sockaddr_in sin;
};
491 /* Given a physical node, return the number of
492 public addresses that is currently assigned to this node.
494 static int node_ip_coverage(struct ctdb_context *ctdb,
496 struct ctdb_public_ip_list *ips)
500 for (;ips;ips=ips->next) {
501 if (ips->pnn == pnn) {
509 /* Check if this is a public ip known to the node, i.e. can that
510 node takeover this ip ?
512 static int can_node_serve_ip(struct ctdb_context *ctdb, int32_t pnn,
513 struct ctdb_public_ip_list *ip)
515 struct ctdb_all_public_ips *public_ips;
518 public_ips = ctdb->nodes[pnn]->public_ips;
520 if (public_ips == NULL) {
524 for (i=0;i<public_ips->num;i++) {
525 if (ip->sin.sin_addr.s_addr == public_ips->ips[i].sin.sin_addr.s_addr) {
526 /* yes, this node can serve this public ip */
535 /* search the node list for a node to take over this ip.
536 pick the node that is currently serving the fewest ips
537 so that the ips get spread out evenly.
539 static int find_takeover_node(struct ctdb_context *ctdb,
540 struct ctdb_node_map *nodemap, uint32_t mask,
541 struct ctdb_public_ip_list *ip,
542 struct ctdb_public_ip_list *all_ips)
548 for (i=0;i<nodemap->num;i++) {
549 if (nodemap->nodes[i].flags & mask) {
550 /* This node is not healthy and can not be used to serve
556 /* verify that this node can serve this ip */
557 if (can_node_serve_ip(ctdb, i, ip)) {
558 /* no it couldnt so skip to the next node */
562 num = node_ip_coverage(ctdb, i, all_ips);
563 /* was this the first node we checked ? */
575 DEBUG(0,(__location__ " Could not find node to take over public address '%s'\n", inet_ntoa(ip->sin.sin_addr)));
583 struct ctdb_public_ip_list *
584 add_ip_to_merged_list(struct ctdb_context *ctdb,
586 struct ctdb_public_ip_list *ip_list,
587 struct ctdb_public_ip *ip)
589 struct ctdb_public_ip_list *tmp_ip;
591 /* do we already have this ip in our merged list ?*/
592 for (tmp_ip=ip_list;tmp_ip;tmp_ip=tmp_ip->next) {
594 /* we already have this public ip in the list */
595 if (tmp_ip->sin.sin_addr.s_addr == ip->sin.sin_addr.s_addr) {
600 /* this is a new public ip, we must add it to the list */
601 tmp_ip = talloc_zero(tmp_ctx, struct ctdb_public_ip_list);
602 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
603 tmp_ip->pnn = ip->pnn;
604 tmp_ip->sin = ip->sin;
605 tmp_ip->next = ip_list;
610 struct ctdb_public_ip_list *
611 create_merged_ip_list(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx)
614 struct ctdb_public_ip_list *ip_list = NULL;
615 struct ctdb_all_public_ips *public_ips;
617 for (i=0;i<ctdb->num_nodes;i++) {
618 public_ips = ctdb->nodes[i]->public_ips;
620 /* there were no public ips for this node */
621 if (public_ips == NULL) {
625 for (j=0;j<public_ips->num;j++) {
626 ip_list = add_ip_to_merged_list(ctdb, tmp_ctx,
627 ip_list, &public_ips->ips[j]);
635 make any IP alias changes for public addresses that are necessary
637 int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
639 int i, num_healthy, retries;
641 struct ctdb_public_ip ip;
643 struct ctdb_public_ip_list *all_ips, *tmp_ip;
644 int maxnode, maxnum=0, minnode, minnum=0, num;
645 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
650 /* Count how many completely healthy nodes we have */
652 for (i=0;i<nodemap->num;i++) {
653 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
658 if (num_healthy > 0) {
659 /* We have healthy nodes, so only consider them for
660 serving public addresses
662 mask = NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED;
664 /* We didnt have any completely healthy nodes so
665 use "disabled" nodes as a fallback
667 mask = NODE_FLAGS_INACTIVE;
670 /* since nodes only know about those public addresses that
671 can be served by that particular node, no single node has
672 a full list of all public addresses that exist in the cluster.
673 Walk over all node structures and create a merged list of
674 all public addresses that exist in the cluster.
676 all_ips = create_merged_ip_list(ctdb, tmp_ctx);
679 /* mark all public addresses with a masked node as being served by
682 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
683 if (tmp_ip->pnn == -1) {
686 if (nodemap->nodes[tmp_ip->pnn].flags & mask) {
692 /* now we must redistribute all public addresses with takeover node
693 -1 among the nodes available
697 /* loop over all ip's and find a physical node to cover for
700 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
701 if (tmp_ip->pnn == -1) {
702 if (find_takeover_node(ctdb, nodemap, mask, tmp_ip, all_ips)) {
703 DEBUG(0,("Failed to find node to cover ip %s\n", inet_ntoa(tmp_ip->sin.sin_addr)));
709 /* now, try to make sure the ip addresses are evenly distributed
711 for each ip address, loop over all nodes that can serve this
712 ip and make sure that the difference between the node
713 serving the most and the node serving the least ip's are not greater
716 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
717 if (tmp_ip->pnn == -1) {
721 /* Get the highest and lowest number of ips's served by any
722 valid node which can serve this ip.
726 for (i=0;i<nodemap->num;i++) {
727 if (nodemap->nodes[i].flags & mask) {
731 /* only check nodes that can actually serve this ip */
732 if (can_node_serve_ip(ctdb, i, tmp_ip)) {
733 /* no it couldnt so skip to the next node */
737 num = node_ip_coverage(ctdb, i, all_ips);
758 DEBUG(0,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n", inet_ntoa(tmp_ip->sin.sin_addr)));
762 /* if the spread between the smallest and largest coverage by
763 a node is >=2 we steal one of the ips from the node with
764 most coverage to even things out a bit.
765 try to do this at most 5 times since we dont want to spend
766 too much time balancing the ip coverage.
768 if ( (maxnum > minnum+1)
770 struct ctdb_public_ip_list *tmp;
772 /* mark one of maxnode's vnn's as unassigned and try
775 for (tmp=all_ips;tmp;tmp=tmp->next) {
776 if (tmp->pnn == maxnode) {
787 /* at this point ->pnn is the node which will own each IP
788 or -1 if there is no node that can cover this ip
791 /* now tell all nodes to delete any alias that they should not
792 have. This will be a NOOP on nodes that don't currently
793 hold the given alias */
794 for (i=0;i<nodemap->num;i++) {
795 /* don't talk to unconnected nodes, but do talk to banned nodes */
796 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
800 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
801 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
802 /* This node should be serving this
803 vnn so dont tell it to release the ip
807 ip.pnn = tmp_ip->pnn;
808 ip.sin.sin_family = AF_INET;
809 ip.sin.sin_addr = tmp_ip->sin.sin_addr;
811 ret = ctdb_ctrl_release_ip(ctdb, TAKEOVER_TIMEOUT(),
812 nodemap->nodes[i].pnn,
815 DEBUG(0,("Failed to tell vnn %u to release IP %s\n",
816 nodemap->nodes[i].pnn,
817 inet_ntoa(tmp_ip->sin.sin_addr)));
818 talloc_free(tmp_ctx);
825 /* tell all nodes to get their own IPs */
826 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
827 if (tmp_ip->pnn == -1) {
828 /* this IP won't be taken over */
831 ip.pnn = tmp_ip->pnn;
832 ip.sin.sin_family = AF_INET;
833 ip.sin.sin_addr = tmp_ip->sin.sin_addr;
835 ret = ctdb_ctrl_takeover_ip(ctdb, TAKEOVER_TIMEOUT(),
839 DEBUG(0,("Failed asking vnn %u to take over IP %s\n",
841 inet_ntoa(tmp_ip->sin.sin_addr)));
842 talloc_free(tmp_ctx);
847 talloc_free(tmp_ctx);
853 destroy a ctdb_client_ip structure
855 static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
857 DEBUG(3,("destroying client tcp for %s:%u (client_id %u)\n",
858 inet_ntoa(ip->ip.sin_addr), ntohs(ip->ip.sin_port), ip->client_id));
859 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
864 called by a client to inform us of a TCP connection that it is managing
865 that should be tickled with an ACK when IP takeover is done
867 int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
870 struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
871 struct ctdb_control_tcp *p = (struct ctdb_control_tcp *)indata.dptr;
872 struct ctdb_tcp_list *tcp;
873 struct ctdb_control_tcp_vnn t;
876 struct ctdb_client_ip *ip;
877 struct ctdb_vnn *vnn;
879 vnn = find_public_ip_vnn(ctdb, p->dest);
881 if (ntohl(p->dest.sin_addr.s_addr) != INADDR_LOOPBACK) {
882 DEBUG(0,("Could not add client IP %s. This is not a public address.\n",
883 inet_ntoa(p->dest.sin_addr)));
888 if (vnn->pnn != ctdb->pnn) {
889 DEBUG(0,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
890 inet_ntoa(p->dest.sin_addr),
891 client_id, client->pid));
892 /* failing this call will tell smbd to die */
896 ip = talloc(client, struct ctdb_client_ip);
897 CTDB_NO_MEMORY(ctdb, ip);
901 ip->client_id = client_id;
902 talloc_set_destructor(ip, ctdb_client_ip_destructor);
903 DLIST_ADD(ctdb->client_ip_list, ip);
905 tcp = talloc(client, struct ctdb_tcp_list);
906 CTDB_NO_MEMORY(ctdb, tcp);
908 tcp->connection.saddr = p->src;
909 tcp->connection.daddr = p->dest;
911 DLIST_ADD(client->tcp_list, tcp);
916 data.dptr = (uint8_t *)&t;
917 data.dsize = sizeof(t);
919 DEBUG(1,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
920 (unsigned)ntohs(p->dest.sin_port),
921 inet_ntoa(p->src.sin_addr),
922 (unsigned)ntohs(p->src.sin_port), client_id, client->pid));
924 /* tell all nodes about this tcp connection */
925 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
926 CTDB_CONTROL_TCP_ADD,
927 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
929 DEBUG(0,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
/*
  see if two sockaddr_in are the same
  (family, port and address must all match)
 */
static bool same_sockaddr_in(struct sockaddr_in *in1, struct sockaddr_in *in2)
{
	return in1->sin_family == in2->sin_family &&
		in1->sin_port == in2->sin_port &&
		in1->sin_addr.s_addr == in2->sin_addr.s_addr;
}
947 find a tcp address on a list
949 static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
950 struct ctdb_tcp_connection *tcp)
958 for (i=0;i<array->num;i++) {
959 if (same_sockaddr_in(&array->connections[i].saddr, &tcp->saddr) &&
960 same_sockaddr_in(&array->connections[i].daddr, &tcp->daddr)) {
961 return &array->connections[i];
968 called by a daemon to inform us of a TCP connection that one of its
969 clients is managing and that should be tickled with an ACK when IP takeover is
972 int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata)
974 struct ctdb_control_tcp_vnn *p = (struct ctdb_control_tcp_vnn *)indata.dptr;
975 struct ctdb_tcp_array *tcparray;
976 struct ctdb_tcp_connection tcp;
977 struct ctdb_vnn *vnn;
979 vnn = find_public_ip_vnn(ctdb, p->dest);
981 DEBUG(0,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
982 inet_ntoa(p->dest.sin_addr)));
987 tcparray = vnn->tcp_array;
989 /* If this is the first tickle */
990 if (tcparray == NULL) {
991 tcparray = talloc_size(ctdb->nodes,
992 offsetof(struct ctdb_tcp_array, connections) +
993 sizeof(struct ctdb_tcp_connection) * 1);
994 CTDB_NO_MEMORY(ctdb, tcparray);
995 vnn->tcp_array = tcparray;
998 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
999 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1001 tcparray->connections[tcparray->num].saddr = p->src;
1002 tcparray->connections[tcparray->num].daddr = p->dest;
1008 /* Do we already have this tickle ?*/
1010 tcp.daddr = p->dest;
1011 if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
1012 DEBUG(4,("Already had tickle info for %s:%u for vnn:%u\n",
1013 inet_ntoa(tcp.daddr.sin_addr),
1014 ntohs(tcp.daddr.sin_port),
1019 /* A new tickle, we must add it to the array */
1020 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
1021 struct ctdb_tcp_connection,
1023 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1025 vnn->tcp_array = tcparray;
1026 tcparray->connections[tcparray->num].saddr = p->src;
1027 tcparray->connections[tcparray->num].daddr = p->dest;
1030 DEBUG(2,("Added tickle info for %s:%u from vnn %u\n",
1031 inet_ntoa(tcp.daddr.sin_addr),
1032 ntohs(tcp.daddr.sin_port),
1040 remove a TCP connection that a daemon previously registered for one of its
1041 clients, so that it is no longer tickled with an ACK when IP takeover is
1044 static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
1046 struct ctdb_tcp_connection *tcpp;
1047 struct ctdb_vnn *vnn = find_public_ip_vnn(ctdb, conn->daddr);
1050 DEBUG(0,(__location__ " unable to find public address %s\n", inet_ntoa(conn->daddr.sin_addr)));
1054 /* if the array is empty we cant remove it
1055 and we dont need to do anything
1057 if (vnn->tcp_array == NULL) {
1058 DEBUG(2,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
1059 inet_ntoa(conn->daddr.sin_addr),
1060 ntohs(conn->daddr.sin_port)));
1065 /* See if we know this connection
1066 if we dont know this connection then we dont need to do anything
1068 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
1070 DEBUG(2,("Trying to remove tickle that doesnt exist %s:%u\n",
1071 inet_ntoa(conn->daddr.sin_addr),
1072 ntohs(conn->daddr.sin_port)));
1077 /* We need to remove this entry from the array.
1078 Instead of allocating a new array and copying data to it
1079 we cheat and just copy the last entry in the existing array
1080 to the entry that is to be removed and just shrink the
1083 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
1084 vnn->tcp_array->num--;
1086 /* If we deleted the last entry we also need to remove the entire array
1088 if (vnn->tcp_array->num == 0) {
1089 talloc_free(vnn->tcp_array);
1090 vnn->tcp_array = NULL;
1093 vnn->tcp_update_needed = true;
1095 DEBUG(2,("Removed tickle info for %s:%u\n",
1096 inet_ntoa(conn->saddr.sin_addr),
1097 ntohs(conn->saddr.sin_port)));
/*
  called when a daemon restarts - send all tickles for all public addresses
  we are serving immediately to the new node.

  Currently a placeholder: nothing is sent and 0 is returned.
 */
int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
{
/*XXX here we should send all tickles we are serving to the new node */
	return 0;
}
1113 called when a client structure goes away - hook to remove
1114 elements from the tcp_list in all daemons
1116 void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
1118 while (client->tcp_list) {
1119 struct ctdb_tcp_list *tcp = client->tcp_list;
1120 DLIST_REMOVE(client->tcp_list, tcp);
1121 ctdb_remove_tcp_connection(client->ctdb, &tcp->connection);
1127 release all IPs on shutdown
1129 void ctdb_release_all_ips(struct ctdb_context *ctdb)
1131 struct ctdb_vnn *vnn;
1133 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1134 if (!ctdb_sys_have_ip(vnn->public_address)) {
1137 ctdb_event_script(ctdb, "releaseip %s %s %u",
1139 inet_ntoa(vnn->public_address.sin_addr),
1140 vnn->public_netmask_bits);
1141 release_kill_clients(ctdb, vnn->public_address);
1147 get list of public IPs
1149 int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
1150 struct ctdb_req_control *c, TDB_DATA *outdata)
1153 struct ctdb_all_public_ips *ips;
1154 struct ctdb_vnn *vnn;
1156 /* count how many public ip structures we have */
1158 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1162 len = offsetof(struct ctdb_all_public_ips, ips) +
1163 num*sizeof(struct ctdb_public_ip);
1164 ips = talloc_zero_size(outdata, len);
1165 CTDB_NO_MEMORY(ctdb, ips);
1167 outdata->dsize = len;
1168 outdata->dptr = (uint8_t *)ips;
1172 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1173 ips->ips[i].pnn = vnn->pnn;
1174 ips->ips[i].sin = vnn->public_address;
1184 structure containing the listening socket and the list of tcp connections
1185 that the ctdb daemon is to kill
1187 struct ctdb_kill_tcp {
1188 struct ctdb_vnn *vnn;
1189 struct ctdb_context *ctdb;
1192 struct fd_event *fde;
1193 trbt_tree_t *connections;
/*
  a tcp connection that is to be killed
 */
struct ctdb_killtcp_con {
	struct sockaddr_in src;
	struct sockaddr_in dst;
	/* tickling is given up once this reaches 5.
	   NOTE(review): width inferred from usage - confirm */
	uint32_t count;
	struct ctdb_kill_tcp *killtcp;
};
/* this function is used to create a key to represent this socketpair
   in the killtcp tree.
   this key is used to insert and lookup matching socketpairs that are
   to be tickled and RST

   The returned pointer refers to a static array that is overwritten by
   the next call, so the key must be consumed before calling again.
 */
#define KILLTCP_KEYLEN	4
static uint32_t *killtcp_key(struct sockaddr_in *src, struct sockaddr_in *dst)
{
	static uint32_t key[KILLTCP_KEYLEN];

	key[0]	= dst->sin_addr.s_addr;
	key[1]	= src->sin_addr.s_addr;
	key[2]	= dst->sin_port;
	key[3]	= src->sin_port;

	return key;
}
1226 called when we get a read event on the raw socket
1228 static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde,
1229 uint16_t flags, void *private_data)
1231 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1232 struct ctdb_killtcp_con *con;
1233 struct sockaddr_in src, dst;
1234 uint32_t ack_seq, seq;
1236 if (!(flags & EVENT_FD_READ)) {
1240 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
1241 killtcp->private_data,
1243 &ack_seq, &seq) != 0) {
1244 /* probably a non-tcp ACK packet */
1248 /* check if we have this guy in our list of connections
1251 con = trbt_lookuparray32(killtcp->connections,
1252 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
1254 /* no this was some other packet we can just ignore */
1258 /* This one has been tickled !
1259 now reset him and remove him from the list.
1261 DEBUG(1, ("sending a tcp reset to kill connection :%d -> %s:%d\n", ntohs(con->dst.sin_port), inet_ntoa(con->src.sin_addr), ntohs(con->src.sin_port)));
1263 ctdb_sys_send_tcp(killtcp->sending_fd, &con->dst,
1264 &con->src, ack_seq, seq, 1);
1269 /* when traversing the list of all tcp connections to send tickle acks to
1270 (so that we can capture the ack coming back and kill the connection
1272 this callback is called for each connection we are currently trying to kill
1274 static void tickle_connection_traverse(void *param, void *data)
1276 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
1277 struct ctdb_kill_tcp *killtcp = talloc_get_type(param, struct ctdb_kill_tcp);
1279 /* have tried too many times, just give up */
1280 if (con->count >= 5) {
1285 /* otherwise, try tickling it again */
1287 ctdb_sys_send_tcp(killtcp->sending_fd, &con->dst, &con->src, 0, 0, 0);
1292 called every second until all sentenced connections have been reset
1294 static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te,
1295 struct timeval t, void *private_data)
1297 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
1300 /* loop over all connections sending tickle ACKs */
1301 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, killtcp);
1304 /* If there are no more connections to kill we can remove the
1305 entire killtcp structure
1307 if ( (killtcp->connections == NULL) ||
1308 (killtcp->connections->root == NULL) ) {
1309 talloc_free(killtcp);
1313 /* try tickling them again in a seconds time
1315 event_add_timed(killtcp->ctdb->ev, killtcp, timeval_current_ofs(1, 0),
1316 ctdb_tickle_sentenced_connections, killtcp);
1320 destroy the killtcp structure
1322 static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
1324 if (killtcp->sending_fd != -1) {
1325 close(killtcp->sending_fd);
1326 killtcp->sending_fd = -1;
1328 killtcp->vnn->killtcp = NULL;
/* nothing fancy here, just unconditionally replace any existing
   connection structure with the new one.

   parm is the new connection structure passed to
   trbt_insertarray32_callback(); 'data' is deliberately ignored.
   NOTE(review): assumes 'data' is the entry already stored under the
   key and that the tree stores the returned pointer - confirm against
   rb_tree.h.
 */
static void *add_killtcp_callback(void *parm, void *data)
{
	return parm;
}
1346 add a tcp socket to the list of connections we want to RST
1348 static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
1349 struct sockaddr_in *src, struct sockaddr_in *dst)
1351 struct ctdb_kill_tcp *killtcp;
1352 struct ctdb_killtcp_con *con;
1353 struct ctdb_vnn *vnn;
1355 vnn = find_public_ip_vnn(ctdb, *dst);
1357 vnn = find_public_ip_vnn(ctdb, *src);
1360 DEBUG(0,(__location__ " Could not killtcp, not a public address\n"));
1364 killtcp = vnn->killtcp;
1366 /* If this is the first connection to kill we must allocate
1369 if (killtcp == NULL) {
1370 killtcp = talloc_zero(ctdb, struct ctdb_kill_tcp);
1371 CTDB_NO_MEMORY(ctdb, killtcp);
1374 killtcp->ctdb = ctdb;
1375 killtcp->capture_fd = -1;
1376 killtcp->sending_fd = -1;
1377 killtcp->connections = trbt_create(killtcp, 0);
1379 vnn->killtcp = killtcp;
1380 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
1385 /* create a structure that describes this connection we want to
1386 RST and store it in killtcp->connections
1388 con = talloc(killtcp, struct ctdb_killtcp_con);
1389 CTDB_NO_MEMORY(ctdb, con);
1393 con->killtcp = killtcp;
1396 trbt_insertarray32_callback(killtcp->connections,
1397 KILLTCP_KEYLEN, killtcp_key(&con->dst, &con->src),
1398 add_killtcp_callback, con);
1401 If we dont have a socket to send from yet we must create it
1403 if (killtcp->sending_fd == -1) {
1404 killtcp->sending_fd = ctdb_sys_open_sending_socket();
1405 if (killtcp->sending_fd == -1) {
1406 DEBUG(0,(__location__ " Failed to open sending socket for killtcp\n"));
1412 If we dont have a socket to listen on yet we must create it
1414 if (killtcp->capture_fd == -1) {
1415 killtcp->capture_fd = ctdb_sys_open_capture_socket(vnn->iface, &killtcp->private_data);
1416 if (killtcp->capture_fd == -1) {
1417 DEBUG(0,(__location__ " Failed to open capturing socket for killtcp\n"));
1423 if (killtcp->fde == NULL) {
1424 killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd,
1425 EVENT_FD_READ | EVENT_FD_AUTOCLOSE,
1426 capture_tcp_handler, killtcp);
1428 /* We also need to set up some events to tickle all these connections
1429 until they are all reset
1431 event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
1432 ctdb_tickle_sentenced_connections, killtcp);
1435 /* tickle him once now */
1436 ctdb_sys_send_tcp(killtcp->sending_fd, &con->dst, &con->src, 0, 0, 0);
1441 talloc_free(vnn->killtcp);
1442 vnn->killtcp = NULL;
1447 kill a TCP connection.
1449 int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
1451 struct ctdb_control_killtcp *killtcp = (struct ctdb_control_killtcp *)indata.dptr;
1453 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
1457 called by a daemon to inform us of the entire list of TCP tickles for
1458 a particular public address.
1459 this control should only be sent by the node that is currently serving
1460 that public address.
1462 int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
1464 struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
1465 struct ctdb_tcp_array *tcparray;
1466 struct ctdb_vnn *vnn;
1468 /* We must at least have tickles.num or else we cant verify the size
1469 of the received data blob
1471 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
1472 tickles.connections)) {
1473 DEBUG(0,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
1477 /* verify that the size of data matches what we expect */
1478 if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list,
1479 tickles.connections)
1480 + sizeof(struct ctdb_tcp_connection)
1481 * list->tickles.num) {
1482 DEBUG(0,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
1486 vnn = find_public_ip_vnn(ctdb, list->ip);
1488 DEBUG(0,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
1489 inet_ntoa(list->ip.sin_addr)));
1493 /* remove any old ticklelist we might have */
1494 talloc_free(vnn->tcp_array);
1495 vnn->tcp_array = NULL;
1497 tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
1498 CTDB_NO_MEMORY(ctdb, tcparray);
1500 tcparray->num = list->tickles.num;
1502 tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
1503 CTDB_NO_MEMORY(ctdb, tcparray->connections);
1505 memcpy(tcparray->connections, &list->tickles.connections[0],
1506 sizeof(struct ctdb_tcp_connection)*tcparray->num);
1508 /* We now have a new fresh tickle list array for this vnn */
1509 vnn->tcp_array = talloc_steal(vnn, tcparray);
1515 called to return the full list of tickles for the puclic address associated
1516 with the provided vnn
1518 int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1520 struct sockaddr_in *ip = (struct sockaddr_in *)indata.dptr;
1521 struct ctdb_control_tcp_tickle_list *list;
1522 struct ctdb_tcp_array *tcparray;
1524 struct ctdb_vnn *vnn;
1526 vnn = find_public_ip_vnn(ctdb, *ip);
1528 DEBUG(0,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
1529 inet_ntoa(ip->sin_addr)));
1533 tcparray = vnn->tcp_array;
1535 num = tcparray->num;
1540 outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
1541 tickles.connections)
1542 + sizeof(struct ctdb_tcp_connection) * num;
1544 outdata->dptr = talloc_size(outdata, outdata->dsize);
1545 CTDB_NO_MEMORY(ctdb, outdata->dptr);
1546 list = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
1549 list->tickles.num = num;
1551 memcpy(&list->tickles.connections[0], tcparray->connections,
1552 sizeof(struct ctdb_tcp_connection) * num);
1560 set the list of all tcp tickles for a public address
1562 static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb,
1563 struct timeval timeout, uint32_t destnode,
1564 struct sockaddr_in *ip,
1565 struct ctdb_tcp_array *tcparray)
1569 struct ctdb_control_tcp_tickle_list *list;
1572 num = tcparray->num;
1577 data.dsize = offsetof(struct ctdb_control_tcp_tickle_list,
1578 tickles.connections) +
1579 sizeof(struct ctdb_tcp_connection) * num;
1580 data.dptr = talloc_size(ctdb, data.dsize);
1581 CTDB_NO_MEMORY(ctdb, data.dptr);
1583 list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
1585 list->tickles.num = num;
1587 memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
1590 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
1591 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
1592 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
1594 DEBUG(0,(__location__ " ctdb_control for set tcp tickles failed\n"));
1598 talloc_free(data.dptr);
1605 perform tickle updates if required
1607 static void ctdb_update_tcp_tickles(struct event_context *ev,
1608 struct timed_event *te,
1609 struct timeval t, void *private_data)
1611 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
1613 struct ctdb_vnn *vnn;
1615 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1616 /* we only send out updates for public addresses that
1619 if (ctdb->pnn != vnn->pnn) {
1622 /* We only send out the updates if we need to */
1623 if (!vnn->tcp_update_needed) {
1626 ret = ctdb_ctrl_set_tcp_tickles(ctdb,
1628 CTDB_BROADCAST_CONNECTED,
1629 &vnn->public_address,
1632 DEBUG(0,("Failed to send the tickle update for public address %s\n",
1633 inet_ntoa(vnn->public_address.sin_addr)));
1637 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
1638 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
1639 ctdb_update_tcp_tickles, ctdb);
1644 start periodic update of tcp tickles
1646 void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
1648 ctdb->tickle_update_context = talloc_new(ctdb);
1650 event_add_timed(ctdb->ev, ctdb->tickle_update_context,
1651 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
1652 ctdb_update_tcp_tickles, ctdb);
/*
  state for one series of gratuitous arps requested through a control
 */
struct control_gratious_arp {
	struct ctdb_context *ctdb;	/* daemon context, provides the event loop */
	struct sockaddr_in sin;		/* address to announce */
	const char *iface;		/* interface name handed to ctdb_sys_send_arp() */
	int count;			/* number of arps sent so far */
};
1666 send a control_gratuitous arp
1668 static void send_gratious_arp(struct event_context *ev, struct timed_event *te,
1669 struct timeval t, void *private_data)
1672 struct control_gratious_arp *arp = talloc_get_type(private_data,
1673 struct control_gratious_arp);
1675 ret = ctdb_sys_send_arp(&arp->sin, arp->iface);
1677 DEBUG(0,(__location__ " sending of gratious arp failed (%s)\n", strerror(errno)));
1682 if (arp->count == CTDB_ARP_REPEAT) {
1687 event_add_timed(arp->ctdb->ev, arp,
1688 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
1689 send_gratious_arp, arp);
1696 int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
1698 struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
1699 struct control_gratious_arp *arp;
1702 /* verify the size of indata */
1703 if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
1704 DEBUG(0,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure\n"));
1708 ( offsetof(struct ctdb_control_gratious_arp, iface)
1709 + gratious_arp->len ) ){
1711 DEBUG(0,(__location__ " Wrong size of indata. Was %d bytes "
1712 "but should be %d bytes\n",
1714 offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len));
1719 arp = talloc(ctdb, struct control_gratious_arp);
1720 CTDB_NO_MEMORY(ctdb, arp);
1723 arp->sin = gratious_arp->sin;
1724 arp->iface = talloc_strdup(arp, gratious_arp->iface);
1725 CTDB_NO_MEMORY(ctdb, arp->iface);
1728 event_add_timed(arp->ctdb->ev, arp,
1729 timeval_zero(), send_gratious_arp, arp);