c088970e256218665fedb1a3b6213c28b75dd1e7
[samba.git] / ctdb / server / ctdb_takeover_helper.c
1 /*
2    CTDB IP takeover helper
3
4    Copyright (C) Martin Schwenke  2016
5
6    Based on ctdb_recovery_helper.c
7    Copyright (C) Amitay Isaacs  2015
8
9    and ctdb_takeover.c
10    Copyright (C) Ronnie Sahlberg  2007
11    Copyright (C) Andrew Tridgell  2007
12    Copyright (C) Martin Schwenke  2011
13
14    This program is free software; you can redistribute it and/or modify
15    it under the terms of the GNU General Public License as published by
16    the Free Software Foundation; either version 3 of the License, or
17    (at your option) any later version.
18
19    This program is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
22    GNU General Public License for more details.
23
24    You should have received a copy of the GNU General Public License
25    along with this program; if not, see <http://www.gnu.org/licenses/>.
26 */
27
28 #include "replace.h"
29 #include "system/network.h"
30 #include "system/filesys.h"
31
32 #include <popt.h>
33 #include <talloc.h>
34 #include <tevent.h>
35
36 #include "lib/util/debug.h"
37 #include "lib/util/strv.h"
38 #include "lib/util/strv_util.h"
39 #include "lib/util/sys_rw.h"
40 #include "lib/util/time.h"
41 #include "lib/util/tevent_unix.h"
42
43 #include "protocol/protocol.h"
44 #include "protocol/protocol_api.h"
45 #include "protocol/protocol_util.h"
46 #include "client/client.h"
47
48 #include "common/logging.h"
49
50 #include "server/ipalloc.h"
51
/*
 * Timeout, in seconds, applied to controls sent by this helper.  The
 * value below is only the initial default: takeover_tunables_done()
 * overwrites it with the takeover_timeout tunable once the tunables
 * have been fetched.
 */
static int takeover_timeout = 9;

/* Timeout value for a single control, built from the current setting */
#define TIMEOUT()       timeval_current_ofs(takeover_timeout, 0)
55
56 /*
57  * Utility functions
58  */
59
/*
 * Common receive function for requests that carry no result data.
 *
 * Returns true if the request did not finish with an error.  Otherwise
 * returns false and, when perr is not NULL, stores the error code
 * reported by tevent_req_is_unix_error() in *perr.
 */
static bool generic_recv(struct tevent_req *req, int *perr)
{
        int unix_err;
        bool failed = tevent_req_is_unix_error(req, &unix_err);

        if (! failed) {
                return true;
        }

        if (perr != NULL) {
                *perr = unix_err;
        }
        return false;
}
73
74 static enum ipalloc_algorithm
75 determine_algorithm(const struct ctdb_tunable_list *tunables)
76 {
77         switch (tunables->ip_alloc_algorithm) {
78         case 0:
79                 return IPALLOC_DETERMINISTIC;
80         case 1:
81                 return IPALLOC_NONDETERMINISTIC;
82         case 2:
83                 return IPALLOC_LCP2;
84         default:
85                 return IPALLOC_LCP2;
86         };
87 }
88
89 /**********************************************************************/
90
/* State for a get_public_ips request */
struct get_public_ips_state {
        /* Nodes the GET_PUBLIC_IPS control is sent to */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Result array, one slot per node, indexed by PNN */
        struct ctdb_public_ip_list *ips;
        /* Per-node counters, incremented for each node that fails */
        uint32_t *ban_credits;
};

/* Completion callback for the GET_PUBLIC_IPS multi-node control */
static void get_public_ips_done(struct tevent_req *subreq);
99
100 static struct tevent_req *get_public_ips_send(
101                                 TALLOC_CTX *mem_ctx,
102                                 struct tevent_context *ev,
103                                 struct ctdb_client_context *client,
104                                 uint32_t *pnns,
105                                 int count, int num_nodes,
106                                 uint32_t *ban_credits,
107                                 bool available_only)
108 {
109         struct tevent_req *req, *subreq;
110         struct get_public_ips_state *state;
111         struct ctdb_req_control request;
112
113         req = tevent_req_create(mem_ctx, &state, struct get_public_ips_state);
114         if (req == NULL) {
115                 return NULL;
116         }
117
118         state->pnns = pnns;
119         state->count = count;
120         state->ban_credits = ban_credits;
121
122         state->ips  = talloc_zero_array(state,
123                                         struct ctdb_public_ip_list,
124                                         num_nodes);
125         if (tevent_req_nomem(state->ips, req)) {
126                 return tevent_req_post(req, ev);
127         }
128
129         /* Short circuit if no nodes being asked for IPs */
130         if (state->count == 0) {
131                 tevent_req_done(req);
132                 return tevent_req_post(req, ev);
133         }
134
135         ctdb_req_control_get_public_ips(&request, available_only);
136         subreq = ctdb_client_control_multi_send(mem_ctx, ev, client,
137                                                 state->pnns,
138                                                 state->count,
139                                                 TIMEOUT(), &request);
140         if (tevent_req_nomem(subreq, req)) {
141                 return tevent_req_post(req, ev);
142         }
143         tevent_req_set_callback(subreq, get_public_ips_done, req);
144
145         return req;
146 }
147
148 static void get_public_ips_done(struct tevent_req *subreq)
149 {
150         struct tevent_req *req = tevent_req_callback_data(
151                 subreq, struct tevent_req);
152         struct get_public_ips_state *state = tevent_req_data(
153                 req, struct get_public_ips_state);
154         struct ctdb_reply_control **reply;
155         int *err_list;
156         int ret, i;
157         bool status, found_errors;
158
159         status = ctdb_client_control_multi_recv(subreq, &ret, state, &err_list,
160                                                 &reply);
161         TALLOC_FREE(subreq);
162         if (! status) {
163                 for (i = 0; i < state->count; i++) {
164                         if (err_list[i] != 0) {
165                                 uint32_t pnn = state->pnns[i];
166
167                                 D_ERR("control GET_PUBLIC_IPS failed on "
168                                       "node %u, ret=%d\n", pnn, err_list[i]);
169
170                                 state->ban_credits[pnn]++;
171                         }
172                 }
173
174                 tevent_req_error(req, ret);
175                 return;
176         }
177
178         found_errors = false;
179         for (i = 0; i < state->count; i++) {
180                 uint32_t pnn;
181                 struct ctdb_public_ip_list *ips;
182
183                 pnn = state->pnns[i];
184                 ret = ctdb_reply_control_get_public_ips(reply[i], state->ips,
185                                                         &ips);
186                 if (ret != 0) {
187                         D_ERR("control GET_PUBLIC_IPS failed on "
188                               "node %u\n", pnn);
189                         state->ban_credits[pnn]++;
190                         found_errors = true;
191                         continue;
192                 }
193
194                 D_INFO("Fetched public IPs from node %u\n", pnn);
195                 state->ips[pnn] = *ips;
196         }
197
198         if (found_errors) {
199                 tevent_req_error(req, EIO);
200                 return;
201         }
202
203         talloc_free(reply);
204
205         tevent_req_done(req);
206 }
207
208 static bool get_public_ips_recv(struct tevent_req *req, int *perr,
209                                 TALLOC_CTX *mem_ctx,
210                                 struct ctdb_public_ip_list **ips)
211 {
212         struct get_public_ips_state *state = tevent_req_data(
213                 req, struct get_public_ips_state);
214         int err;
215
216         if (tevent_req_is_unix_error(req, &err)) {
217                 if (perr != NULL) {
218                         *perr = err;
219                 }
220                 return false;
221         }
222
223         *ips = talloc_steal(mem_ctx, state->ips);
224
225         return true;
226 }
227
228 /**********************************************************************/
229
/* Overall state for a release_ip request */
struct release_ip_state {
        /* Number of multi-node RELEASE_IP controls started (one per IP) */
        int num_sent;
        /* Number of those controls that have completed */
        int num_replies;
        /* Number of those controls that completed with a failure */
        int num_fails;
        /* Most recently recorded error, reported if anything failed */
        int err_any;
        /* Per-node counters, incremented for each node that fails */
        uint32_t *ban_credits;
};

/* State for the RELEASE_IP control covering a single IP address */
struct release_ip_one_state {
        /* The enclosing release_ip request */
        struct tevent_req *req;
        /* Nodes this RELEASE_IP was sent to */
        uint32_t *pnns;
        /* Number of entries used in pnns */
        int count;
        /* Printable form of the address, for log messages */
        const char *ip_str;
};

/* Completion callback for one multi-node RELEASE_IP control */
static void release_ip_done(struct tevent_req *subreq);
246
247 static struct tevent_req *release_ip_send(TALLOC_CTX *mem_ctx,
248                                           struct tevent_context *ev,
249                                           struct ctdb_client_context *client,
250                                           uint32_t *pnns,
251                                           int count,
252                                           struct timeval timeout,
253                                           struct public_ip_list *all_ips,
254                                           uint32_t *ban_credits)
255 {
256         struct tevent_req *req, *subreq;
257         struct release_ip_state *state;
258         struct ctdb_req_control request;
259         struct public_ip_list *tmp_ip;
260
261         req = tevent_req_create(mem_ctx, &state, struct release_ip_state);
262         if (req == NULL) {
263                 return NULL;
264         }
265
266         state->num_sent = 0;
267         state->num_replies = 0;
268         state->num_fails = 0;
269         state->ban_credits = ban_credits;
270
271         /* Send a RELEASE_IP to all nodes that should not be hosting
272          * each IP.  For each IP, all but one of these will be
273          * redundant.  However, the redundant ones are used to tell
274          * nodes which node should be hosting the IP so that commands
275          * like "ctdb ip" can display a particular nodes idea of who
276          * is hosting what. */
277         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
278                 struct release_ip_one_state *substate;
279                 struct ctdb_public_ip ip;
280                 int i;
281
282                 substate = talloc_zero(state, struct release_ip_one_state);
283                 if (tevent_req_nomem(substate, req)) {
284                         return tevent_req_post(req, ev);
285                 }
286
287                 substate->pnns = talloc_zero_array(substate, uint32_t, count);
288                 if (tevent_req_nomem(substate->pnns, req)) {
289                         return tevent_req_post(req, ev);
290                 }
291
292                 substate->count = 0;
293                 substate->req = req;
294
295                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
296                                                              &tmp_ip->addr,
297                                                              false);
298                 if (tevent_req_nomem(substate->ip_str, req)) {
299                         return tevent_req_post(req, ev);
300                 }
301
302                 for (i = 0; i < count; i++) {
303                         uint32_t pnn = pnns[i];
304
305                         /* Skip this node if IP is not known */
306                         if (! bitmap_query(tmp_ip->known_on, pnn)) {
307                                 continue;
308                         }
309
310                         /* If pnn is not the node that should be
311                          * hosting the IP then add it to the list of
312                          * nodes that need to do a release. */
313                         if (tmp_ip->pnn != pnn) {
314                                 substate->pnns[substate->count] = pnn;
315                                 substate->count++;
316                         }
317                 }
318
319                 if (substate->count == 0) {
320                         /* No releases to send for this address... */
321                         TALLOC_FREE(substate);
322                         continue;
323                 }
324
325                 ip.pnn = tmp_ip->pnn;
326                 ip.addr = tmp_ip->addr;
327                 ctdb_req_control_release_ip(&request, &ip);
328                 subreq = ctdb_client_control_multi_send(state, ev, client,
329                                                         substate->pnns,
330                                                         substate->count,
331                                                         timeout,/* cumulative */
332                                                         &request);
333                 if (tevent_req_nomem(subreq, req)) {
334                         return tevent_req_post(req, ev);
335                 }
336                 tevent_req_set_callback(subreq, release_ip_done, substate);
337
338                 state->num_sent++;
339         }
340
341         /* None sent, finished... */
342         if (state->num_sent == 0) {
343                 tevent_req_done(req);
344                 return tevent_req_post(req, ev);
345         }
346
347         return req;
348 }
349
350 static void release_ip_done(struct tevent_req *subreq)
351 {
352         struct release_ip_one_state *substate = tevent_req_callback_data(
353                 subreq, struct release_ip_one_state);
354         struct tevent_req *req = substate->req;
355         struct release_ip_state *state = tevent_req_data(
356                 req, struct release_ip_state);
357         int ret, i;
358         int *err_list;
359         bool status, found_errors;
360
361         status = ctdb_client_control_multi_recv(subreq, &ret, state,
362                                                 &err_list, NULL);
363         TALLOC_FREE(subreq);
364
365         if (status) {
366                 D_INFO("RELEASE_IP %s succeeded on %d nodes\n",
367                        substate->ip_str, substate->count);
368                 goto done;
369         }
370
371         /* Get some clear error messages out of err_list and count
372          * banning credits
373          */
374         found_errors = false;
375         for (i = 0; i < substate->count; i++) {
376                 int err = err_list[i];
377                 if (err != 0) {
378                         uint32_t pnn = substate->pnns[i];
379
380                         D_ERR("RELEASE_IP %s failed on node %u, "
381                               "ret=%d\n", substate->ip_str, pnn, err);
382
383                         state->ban_credits[pnn]++;
384                         state->err_any = err;
385                         found_errors = true;
386                 }
387         }
388         if (! found_errors) {
389                 D_ERR("RELEASE_IP %s internal error, ret=%d\n",
390                       substate->ip_str, ret);
391                 state->err_any = EIO;
392         }
393
394         state->num_fails++;
395
396 done:
397         talloc_free(substate);
398
399         state->num_replies++;
400
401         if (state->num_replies < state->num_sent) {
402                 /* Not all replies received, don't go further */
403                 return;
404         }
405
406         if (state->num_fails > 0) {
407                 tevent_req_error(req, state->err_any);
408                 return;
409         }
410
411         tevent_req_done(req);
412 }
413
/*
 * Collect the outcome of a release_ip request.  Returns true on
 * success; on failure returns false and reports the error via perr.
 */
static bool release_ip_recv(struct tevent_req *req, int *perr)
{
        bool ok = generic_recv(req, perr);

        return ok;
}
418
419 /**********************************************************************/
420
/* Overall state for a take_ip request */
struct take_ip_state {
        /* Number of TAKEOVER_IP controls started (one per assigned IP) */
        int num_sent;
        /* Number of those controls that have completed */
        int num_replies;
        /* Number of those controls that completed with a failure */
        int num_fails;
        /* Most recently recorded error, reported if anything failed */
        int err_any;
        /* Per-node counters, incremented for each node that fails */
        uint32_t *ban_credits;
};

/* State for the TAKEOVER_IP control covering a single IP address */
struct take_ip_one_state {
        /* The enclosing take_ip request */
        struct tevent_req *req;
        /* Node the TAKEOVER_IP was sent to */
        uint32_t pnn;
        /* Printable form of the address, for log messages */
        const char *ip_str;
};

/* Completion callback for one TAKEOVER_IP control */
static void take_ip_done(struct tevent_req *subreq);
436
437 static struct tevent_req *take_ip_send(TALLOC_CTX *mem_ctx,
438                                        struct tevent_context *ev,
439                                        struct ctdb_client_context *client,
440                                        struct timeval timeout,
441                                        struct public_ip_list *all_ips,
442                                        uint32_t *ban_credits)
443 {
444         struct tevent_req *req, *subreq;
445         struct take_ip_state *state;
446         struct ctdb_req_control request;
447         struct public_ip_list *tmp_ip;
448
449         req = tevent_req_create(mem_ctx, &state, struct take_ip_state);
450         if (req == NULL) {
451                 return NULL;
452         }
453
454         state->num_sent = 0;
455         state->num_replies = 0;
456         state->num_fails = 0;
457         state->ban_credits = ban_credits;
458
459         /* For each IP, send a TAKOVER_IP to the node that should be
460          * hosting it.  Many of these will often be redundant (since
461          * the allocation won't have changed) but they can be useful
462          * to recover from inconsistencies. */
463         for (tmp_ip = all_ips; tmp_ip != NULL; tmp_ip = tmp_ip->next) {
464                 struct take_ip_one_state *substate;
465                 struct ctdb_public_ip ip;
466
467                 if (tmp_ip->pnn == CTDB_UNKNOWN_PNN) {
468                         /* IP will be unassigned */
469                         continue;
470                 }
471
472                 substate = talloc_zero(state, struct take_ip_one_state);
473                 if (tevent_req_nomem(substate, req)) {
474                         return tevent_req_post(req, ev);
475                 }
476
477                 substate->req = req;
478                 substate->pnn = tmp_ip->pnn;
479
480                 substate->ip_str  = ctdb_sock_addr_to_string(substate,
481                                                              &tmp_ip->addr,
482                                                              false);
483                 if (tevent_req_nomem(substate->ip_str, req)) {
484                         return tevent_req_post(req, ev);
485                 }
486
487                 ip.pnn = tmp_ip->pnn;
488                 ip.addr = tmp_ip->addr;
489                 ctdb_req_control_takeover_ip(&request, &ip);
490                 subreq = ctdb_client_control_send(
491                                         state, ev, client, tmp_ip->pnn,
492                                         timeout, /* cumulative */
493                                         &request);
494                 if (tevent_req_nomem(subreq, req)) {
495                         return tevent_req_post(req, ev);
496                 }
497                 tevent_req_set_callback(subreq, take_ip_done, substate);
498
499                 state->num_sent++;
500         }
501
502         /* None sent, finished... */
503         if (state->num_sent == 0) {
504                 tevent_req_done(req);
505                 return tevent_req_post(req, ev);
506         }
507
508         return req;
509 }
510
511 static void take_ip_done(struct tevent_req *subreq)
512 {
513         struct take_ip_one_state *substate = tevent_req_callback_data(
514                 subreq, struct take_ip_one_state);
515         struct tevent_req *req = substate->req;
516         struct ctdb_reply_control *reply;
517         struct take_ip_state *state = tevent_req_data(
518                 req, struct take_ip_state);
519         int ret = 0;
520         bool status;
521
522         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
523         TALLOC_FREE(subreq);
524
525         if (! status) {
526                 D_ERR("TAKEOVER_IP %s failed to node %u, ret=%d\n",
527                       substate->ip_str, substate->pnn, ret);
528                 goto fail;
529         }
530
531         ret = ctdb_reply_control_takeover_ip(reply);
532         if (ret != 0) {
533                 D_ERR("TAKEOVER_IP %s failed on node %u, ret=%d\n",
534                       substate->ip_str, substate->pnn, ret);
535                 goto fail;
536         }
537
538         D_INFO("TAKEOVER_IP %s succeeded on node %u\n",
539                substate->ip_str, substate->pnn);
540         goto done;
541
542 fail:
543         state->ban_credits[substate->pnn]++;
544         state->num_fails++;
545         state->err_any = ret;
546
547 done:
548         talloc_free(substate);
549
550         state->num_replies++;
551
552         if (state->num_replies < state->num_sent) {
553                 /* Not all replies received, don't go further */
554                 return;
555         }
556
557         if (state->num_fails > 0) {
558                 tevent_req_error(req, state->err_any);
559                 return;
560         }
561
562         tevent_req_done(req);
563 }
564
/*
 * Collect the outcome of a take_ip request.  Returns true on success;
 * on failure returns false and reports the error via perr.
 */
static bool take_ip_recv(struct tevent_req *req, int *perr)
{
        bool ok = generic_recv(req, perr);

        return ok;
}
569
570 /**********************************************************************/
571
/* State for an ipreallocated request */
struct ipreallocated_state {
        /* Nodes the IPREALLOCATED control is sent to */
        uint32_t *pnns;
        /* Number of entries in pnns */
        int count;
        /* Per-node counters, incremented for each node that fails */
        uint32_t *ban_credits;
};

/* Completion callback for the IPREALLOCATED multi-node control */
static void ipreallocated_done(struct tevent_req *subreq);
579
580 static struct tevent_req *ipreallocated_send(TALLOC_CTX *mem_ctx,
581                                              struct tevent_context *ev,
582                                              struct ctdb_client_context *client,
583                                              uint32_t *pnns,
584                                              int count,
585                                              struct timeval timeout,
586                                              uint32_t *ban_credits)
587 {
588         struct tevent_req *req, *subreq;
589         struct ipreallocated_state *state;
590         struct ctdb_req_control request;
591
592         req = tevent_req_create(mem_ctx, &state, struct ipreallocated_state);
593         if (req == NULL) {
594                 return NULL;
595         }
596
597         state->pnns = pnns;
598         state->count = count;
599         state->ban_credits = ban_credits;
600
601         ctdb_req_control_ipreallocated(&request);
602         subreq = ctdb_client_control_multi_send(state, ev, client,
603                                                 pnns, count,
604                                                 timeout, /* cumulative */
605                                                 &request);
606         if (tevent_req_nomem(subreq, req)) {
607                 return tevent_req_post(req, ev);
608         }
609         tevent_req_set_callback(subreq, ipreallocated_done, req);
610
611         return req;
612 }
613
614 static void ipreallocated_done(struct tevent_req *subreq)
615 {
616         struct tevent_req *req = tevent_req_callback_data(
617                 subreq, struct tevent_req);
618         struct ipreallocated_state *state = tevent_req_data(
619                 req, struct ipreallocated_state);
620         int *err_list = NULL;
621         int ret, i;
622         bool status, found_errors;
623
624         status = ctdb_client_control_multi_recv(subreq, &ret, state,
625                                                 &err_list, NULL);
626         TALLOC_FREE(subreq);
627
628         if (status) {
629                 D_INFO("IPREALLOCATED succeeded on %d nodes\n", state->count);
630                 tevent_req_done(req);
631                 return;
632         }
633
634         /* Get some clear error messages out of err_list and count
635          * banning credits
636          */
637         found_errors = false;
638         for (i = 0; i < state->count; i++) {
639                 int err = err_list[i];
640                 if (err != 0) {
641                         uint32_t pnn = state->pnns[i];
642
643                         D_ERR("IPREALLOCATED failed on node %u, ret=%d\n",
644                               pnn, err);
645
646                         state->ban_credits[pnn]++;
647                         found_errors = true;
648                 }
649         }
650
651         if (! found_errors) {
652                 D_ERR("IPREALLOCATED internal error, ret=%d\n", ret);
653         }
654
655         tevent_req_error(req, ret);
656 }
657
/*
 * Collect the outcome of an ipreallocated request.  Returns true on
 * success; on failure returns false and reports the error via perr.
 */
static bool ipreallocated_recv(struct tevent_req *req, int *perr)
{
        bool ok = generic_recv(req, perr);

        return ok;
}
662
663 /**********************************************************************/
664
/*
 * Recalculate the allocation of public IPs to nodes and have the
 * nodes host their allocated addresses.
 *
 * - Get tunables
 * - Get nodemap
 * - Initialise IP allocation state.  Pass:
 *   + algorithm to be used;
 *   + various tunables (NoIPTakeover, NoIPFailback)
 *   + list of nodes to force rebalance (internal structure, currently
 *     no way to fetch, only used by LCP2 for nodes that have had new
 *     IP addresses added).
 * - Set IP flags for IP allocation based on node map
 * - Retrieve known and available IP addresses (done separately so
 *   values can be faked in unit testing)
 * - Use ipalloc_set_public_ips() to set known and available IP
 *   addresses for allocation
 * - If cluster can't host IP addresses then jump to IPREALLOCATED
 * - Run IP allocation algorithm
 * - Send RELEASE_IP to all nodes for IPs they should not host
 * - Send TAKEOVER_IP to all nodes for IPs they should host
 * - Send IPREALLOCATED to all nodes
 */

struct takeover_state {
        /* Event context and client connection used for every step */
        struct tevent_context *ev;
        struct ctdb_client_context *client;
        /* Timeout for the later steps; set once the nodemap is known */
        struct timeval timeout;
        /* Number of nodes in the nodemap */
        unsigned int num_nodes;
        /* Connected nodes, from list_of_connected_nodes() */
        uint32_t *pnns_connected;
        int num_connected;
        /* Active nodes, from list_of_active_nodes() */
        uint32_t *pnns_active;
        int num_active;
        /* PNN of the client's node; target of the initial controls */
        uint32_t destnode;
        /* Passed through to ipalloc_state_init() */
        uint32_t *force_rebalance_nodes;
        /* Tunables fetched with GET_ALL_TUNABLES */
        struct ctdb_tunable_list *tun_list;
        /* IP allocation state; not created if IP failover is disabled */
        struct ipalloc_state *ipalloc_state;
        /* Known public IPs, as returned by get_public_ips_recv() */
        struct ctdb_public_ip_list *known_ips;
        /* NOTE(review): presumably the list handed to the RELEASE_IP
         * and TAKEOVER_IP steps - confirm against the later callbacks
         */
        struct public_ip_list *all_ips;
        /* Per-node failure counters, one slot per node */
        uint32_t *ban_credits;
};

/* Callbacks, in the order the steps above are carried out */
static void takeover_tunables_done(struct tevent_req *subreq);
static void takeover_nodemap_done(struct tevent_req *subreq);
static void takeover_known_ips_done(struct tevent_req *subreq);
static void takeover_avail_ips_done(struct tevent_req *subreq);
static void takeover_release_ip_done(struct tevent_req *subreq);
static void takeover_take_ip_done(struct tevent_req *subreq);
static void takeover_ipreallocated(struct tevent_req *req);
static void takeover_ipreallocated_done(struct tevent_req *subreq);
static void takeover_failed(struct tevent_req *subreq, int ret);
static void takeover_failed_done(struct tevent_req *subreq);
717
718 static struct tevent_req *takeover_send(TALLOC_CTX *mem_ctx,
719                                         struct tevent_context *ev,
720                                         struct ctdb_client_context *client,
721                                         uint32_t *force_rebalance_nodes)
722 {
723         struct tevent_req *req, *subreq;
724         struct takeover_state *state;
725         struct ctdb_req_control request;
726
727         req = tevent_req_create(mem_ctx, &state, struct takeover_state);
728         if (req == NULL) {
729                 return NULL;
730         }
731
732         state->ev = ev;
733         state->client = client;
734         state->force_rebalance_nodes = force_rebalance_nodes;
735         state->destnode = ctdb_client_pnn(client);
736
737         ctdb_req_control_get_all_tunables(&request);
738         subreq = ctdb_client_control_send(state, state->ev, state->client,
739                                           state->destnode, TIMEOUT(),
740                                           &request);
741         if (tevent_req_nomem(subreq, req)) {
742                 return tevent_req_post(req, ev);
743         }
744         tevent_req_set_callback(subreq, takeover_tunables_done, req);
745
746         return req;
747 }
748
749 static void takeover_tunables_done(struct tevent_req *subreq)
750 {
751         struct tevent_req *req = tevent_req_callback_data(
752                 subreq, struct tevent_req);
753         struct takeover_state *state = tevent_req_data(
754                 req, struct takeover_state);
755         struct ctdb_reply_control *reply;
756         struct ctdb_req_control request;
757         int ret;
758         bool status;
759
760         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
761         TALLOC_FREE(subreq);
762         if (! status) {
763                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
764                 tevent_req_error(req, ret);
765                 return;
766         }
767
768         ret = ctdb_reply_control_get_all_tunables(reply, state,
769                                                   &state->tun_list);
770         if (ret != 0) {
771                 D_ERR("control GET_ALL_TUNABLES failed, ret=%d\n", ret);
772                 tevent_req_error(req, ret);
773                 return;
774         }
775
776         talloc_free(reply);
777
778         takeover_timeout = state->tun_list->takeover_timeout;
779
780         ctdb_req_control_get_nodemap(&request);
781         subreq = ctdb_client_control_send(state, state->ev, state->client,
782                                           state->destnode, TIMEOUT(),
783                                           &request);
784         if (tevent_req_nomem(subreq, req)) {
785                 return;
786         }
787         tevent_req_set_callback(subreq, takeover_nodemap_done, req);
788 }
789
790 static void takeover_nodemap_done(struct tevent_req *subreq)
791 {
792         struct tevent_req *req = tevent_req_callback_data(
793                 subreq, struct tevent_req);
794         struct takeover_state *state = tevent_req_data(
795                 req, struct takeover_state);
796         struct ctdb_reply_control *reply;
797         bool status;
798         int ret;
799         struct ctdb_node_map *nodemap;
800         const char *ptr;
801
802         status = ctdb_client_control_recv(subreq, &ret, state, &reply);
803         TALLOC_FREE(subreq);
804         if (! status) {
805                 D_ERR("control GET_NODEMAP failed to node %u, ret=%d\n",
806                         state->destnode, ret);
807                 tevent_req_error(req, ret);
808                 return;
809         }
810
811         ret = ctdb_reply_control_get_nodemap(reply, state, &nodemap);
812         if (ret != 0) {
813                 D_ERR("control GET_NODEMAP failed, ret=%d\n", ret);
814                 tevent_req_error(req, ret);
815                 return;
816         }
817
818         state->num_nodes = nodemap->num;
819
820         state->num_connected = list_of_connected_nodes(nodemap,
821                                                        CTDB_UNKNOWN_PNN, state,
822                                                        &state->pnns_connected);
823         if (state->num_connected <= 0) {
824                 tevent_req_error(req, ENOMEM);
825                 return;
826         }
827
828         state->num_active = list_of_active_nodes(nodemap,
829                                                  CTDB_UNKNOWN_PNN, state,
830                                                  &state->pnns_active);
831         if (state->num_active <= 0) {
832                 tevent_req_error(req, ENOMEM);
833                 return;
834         }
835
836         /* Default timeout for early jump to IPREALLOCATED.  See below
837          * for explanation of 3 times...
838          */
839         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
840
841         state->ban_credits = talloc_zero_array(state, uint32_t,
842                                                state->num_nodes);
843         if (tevent_req_nomem(state->ban_credits, req)) {
844                 return;
845         }
846
847         ptr = getenv("CTDB_DISABLE_IP_FAILOVER");
848         if (ptr != NULL) {
849                 /* IP failover is completely disabled so just send out
850                  * ipreallocated event.
851                  */
852                 takeover_ipreallocated(req);
853                 return;
854         }
855
856         state->ipalloc_state =
857                 ipalloc_state_init(
858                         state, state->num_nodes,
859                         determine_algorithm(state->tun_list),
860                         (state->tun_list->no_ip_takeover != 0),
861                         (state->tun_list->no_ip_failback != 0),
862                         state->force_rebalance_nodes);
863         if (tevent_req_nomem(state->ipalloc_state, req)) {
864                 return;
865         }
866
867         subreq = get_public_ips_send(state, state->ev, state->client,
868                                      state->pnns_connected, state->num_connected,
869                                      state->num_nodes, state->ban_credits,
870                                      false);
871         if (tevent_req_nomem(subreq, req)) {
872                 return;
873         }
874
875         tevent_req_set_callback(subreq, takeover_known_ips_done, req);
876 }
877
878 static void takeover_known_ips_done(struct tevent_req *subreq)
879 {
880         struct tevent_req *req = tevent_req_callback_data(
881                 subreq, struct tevent_req);
882         struct takeover_state *state = tevent_req_data(
883                 req, struct takeover_state);
884         int ret;
885         bool status;
886         uint32_t *pnns = NULL;
887         int count, i;
888
889         status = get_public_ips_recv(subreq, &ret, state, &state->known_ips);
890         TALLOC_FREE(subreq);
891
892         if (! status) {
893                 D_ERR("Failed to fetch known public IPs\n");
894                 takeover_failed(req, ret);
895                 return;
896         }
897
898         /* Get available IPs from active nodes that actually have known IPs */
899
900         pnns = talloc_zero_array(state, uint32_t, state->num_active);
901         if (tevent_req_nomem(pnns, req)) {
902                 return;
903         }
904
905         count = 0;
906         for (i = 0; i < state->num_active; i++) {
907                 uint32_t pnn = state->pnns_active[i];
908
909                 /* If pnn has IPs then fetch available IPs from it */
910                 if (state->known_ips[pnn].num > 0) {
911                         pnns[count] = pnn;
912                         count++;
913                 }
914         }
915
916         subreq = get_public_ips_send(state, state->ev, state->client,
917                                      pnns, count,
918                                      state->num_nodes, state->ban_credits,
919                                      true);
920         if (tevent_req_nomem(subreq, req)) {
921                 return;
922         }
923
924         tevent_req_set_callback(subreq, takeover_avail_ips_done, req);
925 }
926
927 static void takeover_avail_ips_done(struct tevent_req *subreq)
928 {
929         struct tevent_req *req = tevent_req_callback_data(
930                 subreq, struct tevent_req);
931         struct takeover_state *state = tevent_req_data(
932                 req, struct takeover_state);
933         bool status;
934         int ret;
935         struct ctdb_public_ip_list *available_ips;
936
937         status = get_public_ips_recv(subreq, &ret, state, &available_ips);
938         TALLOC_FREE(subreq);
939
940         if (! status) {
941                 D_ERR("Failed to fetch available public IPs\n");
942                 takeover_failed(req, ret);
943                 return;
944         }
945
946         ipalloc_set_public_ips(state->ipalloc_state,
947                                state->known_ips, available_ips);
948
949         if (! ipalloc_can_host_ips(state->ipalloc_state)) {
950                 D_NOTICE("No nodes available to host public IPs yet\n");
951                 takeover_ipreallocated(req);
952                 return;
953         }
954
955         /* Do the IP reassignment calculations */
956         state->all_ips = ipalloc(state->ipalloc_state);
957         if (tevent_req_nomem(state->all_ips, req)) {
958                 return;
959         }
960
961         /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
962          * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
963          * seconds.  However, RELEASE_IP can take longer due to TCP
964          * connection killing, so sometimes needs more time.
965          * Therefore, use a cumulative timeout of TakeoverTimeout * 3
966          * seconds across all 3 stages.  No explicit expiry checks are
967          * needed before each stage because tevent is smart enough to
968          * fire the timeouts even if they are in the past.  Initialise
969          * this here so it explicitly covers the stages we're
970          * interested in but, in particular, not the time taken by the
971          * ipalloc().
972          */
973         state->timeout = timeval_current_ofs(3 * takeover_timeout, 0);
974
975         subreq = release_ip_send(state, state->ev, state->client,
976                                  state->pnns_connected, state->num_connected,
977                                  state->timeout, state->all_ips,
978                                  state->ban_credits);
979         if (tevent_req_nomem(subreq, req)) {
980                 return;
981         }
982         tevent_req_set_callback(subreq, takeover_release_ip_done, req);
983 }
984
985 static void takeover_release_ip_done(struct tevent_req *subreq)
986 {
987         struct tevent_req *req = tevent_req_callback_data(
988                 subreq, struct tevent_req);
989         struct takeover_state *state = tevent_req_data(
990                 req, struct takeover_state);
991         int ret;
992         bool status;
993
994         status = release_ip_recv(subreq, &ret);
995         TALLOC_FREE(subreq);
996
997         if (! status) {
998                 takeover_failed(req, ret);
999                 return;
1000         }
1001
1002         /* All released, now for takeovers */
1003
1004         subreq = take_ip_send(state, state->ev, state->client,
1005                               state->timeout, state->all_ips,
1006                               state->ban_credits);
1007         if (tevent_req_nomem(subreq, req)) {
1008                 return;
1009         }
1010         tevent_req_set_callback(subreq, takeover_take_ip_done, req);
1011 }
1012
1013 static void takeover_take_ip_done(struct tevent_req *subreq)
1014 {
1015         struct tevent_req *req = tevent_req_callback_data(
1016                 subreq, struct tevent_req);
1017         int ret = 0;
1018         bool status;
1019
1020         status = take_ip_recv(subreq, &ret);
1021         TALLOC_FREE(subreq);
1022
1023         if (! status) {
1024                 takeover_failed(req, ret);
1025                 return;
1026         }
1027
1028         takeover_ipreallocated(req);
1029 }
1030
1031 static void takeover_ipreallocated(struct tevent_req *req)
1032 {
1033         struct takeover_state *state = tevent_req_data(
1034                 req, struct takeover_state);
1035         struct tevent_req *subreq;
1036
1037         subreq = ipreallocated_send(state, state->ev, state->client,
1038                                     state->pnns_connected,
1039                                     state->num_connected,
1040                                     state->timeout,
1041                                     state->ban_credits);
1042         if (tevent_req_nomem(subreq, req)) {
1043                 return;
1044         }
1045         tevent_req_set_callback(subreq, takeover_ipreallocated_done, req);
1046 }
1047
1048 static void takeover_ipreallocated_done(struct tevent_req *subreq)
1049 {
1050         struct tevent_req *req = tevent_req_callback_data(
1051                 subreq, struct tevent_req);
1052         int ret;
1053         bool status;
1054
1055         status = ipreallocated_recv(subreq, &ret);
1056         TALLOC_FREE(subreq);
1057
1058         if (! status) {
1059                 takeover_failed(req, ret);
1060                 return;
1061         }
1062
1063         tevent_req_done(req);
1064 }
1065
/*
 * Context kept while takeover_failed() waits for its banning message
 * to complete, so that takeover_failed_done() can then fail the
 * original request with the original error.
 */
struct takeover_failed_state {
        /* Takeover request that is to be failed */
        struct tevent_req *req;
        /* Error code to report on req */
        int ret;
};
1070
1071 void takeover_failed(struct tevent_req *req, int ret)
1072 {
1073         struct takeover_state *state = tevent_req_data(
1074                 req, struct takeover_state);
1075         struct tevent_req *subreq;
1076         uint32_t max_pnn = CTDB_UNKNOWN_PNN;
1077         unsigned int max_credits = 0;
1078         uint32_t pnn;
1079
1080         /* Check that bans are enabled */
1081         if (state->tun_list->enable_bans == 0) {
1082                 tevent_req_error(req, ret);
1083                 return;
1084         }
1085
1086         for (pnn = 0; pnn < state->num_nodes; pnn++) {
1087                 if (state->ban_credits[pnn] > max_credits) {
1088                         max_pnn = pnn;
1089                         max_credits = state->ban_credits[pnn];
1090                 }
1091         }
1092
1093         if (max_credits > 0) {
1094                 struct ctdb_req_message message;
1095                 struct takeover_failed_state *substate;
1096
1097                 D_WARNING("Assigning banning credits to node %u\n", max_pnn);
1098
1099                 substate = talloc_zero(state, struct takeover_failed_state);
1100                 if (tevent_req_nomem(substate, req)) {
1101                         return;
1102                 }
1103                 substate->req = req;
1104                 substate->ret = ret;
1105
1106                 message.srvid = CTDB_SRVID_BANNING;
1107                 message.data.pnn = max_pnn;
1108
1109                 subreq = ctdb_client_message_send(
1110                         state, state->ev, state->client,
1111                         ctdb_client_pnn(state->client),
1112                         &message);
1113                 if (subreq == NULL) {
1114                         D_ERR("failed to assign banning credits\n");
1115                         tevent_req_error(req, ret);
1116                         return;
1117                 }
1118                 tevent_req_set_callback(subreq, takeover_failed_done, substate);
1119         } else {
1120                 tevent_req_error(req, ret);
1121         }
1122 }
1123
1124 static void takeover_failed_done(struct tevent_req *subreq)
1125 {
1126         struct takeover_failed_state *substate = tevent_req_callback_data(
1127                 subreq, struct takeover_failed_state);
1128         struct tevent_req *req = substate->req;
1129         int ret;
1130         bool status;
1131
1132         status = ctdb_client_message_recv(subreq, &ret);
1133         TALLOC_FREE(subreq);
1134         if (! status) {
1135                 D_ERR("failed to assign banning credits, ret=%d\n", ret);
1136         }
1137
1138         ret = substate->ret;
1139         talloc_free(substate);
1140         tevent_req_error(req, ret);
1141 }
1142
/*
 * Collect the result of a takeover request.
 *
 * Thin wrapper around generic_recv(), whose boolean result is
 * discarded; any error code is passed back via perr.
 */
static void takeover_recv(struct tevent_req *req, int *perr)
{
        (void) generic_recv(req, perr);
}
1147
1148 static uint32_t *parse_node_list(TALLOC_CTX *mem_ctx, const char* s)
1149 {
1150         char *strv = NULL;
1151         int num, i, ret;
1152         char *t;
1153         uint32_t *nodes;
1154
1155         ret = strv_split(mem_ctx, &strv, s, ",");
1156         if (ret != 0) {
1157                 D_ERR("out of memory\n");
1158                 return NULL;
1159         }
1160
1161         num = strv_count(strv);
1162
1163         nodes = talloc_array(mem_ctx, uint32_t, num);
1164         if (nodes == NULL) {
1165                 D_ERR("out of memory\n");
1166                 return NULL;
1167         }
1168
1169         t = NULL;
1170         for (i = 0; i < num; i++) {
1171                 t = strv_next(strv, t);
1172                 nodes[i] = atoi(t);
1173         }
1174
1175         return nodes;
1176 }
1177
/*
 * Print a one-line usage summary for the helper to stderr.
 *
 * progname: program name to show in the summary
 */
static void usage(const char *progname)
{
        static const char fmt[] =
                "\nUsage: %s <output-fd> <ctdb-socket-path> "
                "[<force-rebalance-nodes>]\n";

        fprintf(stderr, fmt, progname);
}
1185
1186 /*
1187  * Arguments - write fd, socket path
1188  */
1189 int main(int argc, const char *argv[])
1190 {
1191         int write_fd;
1192         const char *sockpath;
1193         TALLOC_CTX *mem_ctx;
1194         struct tevent_context *ev;
1195         struct ctdb_client_context *client;
1196         bool status;
1197         int ret;
1198         struct tevent_req *req;
1199         uint32_t *force_rebalance_nodes = NULL;
1200
1201         if (argc < 3 || argc > 4) {
1202                 usage(argv[0]);
1203                 exit(1);
1204         }
1205
1206         write_fd = atoi(argv[1]);
1207         sockpath = argv[2];
1208
1209         mem_ctx = talloc_new(NULL);
1210         if (mem_ctx == NULL) {
1211                 fprintf(stderr, "talloc_new() failed\n");
1212                 ret = ENOMEM;
1213                 goto done;
1214         }
1215
1216         if (argc == 4) {
1217                 force_rebalance_nodes = parse_node_list(mem_ctx, argv[3]);
1218                 if (force_rebalance_nodes == NULL) {
1219                         usage(argv[0]);
1220                         ret = EINVAL;
1221                         goto done;
1222                 }
1223         }
1224
1225         ret = logging_init(mem_ctx, NULL, NULL, "ctdb-takeover");
1226         if (ret != 0) {
1227                 fprintf(stderr,
1228                         "ctdb-takeover: Unable to initialize logging\n");
1229                 goto done;
1230         }
1231
1232         ev = tevent_context_init(mem_ctx);
1233         if (ev == NULL) {
1234                 D_ERR("tevent_context_init() failed\n");
1235                 ret = ENOMEM;
1236                 goto done;
1237         }
1238
1239         status = logging_setup_sighup_handler(ev, mem_ctx, NULL, NULL);
1240         if (!status) {
1241                 D_ERR("logging_setup_sighup_handler() failed\n");
1242                 ret = ENOMEM;
1243                 goto done;
1244         }
1245
1246         ret = ctdb_client_init(mem_ctx, ev, sockpath, &client);
1247         if (ret != 0) {
1248                 D_ERR("ctdb_client_init() failed, ret=%d\n", ret);
1249                 goto done;
1250         }
1251
1252         req = takeover_send(mem_ctx, ev, client, force_rebalance_nodes);
1253         if (req == NULL) {
1254                 D_ERR("takeover_send() failed\n");
1255                 ret = 1;
1256                 goto done;
1257         }
1258
1259         if (! tevent_req_poll(req, ev)) {
1260                 D_ERR("tevent_req_poll() failed\n");
1261                 ret = 1;
1262                 goto done;
1263         }
1264
1265         takeover_recv(req, &ret);
1266         TALLOC_FREE(req);
1267         if (ret != 0) {
1268                 D_ERR("takeover run failed, ret=%d\n", ret);
1269         }
1270
1271 done:
1272         sys_write_v(write_fd, &ret, sizeof(ret));
1273
1274         talloc_free(mem_ctx);
1275         return ret;
1276 }