4 Copyright (C) Andrew Tridgell 2007
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "lib/util/dlinklist.h"
32 lock all databases - mark only
/*
 * Mark-only variant of "lock all databases": walks ctdb->db_list and calls
 * tdb_lockall_mark() on every database whose priority equals `priority`.
 *
 * Before touching any database it checks that `priority` lies within
 * 1..NUM_DB_PRIORITIES and that ctdb->freeze_mode[priority] is
 * CTDB_FREEZE_FROZEN, logging an error when either check fails.
 *
 * Callers in this file treat a non-zero result as failure.
 * NOTE(review): the braces, `continue` and `return` statements are not
 * visible in this excerpt - confirm the exact return values in the full file.
 */
34 static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
36 struct ctdb_db_context *ctdb_db;
38 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
39 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
43 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
44 DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));
47 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
48 if (ctdb_db->priority != priority) {
51 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
59 lock all databases - unmark only
61 static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
63 struct ctdb_db_context *ctdb_db;
65 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
66 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
70 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
71 DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));
74 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
75 if (ctdb_db->priority != priority) {
78 if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
/*
 * Control handler: return the current VNN map in wire format.
 *
 * The request must carry no payload (CHECK_CONTROL_DATA_SIZE(0)). The reply
 * is a struct ctdb_vnn_map_wire allocated as a talloc child of `outdata`,
 * sized offsetof(struct ctdb_vnn_map_wire, map) plus one uint32_t per entry
 * of ctdb->vnn_map. Generation, size and the map array are copied from
 * ctdb->vnn_map, and the buffer is handed back through outdata->dptr and
 * outdata->dsize.
 *
 * NOTE(review): the return type, the declaration of `len` and the return
 * statement are not visible in this excerpt; CTDB_NO_MEMORY presumably
 * bails out on allocation failure - confirm against the macro definition.
 */
87 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
89 CHECK_CONTROL_DATA_SIZE(0);
90 struct ctdb_vnn_map_wire *map;
93 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
94 map = talloc_size(outdata, len);
95 CTDB_NO_MEMORY(ctdb, map);
97 map->generation = ctdb->vnn_map->generation;
98 map->size = ctdb->vnn_map->size;
99 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
101 outdata->dsize = len;
102 outdata->dptr = (uint8_t *)map;
108 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
110 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
113 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
114 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
115 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
120 talloc_free(ctdb->vnn_map);
122 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
123 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
125 ctdb->vnn_map->generation = map->generation;
126 ctdb->vnn_map->size = map->size;
127 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
128 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
130 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
/*
 * Control handler: build a struct ctdb_dbid_map describing the databases on
 * ctdb->db_list.
 *
 * The request must carry no payload (CHECK_CONTROL_DATA_SIZE(0)). The list
 * is walked twice: once before sizing the reply, and once to fill in one
 * entry per database with its db_id and `persistent` flag. The reply buffer
 * is a zero-initialised talloc child of `outdata` sized
 * offsetof(struct ctdb_dbid_map, dbs) plus `len` entries; an allocation
 * failure is logged at DEBUG_ALERT.
 *
 * NOTE(review): the body of the first loop, the declarations of `i` and
 * `len`, the store of the entry count and the return statements are not
 * visible in this excerpt - presumably `len` counts the databases; confirm.
 */
136 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
139 struct ctdb_db_context *ctdb_db;
140 struct ctdb_dbid_map *dbid_map;
142 CHECK_CONTROL_DATA_SIZE(0);
145 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
150 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
151 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
152 if (!outdata->dptr) {
153 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
157 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
159 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
160 dbid_map->dbs[i].dbid = ctdb_db->db_id;
161 dbid_map->dbs[i].persistent = ctdb_db->persistent;
/*
 * Control handler: return the node map (address, pnn and flags for each of
 * the ctdb->num_nodes nodes) as a struct ctdb_node_map.
 *
 * The request must carry no payload (CHECK_CONTROL_DATA_SIZE(0)). The reply
 * is a zero-initialised talloc child of `outdata` sized for num_nodes
 * struct ctdb_node_and_flags entries. For each node the textual address is
 * converted with parse_ip() (interface argument is NULL, see the TODO) into
 * nodes[i].addr; a zero result from parse_ip() is logged as a parse failure.
 * pnn and flags are copied from ctdb->nodes[i].
 *
 * NOTE(review): one parse_ip() argument line, the return type and all
 * return statements are not visible in this excerpt.
 */
168 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
170 uint32_t i, num_nodes;
171 struct ctdb_node_map *node_map;
173 CHECK_CONTROL_DATA_SIZE(0);
175 num_nodes = ctdb->num_nodes;
177 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
178 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
179 if (!outdata->dptr) {
180 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
184 node_map = (struct ctdb_node_map *)outdata->dptr;
185 node_map->num = num_nodes;
186 for (i=0; i<num_nodes; i++) {
187 if (parse_ip(ctdb->nodes[i]->address.address,
188 NULL, /* TODO: pass in the correct interface here*/
190 &node_map->nodes[i].addr) == 0)
192 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
195 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
196 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
203 get an old style ipv4-only nodemap
/*
 * Control handler: IPv4-only form of the node map, returned as a
 * struct ctdb_node_mapv4.
 *
 * Same shape as ctdb_control_getnodemap(): empty request payload, a
 * zero-initialised reply allocated under `outdata` with one
 * struct ctdb_node_and_flagsv4 per node, then pnn and flags copied from
 * ctdb->nodes[i]. Addresses are converted with parse_ipv4() (port argument
 * 0) into nodes[i].sin; a zero result is logged as a parse failure.
 *
 * NOTE(review): the return type and all return statements are not visible
 * in this excerpt.
 */
206 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
208 uint32_t i, num_nodes;
209 struct ctdb_node_mapv4 *node_map;
211 CHECK_CONTROL_DATA_SIZE(0);
213 num_nodes = ctdb->num_nodes;
215 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
216 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
217 if (!outdata->dptr) {
218 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
222 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
223 node_map->num = num_nodes;
224 for (i=0; i<num_nodes; i++) {
225 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
226 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
230 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
231 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
/*
 * Timed-event handler that re-reads the nodes file.
 *
 * The previous ctdb->nodes array is re-parented onto a temporary talloc
 * context and the previous node count is remembered, then
 * ctdb_load_nodes_file() builds the new list. For each slot of the new list:
 *   - if the same index existed before and has the same address, the
 *     freshly loaded node is freed and the old node object is moved back
 *     under ctdb->nodes in its place;
 *   - NODE_FLAGS_DELETED is tested on the node;
 *   - otherwise the transport's add_node() and connect_node() methods are
 *     called, and a failure of either leads to ctdb_fatal().
 * Finally a CTDB_SRVID_RELOAD_NODES message is sent to ctdb->pnn and the
 * temporary context (with whatever old nodes remain on it) is freed.
 *
 * NOTE(review): the assignment of `nodes`, the declarations of `i`,
 * `num_nodes` and `tmp_ctx`, and the `continue` statements are not visible
 * in this excerpt.
 */
238 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
239 struct timeval t, void *private_data)
242 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
244 struct ctdb_node **nodes;
246 tmp_ctx = talloc_new(ctdb);
248 /* steal the old nodes file for a while */
249 talloc_steal(tmp_ctx, ctdb->nodes);
252 num_nodes = ctdb->num_nodes;
255 /* load the new nodes file */
256 ctdb_load_nodes_file(ctdb);
258 for (i=0; i<ctdb->num_nodes; i++) {
259 /* keep any identical pre-existing nodes and connections */
260 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
261 talloc_free(ctdb->nodes[i]);
262 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
266 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
270 /* any new or different nodes must be added */
271 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
272 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
273 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
275 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
276 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
277 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
281 /* tell the recovery daemon to reload the nodes file too */
282 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
284 talloc_free(tmp_ctx);
289 reload the nodes file after a short delay (so that we can send the response back first)
/*
 * Control handler: schedule ctdb_reload_nodes_event() on ctdb->ev to fire
 * one second from now (timeval_current_ofs(1,0)), with the ctdb context as
 * both talloc parent and callback argument. The reload itself happens in
 * that event, not here.
 *
 * NOTE(review): the return type and return statement are not visible in
 * this excerpt.
 */
293 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
295 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
301 a traverse function for pulling all relevant records from pulldb
/*
 * Members of struct pulldb_data, the state handed to traverse_pulldb():
 * the daemon context and the marshall buffer being filled.
 * NOTE(review): traverse_pulldb() also reads and writes `len` and `failed`
 * members; their declarations and the struct's opening and closing lines
 * are not visible in this excerpt.
 */
304 struct ctdb_context *ctdb;
305 struct ctdb_marshall_buffer *pulldata;
310 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
312 struct pulldb_data *params = (struct pulldb_data *)p;
313 struct ctdb_rec_data *rec;
315 /* add the record to the blob */
316 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
318 params->failed = true;
321 params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
322 if (params->pulldata == NULL) {
323 DEBUG(DEBUG_ERR,(__location__ " Failed to expand pulldb_data to %u (%u records)\n",
324 rec->length + params->len, params->pulldata->count));
325 params->failed = true;
328 params->pulldata->count++;
329 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
330 params->len += rec->length;
337 pull a bunch of records from a ltdb, filtering by lmaster
339 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
341 struct ctdb_control_pulldb *pull;
342 struct ctdb_db_context *ctdb_db;
343 struct pulldb_data params;
344 struct ctdb_marshall_buffer *reply;
346 pull = (struct ctdb_control_pulldb *)indata.dptr;
348 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
350 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
354 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
355 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
359 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
360 CTDB_NO_MEMORY(ctdb, reply);
362 reply->db_id = pull->db_id;
365 params.pulldata = reply;
366 params.len = offsetof(struct ctdb_marshall_buffer, data);
367 params.failed = false;
369 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
370 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
374 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
375 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
376 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
377 talloc_free(params.pulldata);
381 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
383 outdata->dptr = (uint8_t *)params.pulldata;
384 outdata->dsize = params.len;
390 push a bunch of records into a ltdb, filtering by rsn
/*
 * Control handler: store a marshalled batch of records into a local tdb.
 *
 * `indata` is interpreted as a struct ctdb_marshall_buffer. It must be at
 * least as large as the buffer header, name a known database (db_id) and
 * target a priority level that is currently frozen. The all-record lock is
 * marked before the records are processed and unmarked afterwards; the two
 * unmark calls near the end are presumably the success and failure exits.
 *
 * For each of reply->count records the key and data are located inside the
 * record (key first, data following it), the data must be at least one
 * struct ctdb_ltdb_header long, the header is split off the front of the
 * data, and the record is written with ctdb_ltdb_store(). The cursor then
 * advances by rec->length bytes.
 *
 * NOTE(review): record offsets and lengths are taken from the payload and
 * are not checked against indata.dsize in the visible lines. The
 * declarations of `i`, `ret`, `key` and `data`, the branch targets after
 * the error logs and the return statements are not visible in this excerpt.
 */
392 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
394 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
395 struct ctdb_db_context *ctdb_db;
397 struct ctdb_rec_data *rec;
399 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
400 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
404 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
406 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
410 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
411 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
415 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
416 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
420 rec = (struct ctdb_rec_data *)&reply->data[0];
422 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
423 reply->count, reply->db_id));
425 for (i=0;i<reply->count;i++) {
427 struct ctdb_ltdb_header *hdr;
429 key.dptr = &rec->data[0];
430 key.dsize = rec->keylen;
431 data.dptr = &rec->data[key.dsize];
432 data.dsize = rec->datalen;
434 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
435 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
438 hdr = (struct ctdb_ltdb_header *)data.dptr;
439 data.dptr += sizeof(*hdr);
440 data.dsize -= sizeof(*hdr);
442 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
444 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
448 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
451 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
452 reply->count, reply->db_id));
454 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
458 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/*
 * tdb traverse callback used by ctdb_control_set_dmaster(): rewrite the
 * dmaster field of one record.
 *
 * `p` points at the new dmaster value. The record data is viewed as a
 * struct ctdb_ltdb_header; if its dmaster already equals the requested
 * value the record is skipped, otherwise the field is updated in the
 * fetched data and the record is written back with
 * tdb_store(..., TDB_REPLACE). A failed store is logged at DEBUG_CRIT.
 *
 * NOTE(review): data.dsize is not compared against the header size in the
 * visible lines. The return statements are not visible in this excerpt.
 */
463 static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
465 uint32_t *dmaster = (uint32_t *)p;
466 struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;
469 /* skip if already correct */
470 if (header->dmaster == *dmaster) {
474 header->dmaster = *dmaster;
476 ret = tdb_store(tdb, key, data, TDB_REPLACE);
478 DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));
482 /* TODO: add error checking here */
/*
 * Control handler: set the dmaster of every record in one database.
 *
 * `indata` is interpreted as a struct ctdb_control_set_dmaster giving the
 * database id and the new dmaster. The database must be known and its
 * priority level frozen. With the all-record lock marked, the database is
 * traversed with traverse_setdmaster(), then the lock is unmarked.
 *
 * The result of tdb_traverse() is not examined in the visible lines.
 * NOTE(review): the return statements are not visible in this excerpt.
 */
487 int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
489 struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
490 struct ctdb_db_context *ctdb_db;
492 ctdb_db = find_ctdb_db(ctdb, p->db_id);
494 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));
498 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
499 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));
503 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
504 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
508 tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);
510 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/*
 * Per-request state for an asynchronous set-recmode control: the daemon
 * context, the control request to reply to, the timeout and fd events
 * watching the child process, and the time the request started.
 * NOTE(review): code in this file also uses `recmode`, `fd[]` and `child`
 * members; their declarations are not visible in this excerpt.
 */
515 struct ctdb_set_recmode_state {
516 struct ctdb_context *ctdb;
517 struct ctdb_req_control *c;
520 struct timed_event *te;
521 struct fd_event *fde;
523 struct timeval start_time;
527 called if our set_recmode child times out. this would happen if
528 ctdb_recovery_lock() would block.
/*
 * Timed-event handler that fires when the set_recmode child has not
 * answered in time. The timeout is treated as success: the requested
 * recovery mode is applied to the daemon and the pending control is
 * answered with status 0 and no error message.
 * NOTE(review): any release of `state` after the reply is not visible in
 * this excerpt.
 */
530 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
531 struct timeval t, void *private_data)
533 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
534 struct ctdb_set_recmode_state);
536 /* we consider this a success, not a failure, as we failed to
537 set the recovery lock which is what we wanted. This can be
538 caused by the cluster filesystem being very slow to
539 arbitrate locks immediately after a node failure.
541 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
542 state->ctdb->recovery_mode = state->recmode;
543 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
548 /* when we free the recmode state we must kill any child process.
/*
 * talloc destructor for struct ctdb_set_recmode_state. Records the time
 * elapsed since state->start_time as "daemon reclock" latency via
 * ctdb_reclock_latency(), tests both pipe descriptors against -1, and
 * sends SIGKILL to the child process.
 * NOTE(review): what is done with each descriptor inside the two `if`
 * bodies, and the return value, are not visible in this excerpt.
 */
550 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
552 double l = timeval_elapsed(&state->start_time);
554 ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
556 if (state->fd[0] != -1) {
559 if (state->fd[1] != -1) {
562 kill(state->child, SIGKILL);
566 /* this is called when the client process has completed ctdb_recovery_lock()
567 and has written data back to us through the pipe.
/*
 * fd-event handler run when the set_recmode child has written its verdict
 * to the pipe. The pending timeout event is freed, one status byte is read
 * from state->fd[0], and:
 *   - if the read did not return exactly one byte, or the byte is non-zero,
 *     the control is answered with status -1 and the message
 *     "managed to lock reclock file from inside daemon";
 *   - otherwise the requested recovery mode is applied and the control is
 *     answered with status 0.
 * NOTE(review): the declarations of `c` and `ret`, and any release of
 * `state`, are not visible in this excerpt.
 */
569 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
570 uint16_t flags, void *private_data)
572 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
573 struct ctdb_set_recmode_state);
577 /* we got a response from our child process so we can abort the
580 talloc_free(state->te);
584 /* read the childs status when trying to lock the reclock file.
585 child wrote 0 if everything is fine and 1 if it did manage
586 to lock the file, which would be a problem since that means
587 we got a request to exit from recovery but we could still lock
588 the file which at this time SHOULD be locked by the recovery
589 daemon on the recmaster
591 ret = read(state->fd[0], &c, 1);
592 if (ret != 1 || c != 0) {
593 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
598 state->ctdb->recovery_mode = state->recmode;
600 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
/*
 * Timed-event handler armed by ctdb_control_set_recmode(): logs that the
 * node has been in recovery mode for too long, frees and clears
 * ctdb->release_ips_ctx, and releases all IP addresses with
 * ctdb_release_all_ips().
 */
606 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
607 struct timeval t, void *private_data)
609 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
611 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
612 talloc_free(ctdb->release_ips_ctx);
613 ctdb->release_ips_ctx = NULL;
615 ctdb_release_all_ips(ctdb);
619 set the recovery mode
/*
 * Control handler: set the daemon's recovery mode to the uint32_t carried
 * in `indata`.
 *
 * Visible steps, in order:
 *  1. release_ips_ctx is freed. For CTDB_RECOVERY_NORMAL it is left NULL;
 *     the other visible path creates a fresh context and arms
 *     ctdb_drop_all_ips_event() to fire after
 *     tunable.recovery_drop_all_ips seconds.
 *  2. A change of mode is logged.
 *  3. Unless this is a transition from ACTIVE to NORMAL, the mode is simply
 *     stored in ctdb->recovery_mode.
 *  4. When ending recovery, every priority level that holds a freeze handle
 *     is thawed with ctdb_control_thaw().
 *  5. A struct ctdb_set_recmode_state is allocated and time-stamped. If
 *     tunable.verify_recovery_lock is 0 the mode is stored without
 *     verification.
 *  6. Otherwise a pipe is created and a child is forked. The child calls
 *     ctdb_recovery_lock(ctdb, false), which is expected to fail, writes
 *     one status byte to the pipe, and keeps probing the parent with
 *     kill(parent, 0).
 *  7. The parent installs set_recmode_destructor(), a 5 second timeout
 *     (ctdb_set_recmode_timeout) and a read event on the pipe, records the
 *     requested mode and takes ownership of the control request `c`.
 *
 * NOTE(review): the `else` and early-return lines, the child's status
 * values and exit, the fd-event callback argument, the error paths and the
 * setting of *async_reply are not visible in this excerpt - confirm them
 * in the full file. The return values of the child's write() calls are not
 * checked in the visible lines.
 */
621 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
622 struct ctdb_req_control *c,
623 TDB_DATA indata, bool *async_reply,
624 const char **errormsg)
626 uint32_t recmode = *(uint32_t *)indata.dptr;
628 struct ctdb_set_recmode_state *state;
629 pid_t parent = getpid();
631 /* if we enter recovery but stay in recovery for too long
632 we will eventually drop all our ip addresses
634 if (recmode == CTDB_RECOVERY_NORMAL) {
635 talloc_free(ctdb->release_ips_ctx);
636 ctdb->release_ips_ctx = NULL;
638 talloc_free(ctdb->release_ips_ctx);
639 ctdb->release_ips_ctx = talloc_new(ctdb);
640 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
642 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
645 if (recmode != ctdb->recovery_mode) {
646 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
647 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
650 if (recmode != CTDB_RECOVERY_NORMAL ||
651 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
652 ctdb->recovery_mode = recmode;
656 /* some special handling when ending recovery mode */
658 /* force the databases to thaw */
659 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
660 if (ctdb->freeze_handles[i] != NULL) {
661 ctdb_control_thaw(ctdb, i);
665 state = talloc(ctdb, struct ctdb_set_recmode_state);
666 CTDB_NO_MEMORY(ctdb, state);
668 state->start_time = timeval_current();
672 if (ctdb->tunable.verify_recovery_lock == 0) {
673 /* dont need to verify the reclock file */
674 ctdb->recovery_mode = recmode;
678 /* For the rest of what needs to be done, we need to do this in
679 a child process since
680 1, the call to ctdb_recovery_lock() can block if the cluster
681 filesystem is in the process of recovery.
683 ret = pipe(state->fd);
686 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
690 state->child = fork();
691 if (state->child == (pid_t)-1) {
698 if (state->child == 0) {
702 /* we should not be able to get the lock on the reclock file,
703 as it should be held by the recovery master
705 if (ctdb_recovery_lock(ctdb, false)) {
706 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
710 write(state->fd[1], &cc, 1);
711 /* make sure we die when our parent dies */
712 while (kill(parent, 0) == 0 || errno != ESRCH) {
714 write(state->fd[1], &cc, 1);
721 talloc_set_destructor(state, set_recmode_destructor);
723 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
724 ctdb_set_recmode_timeout, state);
726 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
727 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
731 if (state->fde == NULL) {
737 state->recmode = recmode;
738 state->c = talloc_steal(state, c);
747 try and get the recovery lock in shared storage - should only work
748 on the recovery master recovery daemon. Anywhere else is a bug
/*
 * Try to take a write lock on the recovery lock file without blocking.
 *
 * Any descriptor already held in ctdb->recovery_lock_fd is closed first.
 * The file ctdb->recovery_lock_file is opened read/write (created with
 * mode 0600 if missing), marked close-on-exec, and an F_WRLCK is requested
 * with fcntl(F_SETLK). If the lock is refused the descriptor is closed and
 * recovery_lock_fd is reset to -1.
 *
 * The caller in this file treats a true result as "lock obtained".
 * NOTE(review): several closes and log calls are presumably conditional on
 * `keep`; those guards, the remaining `lock` fields and the return
 * statements are not visible in this excerpt.
 */
750 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
755 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
757 if (ctdb->recovery_lock_fd != -1) {
758 close(ctdb->recovery_lock_fd);
759 ctdb->recovery_lock_fd = -1;
762 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
763 if (ctdb->recovery_lock_fd == -1) {
764 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
765 ctdb->recovery_lock_file, strerror(errno)));
769 set_close_on_exec(ctdb->recovery_lock_fd);
771 lock.l_type = F_WRLCK;
772 lock.l_whence = SEEK_SET;
777 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
778 close(ctdb->recovery_lock_fd);
779 ctdb->recovery_lock_fd = -1;
781 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
787 close(ctdb->recovery_lock_fd);
788 ctdb->recovery_lock_fd = -1;
792 DEBUG(DEBUG_ERR, ("Recovery lock taken successfully\n"));
795 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
801 delete a record as part of the vacuum process
802 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
803 use non-blocking locks
805 return 0 if the record was successfully deleted (i.e. it does not exist
806 when the function returns)
807 or !0 if the record still exists in the tdb after returning.
/*
 * Vacuum helper: try to delete one record from the local tdb using only
 * non-blocking locks.
 *
 * `rec` carries the key followed by data that must be exactly one
 * struct ctdb_ltdb_header (the caller's view of the record). The visible
 * checks, in order:
 *   - this node is the lmaster for the key (logged);
 *   - the supplied data is not exactly header-sized (logged as bad size);
 *   - the chain lock cannot be taken without blocking;
 *   - the record is not present in the tdb (chain unlocked);
 *   - the stored record is shorter than a header: it is deleted as corrupt
 *     if the list -1 write lock (the freelist, per the comment below) can
 *     be taken without blocking;
 *   - the stored rsn is greater than the supplied rsn (skipped);
 *   - this node is the dmaster of the stored record (skipped).
 * Otherwise the list -1 write lock is taken non-blocking and the record is
 * deleted with tdb_delete(); both locks are dropped afterwards.
 *
 * The caller in this file treats a non-zero result as "record not deleted".
 * NOTE(review): the return statements, and any release of the buffer
 * returned by tdb_fetch(), are not visible in this excerpt.
 */
809 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
812 struct ctdb_ltdb_header *hdr, *hdr2;
814 /* these are really internal tdb functions - but we need them here for
815 non-blocking lock of the freelist */
816 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
817 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
820 key.dsize = rec->keylen;
821 key.dptr = &rec->data[0];
822 data.dsize = rec->datalen;
823 data.dptr = &rec->data[rec->keylen];
825 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
826 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
830 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
831 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
835 hdr = (struct ctdb_ltdb_header *)data.dptr;
837 /* use a non-blocking lock */
838 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
842 data = tdb_fetch(ctdb_db->ltdb->tdb, key);
843 if (data.dptr == NULL) {
844 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
848 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
849 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
850 tdb_delete(ctdb_db->ltdb->tdb, key);
851 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
852 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
854 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
859 hdr2 = (struct ctdb_ltdb_header *)data.dptr;
861 if (hdr2->rsn > hdr->rsn) {
862 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
863 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
864 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
869 if (hdr2->dmaster == ctdb->pnn) {
870 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
871 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
876 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
877 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
882 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
883 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
884 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
885 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
890 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
891 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/*
 * State carried from the start/end recovery controls to their event-script
 * callbacks: the control request that is answered once the script has
 * finished.
 */
898 struct recovery_callback_state {
899 struct ctdb_req_control *c;
904 called when the 'recovered' event script has finished
/*
 * Completion callback for the event script started by
 * ctdb_control_end_recovery(). Re-enables monitoring, logs a failure status
 * of the script, answers the pending control with the script's status, and
 * stamps ctdb->last_recovery_finished with the current time.
 * NOTE(review): the condition guarding the error log and any release of
 * `state` are not visible in this excerpt.
 */
906 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
908 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
910 ctdb_enable_monitoring(ctdb);
913 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
916 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
919 gettimeofday(&ctdb->last_recovery_finished, NULL);
923 recovery has finished
/*
 * Control handler: recovery has finished. Allocates a
 * recovery_callback_state that takes ownership of the control request,
 * disables monitoring, and launches an event script through
 * ctdb_event_script_callback() with a timeout of tunable.script_timeout
 * seconds and ctdb_end_recovery_callback() as completion handler. The
 * visible failure path re-enables monitoring and logs the error.
 * The reply to the control is sent asynchronously by the callback.
 * NOTE(review): the last parameter line, the script name argument, the
 * failure test and the return statements are not visible in this excerpt.
 */
925 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
926 struct ctdb_req_control *c,
930 struct recovery_callback_state *state;
932 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
934 state = talloc(ctdb, struct recovery_callback_state);
935 CTDB_NO_MEMORY(ctdb, state);
937 state->c = talloc_steal(state, c);
939 ctdb_disable_monitoring(ctdb);
941 ret = ctdb_event_script_callback(ctdb,
942 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
944 ctdb_end_recovery_callback,
948 ctdb_enable_monitoring(ctdb);
950 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
955 /* tell the control that we will reply asynchronously */
961 called when the 'startrecovery' event script has finished
/*
 * Completion callback for the "startrecovery" event script: logs a failure
 * status of the script and answers the pending control with that status.
 * Unlike ctdb_end_recovery_callback(), no call re-enabling monitoring is
 * visible here.
 * NOTE(review): the condition guarding the error log and any release of
 * `state` are not visible in this excerpt.
 */
963 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
965 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
968 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
971 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
976 run the startrecovery eventscript
/*
 * Control handler: run the "startrecovery" event script. Stamps
 * ctdb->last_recovery_started, allocates a recovery_callback_state that
 * takes ownership of the control request, disables monitoring, and starts
 * the script through ctdb_event_script_callback() with a timeout of
 * tunable.script_timeout seconds and ctdb_start_recovery_callback() as
 * completion handler. On success *async_reply is set to true so the reply
 * is sent by the callback.
 * NOTE(review): the last parameter line, the failure test, its cleanup and
 * the return statements are not visible in this excerpt. No call
 * re-enabling monitoring is visible on the failure path - confirm whether
 * that is intended.
 */
978 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
979 struct ctdb_req_control *c,
983 struct recovery_callback_state *state;
985 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
986 gettimeofday(&ctdb->last_recovery_started, NULL);
988 state = talloc(ctdb, struct recovery_callback_state);
989 CTDB_NO_MEMORY(ctdb, state);
991 state->c = talloc_steal(state, c);
993 ctdb_disable_monitoring(ctdb);
995 ret = ctdb_event_script_callback(ctdb,
996 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
998 ctdb_start_recovery_callback,
999 state, "startrecovery");
1002 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
1007 /* tell the control that we will reply asynchronously */
1008 *async_reply = true;
1013 try to delete all these records as part of the vacuuming process
1014 and return the records we failed to delete
/*
 * Control handler for vacuuming: try to delete each record listed in
 * `indata` and return the ones that could not be deleted.
 *
 * `indata` is interpreted as a struct ctdb_marshall_buffer; it must be at
 * least header-sized and name a known database. The reply starts as a
 * zeroed, header-only marshall buffer allocated under `outdata` with the
 * same db_id. For each of reply->count input records the data part must be
 * at least one struct ctdb_ltdb_header long; delete_tdb_record() is then
 * attempted, and on a non-zero result the whole input record (rec->length
 * bytes) is appended to the reply, which is grown with
 * talloc_realloc_size(). The final blob and its talloc size are returned
 * through `outdata`.
 *
 * NOTE(review): `hdr` and the adjusted `data` in the failure branch are not
 * used by any visible line. The update of the reply's record count, the
 * declarations of `i`, `key`, `data` and `old_size`, and the return
 * statements are not visible in this excerpt. Record lengths come from the
 * payload and are not checked against indata.dsize in the visible lines.
 */
1016 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1018 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1019 struct ctdb_db_context *ctdb_db;
1021 struct ctdb_rec_data *rec;
1022 struct ctdb_marshall_buffer *records;
1024 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1025 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1029 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1031 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1036 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1037 reply->count, reply->db_id));
1040 /* create a blob to send back the records we couldnt delete */
1041 records = (struct ctdb_marshall_buffer *)
1042 talloc_zero_size(outdata,
1043 offsetof(struct ctdb_marshall_buffer, data));
1044 if (records == NULL) {
1045 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1048 records->db_id = ctdb_db->db_id;
1051 rec = (struct ctdb_rec_data *)&reply->data[0];
1052 for (i=0;i<reply->count;i++) {
1055 key.dptr = &rec->data[0];
1056 key.dsize = rec->keylen;
1057 data.dptr = &rec->data[key.dsize];
1058 data.dsize = rec->datalen;
1060 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1061 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1065 /* If we cant delete the record we must add it to the reply
1066 so the lmaster knows it may not purge this record
1068 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1070 struct ctdb_ltdb_header *hdr;
1072 hdr = (struct ctdb_ltdb_header *)data.dptr;
1073 data.dptr += sizeof(*hdr);
1074 data.dsize -= sizeof(*hdr);
1076 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
1078 old_size = talloc_get_size(records);
1079 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1080 if (records == NULL) {
1081 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1085 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1088 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1092 outdata->dptr = (uint8_t *)records;
1093 outdata->dsize = talloc_get_size(records);
/*
 * Control handler: return this node's capability bitmask. A single uint32_t
 * is allocated under `outdata`, filled from ctdb->capabilities and handed
 * back through outdata->dptr / outdata->dsize.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1101 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1103 uint32_t *capabilities = NULL;
1105 capabilities = talloc(outdata, uint32_t);
1106 CTDB_NO_MEMORY(ctdb, capabilities);
1107 *capabilities = ctdb->capabilities;
1109 outdata->dsize = sizeof(uint32_t);
1110 outdata->dptr = (uint8_t *)capabilities;
/*
 * Timed-event handler fired when the recovery daemon has not pinged the
 * main daemon within tunable.recd_ping_timeout seconds.
 *
 * The missed-ping counter lives in ctdb->recd_ping_count. While it is below
 * tunable.recd_ping_failcount the timer is re-armed (as a talloc child of
 * the counter). Otherwise the daemon is shut down: the recovery daemon,
 * keepalives and monitoring are stopped, all IPs are released, the
 * transport's shutdown() method is called if a transport is set, and the
 * "shutdown" event script is run.
 *
 * NOTE(review): the counter increment, the early return after re-arming
 * and the final process exit are not visible in this excerpt.
 */
1115 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1117 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1118 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1120 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
1122 if (*count < ctdb->tunable.recd_ping_failcount) {
1124 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1125 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1126 ctdb_recd_ping_timeout, ctdb);
1130 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Shutting down ctdb daemon. (This can be caused if the cluster filesystem has hung)\n"));
1132 ctdb_stop_recoverd(ctdb);
1133 ctdb_stop_keepalive(ctdb);
1134 ctdb_stop_monitoring(ctdb);
1135 ctdb_release_all_ips(ctdb);
1136 if (ctdb->methods != NULL) {
1137 ctdb->methods->shutdown(ctdb);
1139 ctdb_event_script(ctdb, "shutdown");
1140 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Daemon has been shut down.\n"));
1144 /* The recovery daemon will ping us at regular intervals.
1145 If we havent been pinged for a while we assume the recovery
1146 daemon is inoperable and we shut down.
/*
 * Control handler: a ping from the recovery daemon. The previous counter
 * object is freed, which also frees any timer event allocated under it, and
 * a fresh zeroed counter is allocated. If tunable.recd_ping_timeout is
 * non-zero a new timeout is armed under the new counter, firing
 * ctdb_recd_ping_timeout() after that many seconds.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1148 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1150 talloc_free(ctdb->recd_ping_count);
1152 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1153 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1155 if (ctdb->tunable.recd_ping_timeout != 0) {
1156 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1157 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1158 ctdb_recd_ping_timeout, ctdb);
/*
 * Control handler: record which node is the recovery master. The payload
 * must be exactly one uint32_t (CHECK_CONTROL_DATA_SIZE), which is stored
 * in ctdb->recovery_master.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1166 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1168 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1170 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/*
 * State carried from ctdb_control_stop_node() to its event-script
 * callback: the control request that is answered once the script has
 * finished.
 */
1175 struct stop_node_callback_state {
1176 struct ctdb_req_control *c;
1180 called when the 'stopped' event script has finished
/*
 * Completion callback for the event script started by
 * ctdb_control_stop_node(). The visible failure handling logs the status
 * and clears NODE_FLAGS_STOPPED on this node again; the pending control is
 * then answered with the script's status.
 * NOTE(review): the condition guarding the failure handling and any
 * release of `state` are not visible in this excerpt.
 */
1182 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1184 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1187 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1188 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1191 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
/*
 * Control handler: stop this node. Allocates a stop_node_callback_state
 * that takes ownership of the control request, disables monitoring, and
 * launches an event script through ctdb_event_script_callback() with a
 * timeout of tunable.script_timeout seconds and ctdb_stop_node_callback()
 * as completion handler. The visible failure path re-enables monitoring and
 * logs the error. Otherwise NODE_FLAGS_STOPPED is set on this node and
 * *async_reply is set to true so the reply is sent by the callback.
 * NOTE(review): the script name argument, the failure test and the return
 * statements are not visible in this excerpt.
 */
1195 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1198 struct stop_node_callback_state *state;
1200 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1202 state = talloc(ctdb, struct stop_node_callback_state);
1203 CTDB_NO_MEMORY(ctdb, state);
1205 state->c = talloc_steal(state, c);
1207 ctdb_disable_monitoring(ctdb);
1209 ret = ctdb_event_script_callback(ctdb,
1210 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
1212 ctdb_stop_node_callback,
1216 ctdb_enable_monitoring(ctdb);
1218 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
1223 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1225 *async_reply = true;
/*
 * Control handler: continue a stopped node by clearing NODE_FLAGS_STOPPED
 * in this node's own entry of ctdb->nodes.
 * NOTE(review): the return statement is not visible in this excerpt.
 */
1230 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1232 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1233 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;