/* 
   ctdb recovery code

   Copyright (C) Andrew Tridgell  2007
   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.
   
   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.
   
   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "lib/tevent/tevent.h"
#include "lib/tdb/include/tdb.h"
#include "system/time.h"
#include "system/network.h"
#include "system/filesys.h"
#include "system/wait.h"
#include "../include/ctdb_private.h"
#include "lib/util/dlinklist.h"
32 lock all databases - mark only
34 static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
36 struct ctdb_db_context *ctdb_db;
38 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
39 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
43 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
44 DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));
47 /* The dual loop is a woraround for older versions of samba
48 that does not yet support the set-db-priority/lock order
49 call. So that we get basic deadlock avoiidance also for
50 these old versions of samba.
51 This code will be removed in the future.
53 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
54 if (ctdb_db->priority != priority) {
57 if (strstr(ctdb_db->db_name, "notify") != NULL) {
60 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
64 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
65 if (ctdb_db->priority != priority) {
68 if (strstr(ctdb_db->db_name, "notify") == NULL) {
71 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
79 lock all databases - unmark only
81 static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
83 struct ctdb_db_context *ctdb_db;
85 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
86 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
90 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
91 DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));
94 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
95 if (ctdb_db->priority != priority) {
98 if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
107 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
109 CHECK_CONTROL_DATA_SIZE(0);
110 struct ctdb_vnn_map_wire *map;
113 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
114 map = talloc_size(outdata, len);
115 CTDB_NO_MEMORY(ctdb, map);
117 map->generation = ctdb->vnn_map->generation;
118 map->size = ctdb->vnn_map->size;
119 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
121 outdata->dsize = len;
122 outdata->dptr = (uint8_t *)map;
128 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
130 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
133 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
134 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
135 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
140 talloc_free(ctdb->vnn_map);
142 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
143 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
145 ctdb->vnn_map->generation = map->generation;
146 ctdb->vnn_map->size = map->size;
147 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
148 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
150 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
156 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
159 struct ctdb_db_context *ctdb_db;
160 struct ctdb_dbid_map *dbid_map;
162 CHECK_CONTROL_DATA_SIZE(0);
165 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
170 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
171 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
172 if (!outdata->dptr) {
173 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
177 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
179 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
180 dbid_map->dbs[i].dbid = ctdb_db->db_id;
181 dbid_map->dbs[i].persistent = ctdb_db->persistent;
188 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
190 uint32_t i, num_nodes;
191 struct ctdb_node_map *node_map;
193 CHECK_CONTROL_DATA_SIZE(0);
195 num_nodes = ctdb->num_nodes;
197 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
198 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
199 if (!outdata->dptr) {
200 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
204 node_map = (struct ctdb_node_map *)outdata->dptr;
205 node_map->num = num_nodes;
206 for (i=0; i<num_nodes; i++) {
207 if (parse_ip(ctdb->nodes[i]->address.address,
208 NULL, /* TODO: pass in the correct interface here*/
210 &node_map->nodes[i].addr) == 0)
212 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
215 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
216 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
223 get an old style ipv4-only nodemap
226 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
228 uint32_t i, num_nodes;
229 struct ctdb_node_mapv4 *node_map;
231 CHECK_CONTROL_DATA_SIZE(0);
233 num_nodes = ctdb->num_nodes;
235 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
236 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
237 if (!outdata->dptr) {
238 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
242 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
243 node_map->num = num_nodes;
244 for (i=0; i<num_nodes; i++) {
245 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
246 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
250 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
251 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
258 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
259 struct timeval t, void *private_data)
262 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
264 struct ctdb_node **nodes;
266 tmp_ctx = talloc_new(ctdb);
268 /* steal the old nodes file for a while */
269 talloc_steal(tmp_ctx, ctdb->nodes);
272 num_nodes = ctdb->num_nodes;
275 /* load the new nodes file */
276 ctdb_load_nodes_file(ctdb);
278 for (i=0; i<ctdb->num_nodes; i++) {
279 /* keep any identical pre-existing nodes and connections */
280 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
281 talloc_free(ctdb->nodes[i]);
282 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
286 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
290 /* any new or different nodes must be added */
291 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
292 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
293 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
295 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
296 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
297 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
301 /* tell the recovery daemon to reaload the nodes file too */
302 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
304 talloc_free(tmp_ctx);
309 reload the nodes file after a short delay (so that we can send the response
313 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
315 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
321 a traverse function for pulling all relevent records from pulldb
324 struct ctdb_context *ctdb;
325 struct ctdb_marshall_buffer *pulldata;
330 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
332 struct pulldb_data *params = (struct pulldb_data *)p;
333 struct ctdb_rec_data *rec;
335 /* add the record to the blob */
336 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
338 params->failed = true;
341 params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
342 if (params->pulldata == NULL) {
343 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len));
344 ctdb_fatal(params->ctdb, "failed to allocate memory for recovery. shutting down\n");
346 params->pulldata->count++;
347 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
348 params->len += rec->length;
355 pul a bunch of records from a ltdb, filtering by lmaster
357 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
359 struct ctdb_control_pulldb *pull;
360 struct ctdb_db_context *ctdb_db;
361 struct pulldb_data params;
362 struct ctdb_marshall_buffer *reply;
364 pull = (struct ctdb_control_pulldb *)indata.dptr;
366 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
368 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
372 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
373 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
377 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
378 CTDB_NO_MEMORY(ctdb, reply);
380 reply->db_id = pull->db_id;
383 params.pulldata = reply;
384 params.len = offsetof(struct ctdb_marshall_buffer, data);
385 params.failed = false;
387 if (ctdb_db->unhealthy_reason) {
388 /* this is just a warning, as the tdb should be empty anyway */
389 DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
390 ctdb_db->db_name, ctdb_db->unhealthy_reason));
393 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
394 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
398 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
399 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
400 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
401 talloc_free(params.pulldata);
405 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
407 outdata->dptr = (uint8_t *)params.pulldata;
408 outdata->dsize = params.len;
414 push a bunch of records into a ltdb, filtering by rsn
416 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
418 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
419 struct ctdb_db_context *ctdb_db;
421 struct ctdb_rec_data *rec;
423 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
424 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
428 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
430 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
434 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
435 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
439 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
440 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
444 rec = (struct ctdb_rec_data *)&reply->data[0];
446 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
447 reply->count, reply->db_id));
449 for (i=0;i<reply->count;i++) {
451 struct ctdb_ltdb_header *hdr;
453 key.dptr = &rec->data[0];
454 key.dsize = rec->keylen;
455 data.dptr = &rec->data[key.dsize];
456 data.dsize = rec->datalen;
458 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
459 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
462 hdr = (struct ctdb_ltdb_header *)data.dptr;
463 data.dptr += sizeof(*hdr);
464 data.dsize -= sizeof(*hdr);
466 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
468 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
472 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
475 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
476 reply->count, reply->db_id));
478 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
482 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
487 static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
489 uint32_t *dmaster = (uint32_t *)p;
490 struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;
493 /* skip if already correct */
494 if (header->dmaster == *dmaster) {
498 header->dmaster = *dmaster;
500 ret = tdb_store(tdb, key, data, TDB_REPLACE);
502 DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));
506 /* TODO: add error checking here */
511 int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
513 struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
514 struct ctdb_db_context *ctdb_db;
516 ctdb_db = find_ctdb_db(ctdb, p->db_id);
518 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));
522 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
523 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));
527 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
528 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
532 tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);
534 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
539 struct ctdb_set_recmode_state {
540 struct ctdb_context *ctdb;
541 struct ctdb_req_control *c;
544 struct timed_event *te;
545 struct fd_event *fde;
547 struct timeval start_time;
551 called if our set_recmode child times out. this would happen if
552 ctdb_recovery_lock() would block.
554 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
555 struct timeval t, void *private_data)
557 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
558 struct ctdb_set_recmode_state);
560 /* we consider this a success, not a failure, as we failed to
561 set the recovery lock which is what we wanted. This can be
562 caused by the cluster filesystem being very slow to
563 arbitrate locks immediately after a node failure.
565 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
566 state->ctdb->recovery_mode = state->recmode;
567 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
572 /* when we free the recmode state we must kill any child process.
574 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
576 double l = timeval_elapsed(&state->start_time);
578 CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", reclock.ctdbd, l);
580 if (state->fd[0] != -1) {
583 if (state->fd[1] != -1) {
586 kill(state->child, SIGKILL);
590 /* this is called when the client process has completed ctdb_recovery_lock()
591 and has written data back to us through the pipe.
593 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
594 uint16_t flags, void *private_data)
596 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
597 struct ctdb_set_recmode_state);
601 /* we got a response from our child process so we can abort the
604 talloc_free(state->te);
608 /* read the childs status when trying to lock the reclock file.
609 child wrote 0 if everything is fine and 1 if it did manage
610 to lock the file, which would be a problem since that means
611 we got a request to exit from recovery but we could still lock
612 the file which at this time SHOULD be locked by the recovery
613 daemon on the recmaster
615 ret = read(state->fd[0], &c, 1);
616 if (ret != 1 || c != 0) {
617 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
622 state->ctdb->recovery_mode = state->recmode;
624 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
630 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
631 struct timeval t, void *private_data)
633 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
635 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
636 talloc_free(ctdb->release_ips_ctx);
637 ctdb->release_ips_ctx = NULL;
639 ctdb_release_all_ips(ctdb);
643 * Set up an event to drop all public ips if we remain in recovery for too
646 int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
648 if (ctdb->release_ips_ctx != NULL) {
649 talloc_free(ctdb->release_ips_ctx);
651 ctdb->release_ips_ctx = talloc_new(ctdb);
652 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
654 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
659 set the recovery mode
661 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
662 struct ctdb_req_control *c,
663 TDB_DATA indata, bool *async_reply,
664 const char **errormsg)
666 uint32_t recmode = *(uint32_t *)indata.dptr;
668 struct ctdb_set_recmode_state *state;
669 pid_t parent = getpid();
671 /* if we enter recovery but stay in recovery for too long
672 we will eventually drop all our ip addresses
674 if (recmode == CTDB_RECOVERY_NORMAL) {
675 talloc_free(ctdb->release_ips_ctx);
676 ctdb->release_ips_ctx = NULL;
678 if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
679 DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n"));
683 if (recmode != ctdb->recovery_mode) {
684 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
685 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
688 if (recmode != CTDB_RECOVERY_NORMAL ||
689 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
690 ctdb->recovery_mode = recmode;
694 /* some special handling when ending recovery mode */
696 /* force the databases to thaw */
697 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
698 if (ctdb->freeze_handles[i] != NULL) {
699 ctdb_control_thaw(ctdb, i);
703 state = talloc(ctdb, struct ctdb_set_recmode_state);
704 CTDB_NO_MEMORY(ctdb, state);
706 state->start_time = timeval_current();
710 if (ctdb->tunable.verify_recovery_lock == 0) {
711 /* dont need to verify the reclock file */
712 ctdb->recovery_mode = recmode;
716 /* For the rest of what needs to be done, we need to do this in
717 a child process since
718 1, the call to ctdb_recovery_lock() can block if the cluster
719 filesystem is in the process of recovery.
721 ret = pipe(state->fd);
724 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
728 state->child = fork();
729 if (state->child == (pid_t)-1) {
736 if (state->child == 0) {
740 debug_extra = talloc_asprintf(NULL, "set_recmode:");
741 /* we should not be able to get the lock on the reclock file,
742 as it should be held by the recovery master
744 if (ctdb_recovery_lock(ctdb, false)) {
745 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
749 write(state->fd[1], &cc, 1);
750 /* make sure we die when our parent dies */
751 while (kill(parent, 0) == 0 || errno != ESRCH) {
753 write(state->fd[1], &cc, 1);
758 set_close_on_exec(state->fd[0]);
762 talloc_set_destructor(state, set_recmode_destructor);
764 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
766 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
767 ctdb_set_recmode_timeout, state);
769 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
774 if (state->fde == NULL) {
778 tevent_fd_set_auto_close(state->fde);
781 state->recmode = recmode;
782 state->c = talloc_steal(state, c);
791 try and get the recovery lock in shared storage - should only work
792 on the recovery master recovery daemon. Anywhere else is a bug
794 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
799 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
801 if (ctdb->recovery_lock_fd != -1) {
802 close(ctdb->recovery_lock_fd);
803 ctdb->recovery_lock_fd = -1;
806 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
807 if (ctdb->recovery_lock_fd == -1) {
808 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
809 ctdb->recovery_lock_file, strerror(errno)));
813 set_close_on_exec(ctdb->recovery_lock_fd);
815 lock.l_type = F_WRLCK;
816 lock.l_whence = SEEK_SET;
821 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
822 close(ctdb->recovery_lock_fd);
823 ctdb->recovery_lock_fd = -1;
825 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
831 close(ctdb->recovery_lock_fd);
832 ctdb->recovery_lock_fd = -1;
836 DEBUG(DEBUG_NOTICE, ("Recovery lock taken successfully\n"));
839 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
845 delete a record as part of the vacuum process
846 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
847 use non-blocking locks
849 return 0 if the record was successfully deleted (i.e. it does not exist
850 when the function returns)
851 or !0 is the record still exists in the tdb after returning.
853 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
856 struct ctdb_ltdb_header *hdr, *hdr2;
858 /* these are really internal tdb functions - but we need them here for
859 non-blocking lock of the freelist */
860 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
861 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
864 key.dsize = rec->keylen;
865 key.dptr = &rec->data[0];
866 data.dsize = rec->datalen;
867 data.dptr = &rec->data[rec->keylen];
869 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
870 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
874 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
875 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
879 hdr = (struct ctdb_ltdb_header *)data.dptr;
881 /* use a non-blocking lock */
882 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
886 data = tdb_fetch(ctdb_db->ltdb->tdb, key);
887 if (data.dptr == NULL) {
888 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
892 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
893 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
894 tdb_delete(ctdb_db->ltdb->tdb, key);
895 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
896 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
898 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
903 hdr2 = (struct ctdb_ltdb_header *)data.dptr;
905 if (hdr2->rsn > hdr->rsn) {
906 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
907 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
908 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
913 if (hdr2->dmaster == ctdb->pnn) {
914 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
915 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
920 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
921 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
926 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
927 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
928 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
929 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
934 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
935 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/* holds the pending control to reply to once a recovery event script
   (startrecovery/recovered) has finished */
struct recovery_callback_state {
	struct ctdb_req_control *c;
};
948 called when the 'recovered' event script has finished
950 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
952 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
954 ctdb_enable_monitoring(ctdb);
955 CTDB_INCREMENT_STAT(ctdb, num_recoveries);
958 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
959 if (status == -ETIME) {
964 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
967 gettimeofday(&ctdb->last_recovery_finished, NULL);
971 recovery has finished
973 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
974 struct ctdb_req_control *c,
978 struct recovery_callback_state *state;
980 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
982 state = talloc(ctdb, struct recovery_callback_state);
983 CTDB_NO_MEMORY(ctdb, state);
987 ctdb_disable_monitoring(ctdb);
989 ret = ctdb_event_script_callback(ctdb, state,
990 ctdb_end_recovery_callback,
993 CTDB_EVENT_RECOVERED, "%s", "");
996 ctdb_enable_monitoring(ctdb);
998 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
1003 /* tell the control that we will be reply asynchronously */
1004 state->c = talloc_steal(state, c);
1005 *async_reply = true;
1010 called when the 'startrecovery' event script has finished
1012 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
1014 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
1017 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
1020 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1025 run the startrecovery eventscript
1027 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
1028 struct ctdb_req_control *c,
1032 struct recovery_callback_state *state;
1034 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
1035 gettimeofday(&ctdb->last_recovery_started, NULL);
1037 state = talloc(ctdb, struct recovery_callback_state);
1038 CTDB_NO_MEMORY(ctdb, state);
1040 state->c = talloc_steal(state, c);
1042 ctdb_disable_monitoring(ctdb);
1044 ret = ctdb_event_script_callback(ctdb, state,
1045 ctdb_start_recovery_callback,
1047 CTDB_EVENT_START_RECOVERY,
1051 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
1056 /* tell the control that we will be reply asynchronously */
1057 *async_reply = true;
1062 try to delete all these records as part of the vacuuming process
1063 and return the records we failed to delete
1065 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1067 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1068 struct ctdb_db_context *ctdb_db;
1070 struct ctdb_rec_data *rec;
1071 struct ctdb_marshall_buffer *records;
1073 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1074 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1078 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1080 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1085 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1086 reply->count, reply->db_id));
1089 /* create a blob to send back the records we couldnt delete */
1090 records = (struct ctdb_marshall_buffer *)
1091 talloc_zero_size(outdata,
1092 offsetof(struct ctdb_marshall_buffer, data));
1093 if (records == NULL) {
1094 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1097 records->db_id = ctdb_db->db_id;
1100 rec = (struct ctdb_rec_data *)&reply->data[0];
1101 for (i=0;i<reply->count;i++) {
1104 key.dptr = &rec->data[0];
1105 key.dsize = rec->keylen;
1106 data.dptr = &rec->data[key.dsize];
1107 data.dsize = rec->datalen;
1109 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1110 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1114 /* If we cant delete the record we must add it to the reply
1115 so the lmaster knows it may not purge this record
1117 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1119 struct ctdb_ltdb_header *hdr;
1121 hdr = (struct ctdb_ltdb_header *)data.dptr;
1122 data.dptr += sizeof(*hdr);
1123 data.dsize -= sizeof(*hdr);
1125 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
1127 old_size = talloc_get_size(records);
1128 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1129 if (records == NULL) {
1130 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1134 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1137 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1141 outdata->dptr = (uint8_t *)records;
1142 outdata->dsize = talloc_get_size(records);
1150 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1152 uint32_t *capabilities = NULL;
1154 capabilities = talloc(outdata, uint32_t);
1155 CTDB_NO_MEMORY(ctdb, capabilities);
1156 *capabilities = ctdb->capabilities;
1158 outdata->dsize = sizeof(uint32_t);
1159 outdata->dptr = (uint8_t *)capabilities;
1164 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1166 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1167 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1169 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
1171 if (*count < ctdb->tunable.recd_ping_failcount) {
1173 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1174 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1175 ctdb_recd_ping_timeout, ctdb);
1179 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Shutting down ctdb daemon. (This can be caused if the cluster filesystem has hung)\n"));
1181 ctdb_stop_recoverd(ctdb);
1182 ctdb_stop_keepalive(ctdb);
1183 ctdb_stop_monitoring(ctdb);
1184 ctdb_release_all_ips(ctdb);
1185 if (ctdb->methods != NULL) {
1186 ctdb->methods->shutdown(ctdb);
1188 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
1189 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Daemon has been shut down.\n"));
1193 /* The recovery daemon will ping us at regular intervals.
1194 If we havent been pinged for a while we assume the recovery
1195 daemon is inoperable and we shut down.
1197 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1199 talloc_free(ctdb->recd_ping_count);
1201 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1202 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1204 if (ctdb->tunable.recd_ping_timeout != 0) {
1205 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1206 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1207 ctdb_recd_ping_timeout, ctdb);
1215 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1217 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1219 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
1224 struct stop_node_callback_state {
1225 struct ctdb_req_control *c;
1229 called when the 'stopped' event script has finished
1231 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1233 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1236 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1237 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1238 if (status == -ETIME) {
1239 ctdb_ban_self(ctdb);
1243 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1247 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1250 struct stop_node_callback_state *state;
1252 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1254 state = talloc(ctdb, struct stop_node_callback_state);
1255 CTDB_NO_MEMORY(ctdb, state);
1257 state->c = talloc_steal(state, c);
1259 ctdb_disable_monitoring(ctdb);
1261 ret = ctdb_event_script_callback(ctdb, state,
1262 ctdb_stop_node_callback,
1264 CTDB_EVENT_STOPPED, "%s", "");
1267 ctdb_enable_monitoring(ctdb);
1269 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
1274 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1276 *async_reply = true;
1281 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1283 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1284 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;