4 Copyright (C) Andrew Tridgell 2007
5 Copyright (C) Ronnie Sahlberg 2007
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, see <http://www.gnu.org/licenses/>.
21 #include "lib/tdb/include/tdb.h"
22 #include "system/time.h"
23 #include "system/network.h"
24 #include "system/filesys.h"
25 #include "system/wait.h"
26 #include "../include/ctdb_private.h"
27 #include "lib/util/dlinklist.h"
32 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
34 CHECK_CONTROL_DATA_SIZE(0);
35 struct ctdb_vnn_map_wire *map;
38 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
39 map = talloc_size(outdata, len);
40 CTDB_NO_MEMORY(ctdb, map);
42 map->generation = ctdb->vnn_map->generation;
43 map->size = ctdb->vnn_map->size;
44 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
47 outdata->dptr = (uint8_t *)map;
53 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
55 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
58 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
59 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
60 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
65 talloc_free(ctdb->vnn_map);
67 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
68 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
70 ctdb->vnn_map->generation = map->generation;
71 ctdb->vnn_map->size = map->size;
72 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
73 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
75 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
81 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
84 struct ctdb_db_context *ctdb_db;
85 struct ctdb_dbid_map *dbid_map;
87 CHECK_CONTROL_DATA_SIZE(0);
90 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
95 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
96 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
98 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
102 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
104 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
105 dbid_map->dbs[i].dbid = ctdb_db->db_id;
106 if (ctdb_db->persistent != 0) {
107 dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_PERSISTENT;
109 if (ctdb_db->readonly != 0) {
110 dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_READONLY;
112 if (ctdb_db->sticky != 0) {
113 dbid_map->dbs[i].flags |= CTDB_DB_FLAGS_STICKY;
121 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
123 uint32_t i, num_nodes;
124 struct ctdb_node_map *node_map;
126 CHECK_CONTROL_DATA_SIZE(0);
128 num_nodes = ctdb->num_nodes;
130 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
131 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
132 if (!outdata->dptr) {
133 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
137 node_map = (struct ctdb_node_map *)outdata->dptr;
138 node_map->num = num_nodes;
139 for (i=0; i<num_nodes; i++) {
140 if (parse_ip(ctdb->nodes[i]->address.address,
141 NULL, /* TODO: pass in the correct interface here*/
143 &node_map->nodes[i].addr) == 0)
145 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
148 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
149 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
156 get an old style ipv4-only nodemap
159 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
161 uint32_t i, num_nodes;
162 struct ctdb_node_mapv4 *node_map;
164 CHECK_CONTROL_DATA_SIZE(0);
166 num_nodes = ctdb->num_nodes;
168 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
169 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
170 if (!outdata->dptr) {
171 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
175 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
176 node_map->num = num_nodes;
177 for (i=0; i<num_nodes; i++) {
178 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
179 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
183 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
184 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
191 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
192 struct timeval t, void *private_data)
195 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
197 struct ctdb_node **nodes;
199 tmp_ctx = talloc_new(ctdb);
201 /* steal the old nodes file for a while */
202 talloc_steal(tmp_ctx, ctdb->nodes);
205 num_nodes = ctdb->num_nodes;
208 /* load the new nodes file */
209 ctdb_load_nodes_file(ctdb);
211 for (i=0; i<ctdb->num_nodes; i++) {
212 /* keep any identical pre-existing nodes and connections */
213 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
214 talloc_free(ctdb->nodes[i]);
215 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
219 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
223 /* any new or different nodes must be added */
224 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
225 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
226 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
228 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
229 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
230 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
234 /* tell the recovery daemon to reaload the nodes file too */
235 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
237 talloc_free(tmp_ctx);
242 reload the nodes file after a short delay (so that we can send the response
246 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
248 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
/*
  state shared between ctdb_control_pull_db() and the traverse function
  that pulls all relevant records from a database
 */
struct pulldb_data {
	struct ctdb_context *ctdb;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_marshall_buffer *pulldata;	/* reply blob being built */
	uint32_t len;				/* bytes of pulldata in use */
	uint32_t allocated_len;			/* bytes of pulldata allocated */
	bool failed;				/* set when a record could not be marshalled */
};
265 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
267 struct pulldb_data *params = (struct pulldb_data *)p;
268 struct ctdb_rec_data *rec;
269 struct ctdb_context *ctdb = params->ctdb;
270 struct ctdb_db_context *ctdb_db = params->ctdb_db;
272 /* add the record to the blob */
273 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
275 params->failed = true;
278 if (params->len + rec->length >= params->allocated_len) {
279 params->allocated_len = rec->length + params->len + ctdb->tunable.pulldb_preallocation_size;
280 params->pulldata = talloc_realloc_size(NULL, params->pulldata, params->allocated_len);
282 if (params->pulldata == NULL) {
283 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len));
284 ctdb_fatal(params->ctdb, "failed to allocate memory for recovery. shutting down\n");
286 params->pulldata->count++;
287 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
288 params->len += rec->length;
290 if (ctdb->tunable.db_record_size_warn != 0 && rec->length > ctdb->tunable.db_record_size_warn) {
291 DEBUG(DEBUG_ERR,("Data record in %s is big. Record size is %d bytes\n", ctdb_db->db_name, (int)rec->length));
300 pull a bunch of records from a ltdb, filtering by lmaster
302 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
304 struct ctdb_control_pulldb *pull;
305 struct ctdb_db_context *ctdb_db;
306 struct pulldb_data params;
307 struct ctdb_marshall_buffer *reply;
309 pull = (struct ctdb_control_pulldb *)indata.dptr;
311 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
313 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
317 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
318 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
322 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
323 CTDB_NO_MEMORY(ctdb, reply);
325 reply->db_id = pull->db_id;
328 params.ctdb_db = ctdb_db;
329 params.pulldata = reply;
330 params.len = offsetof(struct ctdb_marshall_buffer, data);
331 params.allocated_len = params.len;
332 params.failed = false;
334 if (ctdb_db->unhealthy_reason) {
335 /* this is just a warning, as the tdb should be empty anyway */
336 DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
337 ctdb_db->db_name, ctdb_db->unhealthy_reason));
340 if (ctdb_lockall_mark_prio(ctdb, ctdb_db->priority) != 0) {
341 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
345 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
346 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
347 ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
348 talloc_free(params.pulldata);
352 ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
354 outdata->dptr = (uint8_t *)params.pulldata;
355 outdata->dsize = params.len;
357 if (ctdb->tunable.db_record_count_warn != 0 && params.pulldata->count > ctdb->tunable.db_record_count_warn) {
358 DEBUG(DEBUG_ERR,("Database %s is big. Contains %d records\n", ctdb_db->db_name, params.pulldata->count));
360 if (ctdb->tunable.db_size_warn != 0 && outdata->dsize > ctdb->tunable.db_size_warn) {
361 DEBUG(DEBUG_ERR,("Database %s is big. Contains %d bytes\n", ctdb_db->db_name, (int)outdata->dsize));
369 push a bunch of records into a ltdb, filtering by rsn
371 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
373 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
374 struct ctdb_db_context *ctdb_db;
376 struct ctdb_rec_data *rec;
378 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
379 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
383 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
385 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
389 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
390 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
394 if (ctdb_lockall_mark_prio(ctdb, ctdb_db->priority) != 0) {
395 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
399 rec = (struct ctdb_rec_data *)&reply->data[0];
401 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
402 reply->count, reply->db_id));
404 for (i=0;i<reply->count;i++) {
406 struct ctdb_ltdb_header *hdr;
408 key.dptr = &rec->data[0];
409 key.dsize = rec->keylen;
410 data.dptr = &rec->data[key.dsize];
411 data.dsize = rec->datalen;
413 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
414 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
417 hdr = (struct ctdb_ltdb_header *)data.dptr;
418 /* strip off any read only record flags. All readonly records
419 are revoked implicitely by a recovery
421 hdr->flags &= ~(CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE);
423 data.dptr += sizeof(*hdr);
424 data.dsize -= sizeof(*hdr);
426 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
428 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
432 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
435 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
436 reply->count, reply->db_id));
438 if (ctdb_db->readonly) {
439 DEBUG(DEBUG_CRIT,("Clearing the tracking database for dbid 0x%x\n",
441 if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
442 DEBUG(DEBUG_ERR,("Failed to wipe tracking database for 0x%x. Dropping read-only delegation support\n", ctdb_db->db_id));
443 ctdb_db->readonly = false;
444 tdb_close(ctdb_db->rottdb);
445 ctdb_db->rottdb = NULL;
446 ctdb_db->readonly = false;
448 while (ctdb_db->revokechild_active != NULL) {
449 talloc_free(ctdb_db->revokechild_active);
453 ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
457 ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
462 static int traverse_setdmaster(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
464 uint32_t *dmaster = (uint32_t *)p;
465 struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)data.dptr;
468 /* skip if already correct */
469 if (header->dmaster == *dmaster) {
473 header->dmaster = *dmaster;
475 ret = tdb_store(tdb, key, data, TDB_REPLACE);
477 DEBUG(DEBUG_CRIT,(__location__ " failed to write tdb data back ret:%d\n",ret));
481 /* TODO: add error checking here */
486 int32_t ctdb_control_set_dmaster(struct ctdb_context *ctdb, TDB_DATA indata)
488 struct ctdb_control_set_dmaster *p = (struct ctdb_control_set_dmaster *)indata.dptr;
489 struct ctdb_db_context *ctdb_db;
491 ctdb_db = find_ctdb_db(ctdb, p->db_id);
493 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", p->db_id));
497 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
498 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_set_dmaster when not frozen\n"));
502 if (ctdb_lockall_mark_prio(ctdb, ctdb_db->priority) != 0) {
503 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
507 tdb_traverse(ctdb_db->ltdb->tdb, traverse_setdmaster, &p->dmaster);
509 ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
/*
  state of an asynchronous "set recovery mode" request while a child
  process checks the recovery lock
 */
struct ctdb_set_recmode_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control *c;	/* request to answer when done */
	uint32_t recmode;		/* recovery mode to switch to */
	pid_t child;			/* pid of the lock-checking child */
	struct timed_event *te;		/* timeout for the child */
	struct fd_event *fde;		/* read event on fd[0] */
	int fd[2];			/* pipe: child writes fd[1], daemon reads fd[0] */
	struct timeval start_time;	/* used for reclock latency statistics */
};
526 called if our set_recmode child times out. this would happen if
527 ctdb_recovery_lock() would block.
529 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
530 struct timeval t, void *private_data)
532 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
533 struct ctdb_set_recmode_state);
535 /* we consider this a success, not a failure, as we failed to
536 set the recovery lock which is what we wanted. This can be
537 caused by the cluster filesystem being very slow to
538 arbitrate locks immediately after a node failure.
540 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
541 state->ctdb->recovery_mode = state->recmode;
542 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
547 /* when we free the recmode state we must kill any child process.
549 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
551 double l = timeval_elapsed(&state->start_time);
553 CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", reclock.ctdbd, l);
555 if (state->fd[0] != -1) {
558 if (state->fd[1] != -1) {
561 ctdb_kill(state->ctdb, state->child, SIGKILL);
565 /* this is called when the client process has completed ctdb_recovery_lock()
566 and has written data back to us through the pipe.
568 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
569 uint16_t flags, void *private_data)
571 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
572 struct ctdb_set_recmode_state);
576 /* we got a response from our child process so we can abort the
579 talloc_free(state->te);
583 /* read the childs status when trying to lock the reclock file.
584 child wrote 0 if everything is fine and 1 if it did manage
585 to lock the file, which would be a problem since that means
586 we got a request to exit from recovery but we could still lock
587 the file which at this time SHOULD be locked by the recovery
588 daemon on the recmaster
590 ret = read(state->fd[0], &c, 1);
591 if (ret != 1 || c != 0) {
592 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
597 state->ctdb->recovery_mode = state->recmode;
599 /* release any deferred attach calls from clients */
600 if (state->recmode == CTDB_RECOVERY_NORMAL) {
601 ctdb_process_deferred_attach(state->ctdb);
604 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
610 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
611 struct timeval t, void *private_data)
613 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
615 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
616 talloc_free(ctdb->release_ips_ctx);
617 ctdb->release_ips_ctx = NULL;
619 ctdb_release_all_ips(ctdb);
623 * Set up an event to drop all public ips if we remain in recovery for too
626 int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
628 if (ctdb->release_ips_ctx != NULL) {
629 talloc_free(ctdb->release_ips_ctx);
631 ctdb->release_ips_ctx = talloc_new(ctdb);
632 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
634 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
639 set the recovery mode
641 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
642 struct ctdb_req_control *c,
643 TDB_DATA indata, bool *async_reply,
644 const char **errormsg)
646 uint32_t recmode = *(uint32_t *)indata.dptr;
648 struct ctdb_set_recmode_state *state;
649 pid_t parent = getpid();
651 /* if we enter recovery but stay in recovery for too long
652 we will eventually drop all our ip addresses
654 if (recmode == CTDB_RECOVERY_NORMAL) {
655 talloc_free(ctdb->release_ips_ctx);
656 ctdb->release_ips_ctx = NULL;
658 if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
659 DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n"));
663 if (recmode != ctdb->recovery_mode) {
664 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
665 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
668 if (recmode != CTDB_RECOVERY_NORMAL ||
669 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
670 ctdb->recovery_mode = recmode;
674 /* some special handling when ending recovery mode */
676 /* force the databases to thaw */
677 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
678 if (ctdb->freeze_handles[i] != NULL) {
679 ctdb_control_thaw(ctdb, i);
683 state = talloc(ctdb, struct ctdb_set_recmode_state);
684 CTDB_NO_MEMORY(ctdb, state);
686 state->start_time = timeval_current();
690 /* release any deferred attach calls from clients */
691 if (recmode == CTDB_RECOVERY_NORMAL) {
692 ctdb_process_deferred_attach(ctdb);
695 if (ctdb->tunable.verify_recovery_lock == 0) {
696 /* dont need to verify the reclock file */
697 ctdb->recovery_mode = recmode;
701 /* For the rest of what needs to be done, we need to do this in
702 a child process since
703 1, the call to ctdb_recovery_lock() can block if the cluster
704 filesystem is in the process of recovery.
706 ret = pipe(state->fd);
709 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
713 state->child = ctdb_fork(ctdb);
714 if (state->child == (pid_t)-1) {
721 if (state->child == 0) {
725 debug_extra = talloc_asprintf(NULL, "set_recmode:");
726 /* we should not be able to get the lock on the reclock file,
727 as it should be held by the recovery master
729 if (ctdb_recovery_lock(ctdb, false)) {
730 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
734 write(state->fd[1], &cc, 1);
735 /* make sure we die when our parent dies */
736 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
738 write(state->fd[1], &cc, 1);
743 set_close_on_exec(state->fd[0]);
747 talloc_set_destructor(state, set_recmode_destructor);
749 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
751 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
752 ctdb_set_recmode_timeout, state);
754 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
759 if (state->fde == NULL) {
763 tevent_fd_set_auto_close(state->fde);
766 state->recmode = recmode;
767 state->c = talloc_steal(state, c);
776 try and get the recovery lock in shared storage - should only work
777 on the recovery master recovery daemon. Anywhere else is a bug
779 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
784 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
786 if (ctdb->recovery_lock_fd != -1) {
787 close(ctdb->recovery_lock_fd);
788 ctdb->recovery_lock_fd = -1;
791 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
792 if (ctdb->recovery_lock_fd == -1) {
793 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
794 ctdb->recovery_lock_file, strerror(errno)));
798 set_close_on_exec(ctdb->recovery_lock_fd);
800 lock.l_type = F_WRLCK;
801 lock.l_whence = SEEK_SET;
806 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
807 close(ctdb->recovery_lock_fd);
808 ctdb->recovery_lock_fd = -1;
810 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
816 close(ctdb->recovery_lock_fd);
817 ctdb->recovery_lock_fd = -1;
821 DEBUG(DEBUG_NOTICE, ("Recovery lock taken successfully\n"));
824 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
830 delete a record as part of the vacuum process
831 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
832 use non-blocking locks
834 return 0 if the record was successfully deleted (i.e. it does not exist
835 when the function returns)
836 or !0 is the record still exists in the tdb after returning.
838 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
841 struct ctdb_ltdb_header *hdr, *hdr2;
843 /* these are really internal tdb functions - but we need them here for
844 non-blocking lock of the freelist */
845 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
846 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
849 key.dsize = rec->keylen;
850 key.dptr = &rec->data[0];
851 data.dsize = rec->datalen;
852 data.dptr = &rec->data[rec->keylen];
854 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
855 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
859 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
860 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
864 hdr = (struct ctdb_ltdb_header *)data.dptr;
866 /* use a non-blocking lock */
867 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
871 data = tdb_fetch(ctdb_db->ltdb->tdb, key);
872 if (data.dptr == NULL) {
873 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
877 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
878 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
879 tdb_delete(ctdb_db->ltdb->tdb, key);
880 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
881 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
883 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
888 hdr2 = (struct ctdb_ltdb_header *)data.dptr;
890 if (hdr2->rsn > hdr->rsn) {
891 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
892 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
893 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
898 /* do not allow deleting record that have readonly flags set. */
899 if (hdr->flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE)) {
900 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
901 DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly flags set\n"));
905 if (hdr2->flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY|CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_REVOKE_COMPLETE)) {
906 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
907 DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly flags set\n"));
912 if (hdr2->dmaster == ctdb->pnn) {
913 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
914 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
919 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
920 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
925 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
926 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
927 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
928 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
933 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
934 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/*
  state kept while a recovery event script ('startrecovery' or
  'recovered') is running
 */
struct recovery_callback_state {
	struct ctdb_req_control *c;	/* request to answer when the script ends */
};
947 called when the 'recovered' event script has finished
949 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
951 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
953 ctdb_enable_monitoring(ctdb);
954 CTDB_INCREMENT_STAT(ctdb, num_recoveries);
957 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
958 if (status == -ETIME) {
963 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
966 gettimeofday(&ctdb->last_recovery_finished, NULL);
970 recovery has finished
972 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
973 struct ctdb_req_control *c,
977 struct recovery_callback_state *state;
979 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
981 ctdb_persistent_finish_trans3_commits(ctdb);
983 state = talloc(ctdb, struct recovery_callback_state);
984 CTDB_NO_MEMORY(ctdb, state);
988 ctdb_disable_monitoring(ctdb);
990 ret = ctdb_event_script_callback(ctdb, state,
991 ctdb_end_recovery_callback,
994 CTDB_EVENT_RECOVERED, "%s", "");
997 ctdb_enable_monitoring(ctdb);
999 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
1004 /* tell the control that we will be reply asynchronously */
1005 state->c = talloc_steal(state, c);
1006 *async_reply = true;
1011 called when the 'startrecovery' event script has finished
1013 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
1015 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
1018 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
1021 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1026 run the startrecovery eventscript
1028 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
1029 struct ctdb_req_control *c,
1033 struct recovery_callback_state *state;
1035 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
1036 gettimeofday(&ctdb->last_recovery_started, NULL);
1038 state = talloc(ctdb, struct recovery_callback_state);
1039 CTDB_NO_MEMORY(ctdb, state);
1041 state->c = talloc_steal(state, c);
1043 ctdb_disable_monitoring(ctdb);
1045 ret = ctdb_event_script_callback(ctdb, state,
1046 ctdb_start_recovery_callback,
1048 CTDB_EVENT_START_RECOVERY,
1052 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
1057 /* tell the control that we will be reply asynchronously */
1058 *async_reply = true;
1063 try to delete all these records as part of the vacuuming process
1064 and return the records we failed to delete
1066 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1068 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1069 struct ctdb_db_context *ctdb_db;
1071 struct ctdb_rec_data *rec;
1072 struct ctdb_marshall_buffer *records;
1074 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1075 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1079 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1081 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1086 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1087 reply->count, reply->db_id));
1090 /* create a blob to send back the records we couldnt delete */
1091 records = (struct ctdb_marshall_buffer *)
1092 talloc_zero_size(outdata,
1093 offsetof(struct ctdb_marshall_buffer, data));
1094 if (records == NULL) {
1095 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1098 records->db_id = ctdb_db->db_id;
1101 rec = (struct ctdb_rec_data *)&reply->data[0];
1102 for (i=0;i<reply->count;i++) {
1105 key.dptr = &rec->data[0];
1106 key.dsize = rec->keylen;
1107 data.dptr = &rec->data[key.dsize];
1108 data.dsize = rec->datalen;
1110 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1111 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1115 /* If we cant delete the record we must add it to the reply
1116 so the lmaster knows it may not purge this record
1118 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1120 struct ctdb_ltdb_header *hdr;
1122 hdr = (struct ctdb_ltdb_header *)data.dptr;
1123 data.dptr += sizeof(*hdr);
1124 data.dsize -= sizeof(*hdr);
1126 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
1128 old_size = talloc_get_size(records);
1129 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1130 if (records == NULL) {
1131 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1135 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1138 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1142 outdata->dptr = (uint8_t *)records;
1143 outdata->dsize = talloc_get_size(records);
1151 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1153 uint32_t *capabilities = NULL;
1155 capabilities = talloc(outdata, uint32_t);
1156 CTDB_NO_MEMORY(ctdb, capabilities);
1157 *capabilities = ctdb->capabilities;
1159 outdata->dsize = sizeof(uint32_t);
1160 outdata->dptr = (uint8_t *)capabilities;
1165 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1167 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1168 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1170 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
1172 if (*count < ctdb->tunable.recd_ping_failcount) {
1174 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1175 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1176 ctdb_recd_ping_timeout, ctdb);
1180 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));
1182 ctdb_stop_recoverd(ctdb);
1183 ctdb_start_recoverd(ctdb);
1186 /* The recovery daemon will ping us at regular intervals.
1187 If we havent been pinged for a while we assume the recovery
1188 daemon is inoperable and we shut down.
1190 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1192 talloc_free(ctdb->recd_ping_count);
1194 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1195 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1197 if (ctdb->tunable.recd_ping_timeout != 0) {
1198 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1199 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1200 ctdb_recd_ping_timeout, ctdb);
1208 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1210 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1212 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/*
  state kept while the 'stopped' event script is running
 */
struct stop_node_callback_state {
	struct ctdb_req_control *c;	/* request to answer when the script ends */
};
1222 called when the 'stopped' event script has finished
1224 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1226 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1229 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1230 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1231 if (status == -ETIME) {
1232 ctdb_ban_self(ctdb);
1236 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1240 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1243 struct stop_node_callback_state *state;
1245 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1247 state = talloc(ctdb, struct stop_node_callback_state);
1248 CTDB_NO_MEMORY(ctdb, state);
1250 state->c = talloc_steal(state, c);
1252 ctdb_disable_monitoring(ctdb);
1254 ret = ctdb_event_script_callback(ctdb, state,
1255 ctdb_stop_node_callback,
1257 CTDB_EVENT_STOPPED, "%s", "");
1260 ctdb_enable_monitoring(ctdb);
1262 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
1267 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1269 *async_reply = true;
1274 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1276 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1277 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;