/*
   Copyright (C) Andrew Tridgell  2007
   Copyright (C) Ronnie Sahlberg  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
21 #include "lib/events/events.h"
22 #include "lib/tdb/include/tdb.h"
23 #include "system/time.h"
24 #include "system/network.h"
25 #include "system/filesys.h"
26 #include "system/wait.h"
27 #include "../include/ctdb_private.h"
28 #include "lib/util/dlinklist.h"
32 lock all databases - mark only
34 static int ctdb_lock_all_databases_mark(struct ctdb_context *ctdb, uint32_t priority)
36 struct ctdb_db_context *ctdb_db;
38 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
39 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
43 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
44 DEBUG(DEBUG_ERR,("Attempt to mark all databases locked when not frozen\n"));
47 /* The dual loop is a woraround for older versions of samba
48 that does not yet support the set-db-priority/lock order
49 call. So that we get basic deadlock avoiidance also for
50 these old versions of samba.
51 This code will be removed in the future.
53 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
54 if (ctdb_db->priority != priority) {
57 if (strstr(ctdb_db->db_name, "notify") != NULL) {
60 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
64 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
65 if (ctdb_db->priority != priority) {
68 if (strstr(ctdb_db->db_name, "notify") == NULL) {
71 if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
79 lock all databases - unmark only
81 static int ctdb_lock_all_databases_unmark(struct ctdb_context *ctdb, uint32_t priority)
83 struct ctdb_db_context *ctdb_db;
85 if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
86 DEBUG(DEBUG_ERR,(__location__ " Illegal priority when trying to mark all databases Prio:%u\n", priority));
90 if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
91 DEBUG(DEBUG_ERR,("Attempt to unmark all databases locked when not frozen\n"));
94 for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
95 if (ctdb_db->priority != priority) {
98 if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
107 ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
109 CHECK_CONTROL_DATA_SIZE(0);
110 struct ctdb_vnn_map_wire *map;
113 len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
114 map = talloc_size(outdata, len);
115 CTDB_NO_MEMORY(ctdb, map);
117 map->generation = ctdb->vnn_map->generation;
118 map->size = ctdb->vnn_map->size;
119 memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
121 outdata->dsize = len;
122 outdata->dptr = (uint8_t *)map;
128 ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
130 struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
133 for(i=1; i<=NUM_DB_PRIORITIES; i++) {
134 if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
135 DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
140 talloc_free(ctdb->vnn_map);
142 ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
143 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
145 ctdb->vnn_map->generation = map->generation;
146 ctdb->vnn_map->size = map->size;
147 ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, map->size);
148 CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
150 memcpy(ctdb->vnn_map->map, map->map, sizeof(uint32_t)*map->size);
156 ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
159 struct ctdb_db_context *ctdb_db;
160 struct ctdb_dbid_map *dbid_map;
162 CHECK_CONTROL_DATA_SIZE(0);
165 for(ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next){
170 outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*len;
171 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
172 if (!outdata->dptr) {
173 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
177 dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
179 for (i=0,ctdb_db=ctdb->db_list;ctdb_db;i++,ctdb_db=ctdb_db->next){
180 dbid_map->dbs[i].dbid = ctdb_db->db_id;
181 dbid_map->dbs[i].persistent = ctdb_db->persistent;
188 ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
190 uint32_t i, num_nodes;
191 struct ctdb_node_map *node_map;
193 CHECK_CONTROL_DATA_SIZE(0);
195 num_nodes = ctdb->num_nodes;
197 outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
198 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
199 if (!outdata->dptr) {
200 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
204 node_map = (struct ctdb_node_map *)outdata->dptr;
205 node_map->num = num_nodes;
206 for (i=0; i<num_nodes; i++) {
207 if (parse_ip(ctdb->nodes[i]->address.address,
208 NULL, /* TODO: pass in the correct interface here*/
210 &node_map->nodes[i].addr) == 0)
212 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
215 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
216 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
223 get an old style ipv4-only nodemap
226 ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
228 uint32_t i, num_nodes;
229 struct ctdb_node_mapv4 *node_map;
231 CHECK_CONTROL_DATA_SIZE(0);
233 num_nodes = ctdb->num_nodes;
235 outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
236 outdata->dptr = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
237 if (!outdata->dptr) {
238 DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
242 node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
243 node_map->num = num_nodes;
244 for (i=0; i<num_nodes; i++) {
245 if (parse_ipv4(ctdb->nodes[i]->address.address, 0, &node_map->nodes[i].sin) == 0) {
246 DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[i]->address.address));
250 node_map->nodes[i].pnn = ctdb->nodes[i]->pnn;
251 node_map->nodes[i].flags = ctdb->nodes[i]->flags;
258 ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te,
259 struct timeval t, void *private_data)
262 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
264 struct ctdb_node **nodes;
266 tmp_ctx = talloc_new(ctdb);
268 /* steal the old nodes file for a while */
269 talloc_steal(tmp_ctx, ctdb->nodes);
272 num_nodes = ctdb->num_nodes;
275 /* load the new nodes file */
276 ctdb_load_nodes_file(ctdb);
278 for (i=0; i<ctdb->num_nodes; i++) {
279 /* keep any identical pre-existing nodes and connections */
280 if ((i < num_nodes) && ctdb_same_address(&ctdb->nodes[i]->address, &nodes[i]->address)) {
281 talloc_free(ctdb->nodes[i]);
282 ctdb->nodes[i] = talloc_steal(ctdb->nodes, nodes[i]);
286 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
290 /* any new or different nodes must be added */
291 if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
292 DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
293 ctdb_fatal(ctdb, "failed to add node. shutting down\n");
295 if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
296 DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
297 ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
301 /* tell the recovery daemon to reaload the nodes file too */
302 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
304 talloc_free(tmp_ctx);
309 reload the nodes file after a short delay (so that we can send the response
313 ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
315 event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1,0), ctdb_reload_nodes_event, ctdb);
321 a traverse function for pulling all relevent records from pulldb
324 struct ctdb_context *ctdb;
325 struct ctdb_marshall_buffer *pulldata;
330 static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
332 struct pulldb_data *params = (struct pulldb_data *)p;
333 struct ctdb_rec_data *rec;
335 /* add the record to the blob */
336 rec = ctdb_marshall_record(params->pulldata, 0, key, NULL, data);
338 params->failed = true;
341 params->pulldata = talloc_realloc_size(NULL, params->pulldata, rec->length + params->len);
342 if (params->pulldata == NULL) {
343 DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + params->len));
344 ctdb_fatal(params->ctdb, "failed to allocate memory for recovery. shutting down\n");
346 params->pulldata->count++;
347 memcpy(params->len+(uint8_t *)params->pulldata, rec, rec->length);
348 params->len += rec->length;
355 pull a bunch of records from a ltdb, filtering by lmaster
357 int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
359 struct ctdb_control_pulldb *pull;
360 struct ctdb_db_context *ctdb_db;
361 struct pulldb_data params;
362 struct ctdb_marshall_buffer *reply;
364 pull = (struct ctdb_control_pulldb *)indata.dptr;
366 ctdb_db = find_ctdb_db(ctdb, pull->db_id);
368 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
372 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
373 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
377 reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
378 CTDB_NO_MEMORY(ctdb, reply);
380 reply->db_id = pull->db_id;
383 params.pulldata = reply;
384 params.len = offsetof(struct ctdb_marshall_buffer, data);
385 params.failed = false;
387 if (ctdb_db->unhealthy_reason) {
388 /* this is just a warning, as the tdb should be empty anyway */
389 DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
390 ctdb_db->db_name, ctdb_db->unhealthy_reason));
393 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
394 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
398 if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, ¶ms) == -1) {
399 DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
400 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
401 talloc_free(params.pulldata);
405 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
407 outdata->dptr = (uint8_t *)params.pulldata;
408 outdata->dsize = params.len;
414 push a bunch of records into a ltdb, filtering by rsn
416 int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
418 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
419 struct ctdb_db_context *ctdb_db;
421 struct ctdb_rec_data *rec;
423 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
424 DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
428 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
430 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
434 if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
435 DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
439 if (ctdb_lock_all_databases_mark(ctdb, ctdb_db->priority) != 0) {
440 DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
444 rec = (struct ctdb_rec_data *)&reply->data[0];
446 DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
447 reply->count, reply->db_id));
449 for (i=0;i<reply->count;i++) {
451 struct ctdb_ltdb_header *hdr;
453 key.dptr = &rec->data[0];
454 key.dsize = rec->keylen;
455 data.dptr = &rec->data[key.dsize];
456 data.dsize = rec->datalen;
458 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
459 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
462 hdr = (struct ctdb_ltdb_header *)data.dptr;
463 data.dptr += sizeof(*hdr);
464 data.dsize -= sizeof(*hdr);
466 ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
468 DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
472 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
475 DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
476 reply->count, reply->db_id));
478 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
482 ctdb_lock_all_databases_unmark(ctdb, ctdb_db->priority);
/* State kept while a set_recmode request waits for its child process. */
struct ctdb_set_recmode_state {
	struct ctdb_context *ctdb;
	struct ctdb_req_control *c;	/* control to reply to */
	uint32_t recmode;		/* recovery mode to apply */
	int fd[2];			/* pipe between child and daemon */
	struct timed_event *te;		/* timeout for the child */
	struct fd_event *fde;		/* read event on fd[0] */
	pid_t child;			/* pid killed by the destructor */
	struct timeval start_time;	/* used for the reclock latency */
};
498 called if our set_recmode child times out. this would happen if
499 ctdb_recovery_lock() would block.
501 static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te,
502 struct timeval t, void *private_data)
504 struct ctdb_set_recmode_state *state = talloc_get_type(private_data,
505 struct ctdb_set_recmode_state);
507 /* we consider this a success, not a failure, as we failed to
508 set the recovery lock which is what we wanted. This can be
509 caused by the cluster filesystem being very slow to
510 arbitrate locks immediately after a node failure.
512 DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
513 state->ctdb->recovery_mode = state->recmode;
514 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
519 /* when we free the recmode state we must kill any child process.
521 static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
523 double l = timeval_elapsed(&state->start_time);
525 ctdb_reclock_latency(state->ctdb, "daemon reclock", &state->ctdb->statistics.reclock.ctdbd, l);
527 if (state->fd[0] != -1) {
530 if (state->fd[1] != -1) {
533 kill(state->child, SIGKILL);
537 /* this is called when the client process has completed ctdb_recovery_lock()
538 and has written data back to us through the pipe.
540 static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
541 uint16_t flags, void *private_data)
543 struct ctdb_set_recmode_state *state= talloc_get_type(private_data,
544 struct ctdb_set_recmode_state);
548 /* we got a response from our child process so we can abort the
551 talloc_free(state->te);
555 /* read the childs status when trying to lock the reclock file.
556 child wrote 0 if everything is fine and 1 if it did manage
557 to lock the file, which would be a problem since that means
558 we got a request to exit from recovery but we could still lock
559 the file which at this time SHOULD be locked by the recovery
560 daemon on the recmaster
562 ret = read(state->fd[0], &c, 1);
563 if (ret != 1 || c != 0) {
564 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
569 state->ctdb->recovery_mode = state->recmode;
571 ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
577 ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
578 struct timeval t, void *private_data)
580 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
582 DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
583 talloc_free(ctdb->release_ips_ctx);
584 ctdb->release_ips_ctx = NULL;
586 ctdb_release_all_ips(ctdb);
590 set the recovery mode
592 int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb,
593 struct ctdb_req_control *c,
594 TDB_DATA indata, bool *async_reply,
595 const char **errormsg)
597 uint32_t recmode = *(uint32_t *)indata.dptr;
599 struct ctdb_set_recmode_state *state;
600 pid_t parent = getpid();
602 /* if we enter recovery but stay in recovery for too long
603 we will eventually drop all our ip addresses
605 if (recmode == CTDB_RECOVERY_NORMAL) {
606 talloc_free(ctdb->release_ips_ctx);
607 ctdb->release_ips_ctx = NULL;
609 talloc_free(ctdb->release_ips_ctx);
610 ctdb->release_ips_ctx = talloc_new(ctdb);
611 CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
613 event_add_timed(ctdb->ev, ctdb->release_ips_ctx, timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0), ctdb_drop_all_ips_event, ctdb);
616 if (recmode != ctdb->recovery_mode) {
617 DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n",
618 recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
621 if (recmode != CTDB_RECOVERY_NORMAL ||
622 ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
623 ctdb->recovery_mode = recmode;
627 /* some special handling when ending recovery mode */
629 /* force the databases to thaw */
630 for (i=1; i<=NUM_DB_PRIORITIES; i++) {
631 if (ctdb->freeze_handles[i] != NULL) {
632 ctdb_control_thaw(ctdb, i);
636 state = talloc(ctdb, struct ctdb_set_recmode_state);
637 CTDB_NO_MEMORY(ctdb, state);
639 state->start_time = timeval_current();
643 if (ctdb->tunable.verify_recovery_lock == 0) {
644 /* dont need to verify the reclock file */
645 ctdb->recovery_mode = recmode;
649 /* For the rest of what needs to be done, we need to do this in
650 a child process since
651 1, the call to ctdb_recovery_lock() can block if the cluster
652 filesystem is in the process of recovery.
654 ret = pipe(state->fd);
657 DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
661 state->child = fork();
662 if (state->child == (pid_t)-1) {
669 if (state->child == 0) {
673 /* we should not be able to get the lock on the reclock file,
674 as it should be held by the recovery master
676 if (ctdb_recovery_lock(ctdb, false)) {
677 DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
681 write(state->fd[1], &cc, 1);
682 /* make sure we die when our parent dies */
683 while (kill(parent, 0) == 0 || errno != ESRCH) {
685 write(state->fd[1], &cc, 1);
690 set_close_on_exec(state->fd[0]);
694 talloc_set_destructor(state, set_recmode_destructor);
696 DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
698 state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
699 ctdb_set_recmode_timeout, state);
701 state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
702 EVENT_FD_READ|EVENT_FD_AUTOCLOSE,
706 if (state->fde == NULL) {
712 state->recmode = recmode;
713 state->c = talloc_steal(state, c);
722 try and get the recovery lock in shared storage - should only work
723 on the recovery master recovery daemon. Anywhere else is a bug
725 bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
730 DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
732 if (ctdb->recovery_lock_fd != -1) {
733 close(ctdb->recovery_lock_fd);
734 ctdb->recovery_lock_fd = -1;
737 ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
738 if (ctdb->recovery_lock_fd == -1) {
739 DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n",
740 ctdb->recovery_lock_file, strerror(errno)));
744 set_close_on_exec(ctdb->recovery_lock_fd);
746 lock.l_type = F_WRLCK;
747 lock.l_whence = SEEK_SET;
752 if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
753 close(ctdb->recovery_lock_fd);
754 ctdb->recovery_lock_fd = -1;
756 DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
762 close(ctdb->recovery_lock_fd);
763 ctdb->recovery_lock_fd = -1;
767 DEBUG(DEBUG_ERR, ("Recovery lock taken successfully\n"));
770 DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
776 delete a record as part of the vacuum process
777 only delete if we are not lmaster or dmaster, and our rsn is <= the provided rsn
778 use non-blocking locks
780 return 0 if the record was successfully deleted (i.e. it does not exist
781 when the function returns)
782 or !0 is the record still exists in the tdb after returning.
784 static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
786 TDB_DATA key, data, data2;
787 struct ctdb_ltdb_header *hdr, *hdr2;
789 /* these are really internal tdb functions - but we need them here for
790 non-blocking lock of the freelist */
791 int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
792 int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
795 key.dsize = rec->keylen;
796 key.dptr = &rec->data[0];
797 data.dsize = rec->datalen;
798 data.dptr = &rec->data[rec->keylen];
800 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
801 DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
805 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
806 DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
810 hdr = (struct ctdb_ltdb_header *)data.dptr;
812 /* use a non-blocking lock */
813 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
817 data2 = tdb_fetch(ctdb_db->ltdb->tdb, key);
818 if (data2.dptr == NULL) {
819 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
823 if (data2.dsize < sizeof(struct ctdb_ltdb_header)) {
824 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) == 0) {
825 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
826 DEBUG(DEBUG_CRIT,(__location__ " Failed to delete corrupt record\n"));
828 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
829 DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
831 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
836 hdr2 = (struct ctdb_ltdb_header *)data2.dptr;
838 if (hdr2->rsn > hdr->rsn) {
839 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
840 DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
841 (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
846 if (hdr2->dmaster == ctdb->pnn) {
847 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
848 DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
853 if (tdb_lock_nonblock(ctdb_db->ltdb->tdb, -1, F_WRLCK) != 0) {
854 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
859 if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
860 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
861 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
862 DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
867 tdb_unlock(ctdb_db->ltdb->tdb, -1, F_WRLCK);
868 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
/* Private data handed to the recovery event-script callbacks. */
struct recovery_callback_state {
	struct ctdb_req_control *c;	/* control to reply to when done */
};
881 called when the 'recovered' event script has finished
883 static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
885 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
887 ctdb_enable_monitoring(ctdb);
890 DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
891 if (status == -ETIME) {
896 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
899 gettimeofday(&ctdb->last_recovery_finished, NULL);
903 recovery has finished
905 int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb,
906 struct ctdb_req_control *c,
910 struct recovery_callback_state *state;
912 DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
914 ctdb_persistent_finish_trans3_commits(ctdb);
916 state = talloc(ctdb, struct recovery_callback_state);
917 CTDB_NO_MEMORY(ctdb, state);
921 ctdb_disable_monitoring(ctdb);
923 ret = ctdb_event_script_callback(ctdb, state,
924 ctdb_end_recovery_callback,
927 CTDB_EVENT_RECOVERED, "%s", "");
930 ctdb_enable_monitoring(ctdb);
932 DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
937 /* tell the control that we will be reply asynchronously */
938 state->c = talloc_steal(state, c);
944 called when the 'startrecovery' event script has finished
946 static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
948 struct recovery_callback_state *state = talloc_get_type(p, struct recovery_callback_state);
951 DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
954 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
959 run the startrecovery eventscript
961 int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb,
962 struct ctdb_req_control *c,
966 struct recovery_callback_state *state;
968 DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
969 gettimeofday(&ctdb->last_recovery_started, NULL);
971 state = talloc(ctdb, struct recovery_callback_state);
972 CTDB_NO_MEMORY(ctdb, state);
974 state->c = talloc_steal(state, c);
976 ctdb_disable_monitoring(ctdb);
978 ret = ctdb_event_script_callback(ctdb, state,
979 ctdb_start_recovery_callback,
981 CTDB_EVENT_START_RECOVERY,
985 DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
990 /* tell the control that we will be reply asynchronously */
996 try to delete all these records as part of the vacuuming process
997 and return the records we failed to delete
999 int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
1001 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1002 struct ctdb_db_context *ctdb_db;
1004 struct ctdb_rec_data *rec;
1005 struct ctdb_marshall_buffer *records;
1007 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1008 DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
1012 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1014 DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
1019 DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
1020 reply->count, reply->db_id));
1023 /* create a blob to send back the records we couldnt delete */
1024 records = (struct ctdb_marshall_buffer *)
1025 talloc_zero_size(outdata,
1026 offsetof(struct ctdb_marshall_buffer, data));
1027 if (records == NULL) {
1028 DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
1031 records->db_id = ctdb_db->db_id;
1034 rec = (struct ctdb_rec_data *)&reply->data[0];
1035 for (i=0;i<reply->count;i++) {
1038 key.dptr = &rec->data[0];
1039 key.dsize = rec->keylen;
1040 data.dptr = &rec->data[key.dsize];
1041 data.dsize = rec->datalen;
1043 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1044 DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
1048 /* If we cant delete the record we must add it to the reply
1049 so the lmaster knows it may not purge this record
1051 if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
1053 struct ctdb_ltdb_header *hdr;
1055 hdr = (struct ctdb_ltdb_header *)data.dptr;
1056 data.dptr += sizeof(*hdr);
1057 data.dsize -= sizeof(*hdr);
1059 DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
1061 old_size = talloc_get_size(records);
1062 records = talloc_realloc_size(outdata, records, old_size + rec->length);
1063 if (records == NULL) {
1064 DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
1068 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1071 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1075 outdata->dptr = (uint8_t *)records;
1076 outdata->dsize = talloc_get_size(records);
1082 * Store a record as part of the vacuum process:
1083 * This is called from the RECEIVE_RECORD control which
1084 * the lmaster uses to send the current empty copy
1085 * to all nodes for storing, before it lets the other
1086 * nodes delete the records in the second phase with
1087 * the TRY_DELETE_RECORDS control.
1089 * Only store if we are not lmaster or dmaster, and our
1090 * rsn is <= the provided rsn. Use non-blocking locks.
1092 * return 0 if the record was successfully stored.
1093 * return !0 if the record still exists in the tdb after returning.
1095 static int store_tdb_record(struct ctdb_context *ctdb,
1096 struct ctdb_db_context *ctdb_db,
1097 struct ctdb_rec_data *rec)
1099 TDB_DATA key, data, data2;
1100 struct ctdb_ltdb_header *hdr, *hdr2;
1103 key.dsize = rec->keylen;
1104 key.dptr = &rec->data[0];
1105 data.dsize = rec->datalen;
1106 data.dptr = &rec->data[rec->keylen];
1108 if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
1109 DEBUG(DEBUG_INFO, (__location__ " Called store_tdb_record "
1110 "where we are lmaster\n"));
1114 if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
1115 DEBUG(DEBUG_ERR, (__location__ " Bad record size\n"));
1119 hdr = (struct ctdb_ltdb_header *)data.dptr;
1121 /* use a non-blocking lock */
1122 if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
1123 DEBUG(DEBUG_INFO, (__location__ " Failed to lock chain in non-blocking mode\n"));
1127 data2 = tdb_fetch(ctdb_db->ltdb->tdb, key);
1128 if (data2.dptr == NULL || data2.dsize < sizeof(struct ctdb_ltdb_header)) {
1129 tdb_store(ctdb_db->ltdb->tdb, key, data, 0);
1130 DEBUG(DEBUG_INFO, (__location__ " Stored record\n"));
1135 hdr2 = (struct ctdb_ltdb_header *)data2.dptr;
1137 if (hdr2->rsn > hdr->rsn) {
1138 DEBUG(DEBUG_INFO, (__location__ " Skipping record with "
1139 "rsn=%llu - called with rsn=%llu\n",
1140 (unsigned long long)hdr2->rsn,
1141 (unsigned long long)hdr->rsn));
1146 if (hdr2->dmaster == ctdb->pnn) {
1147 DEBUG(DEBUG_INFO, (__location__ " Attempted to store record "
1148 "where we are the dmaster\n"));
1153 if (tdb_store(ctdb_db->ltdb->tdb, key, data, 0) != 0) {
1154 DEBUG(DEBUG_INFO,(__location__ " Failed to store record\n"));
1162 tdb_chainunlock(ctdb_db->ltdb->tdb, key);
1170 * Try to store all these records as part of the vacuuming process
1171 * and return the records we failed to store.
1173 int32_t ctdb_control_receive_records(struct ctdb_context *ctdb,
1174 TDB_DATA indata, TDB_DATA *outdata)
1176 struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
1177 struct ctdb_db_context *ctdb_db;
1179 struct ctdb_rec_data *rec;
1180 struct ctdb_marshall_buffer *records;
1182 if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
1184 (__location__ " invalid data in receive_records\n"));
1188 ctdb_db = find_ctdb_db(ctdb, reply->db_id);
1190 DEBUG(DEBUG_ERR, (__location__ " Unknown db 0x%08x\n",
1195 DEBUG(DEBUG_DEBUG, ("starting receive_records of %u records for "
1196 "dbid 0x%x\n", reply->count, reply->db_id));
1198 /* create a blob to send back the records we could not store */
1199 records = (struct ctdb_marshall_buffer *)
1200 talloc_zero_size(outdata,
1201 offsetof(struct ctdb_marshall_buffer, data));
1202 if (records == NULL) {
1203 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1206 records->db_id = ctdb_db->db_id;
1208 rec = (struct ctdb_rec_data *)&reply->data[0];
1209 for (i=0; i<reply->count; i++) {
1212 key.dptr = &rec->data[0];
1213 key.dsize = rec->keylen;
1214 data.dptr = &rec->data[key.dsize];
1215 data.dsize = rec->datalen;
1217 if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
1218 DEBUG(DEBUG_CRIT, (__location__ " bad ltdb record "
1224 * If we can not store the record we must add it to the reply
1225 * so the lmaster knows it may not purge this record.
1227 if (store_tdb_record(ctdb, ctdb_db, rec) != 0) {
1229 struct ctdb_ltdb_header *hdr;
1231 hdr = (struct ctdb_ltdb_header *)data.dptr;
1232 data.dptr += sizeof(*hdr);
1233 data.dsize -= sizeof(*hdr);
1235 DEBUG(DEBUG_INFO, (__location__ " Failed to store "
1236 "record with hash 0x%08x in vacuum "
1237 "via RECEIVE_RECORDS\n",
1240 old_size = talloc_get_size(records);
1241 records = talloc_realloc_size(outdata, records,
1242 old_size + rec->length);
1243 if (records == NULL) {
1244 DEBUG(DEBUG_ERR, (__location__ " Failed to "
1249 memcpy(old_size+(uint8_t *)records, rec, rec->length);
1252 rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
1256 outdata->dptr = (uint8_t *)records;
1257 outdata->dsize = talloc_get_size(records);
1266 int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
1268 uint32_t *capabilities = NULL;
1270 capabilities = talloc(outdata, uint32_t);
1271 CTDB_NO_MEMORY(ctdb, capabilities);
1272 *capabilities = ctdb->capabilities;
1274 outdata->dsize = sizeof(uint32_t);
1275 outdata->dptr = (uint8_t *)capabilities;
1280 static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
1282 struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
1283 uint32_t *count = talloc_get_type(ctdb->recd_ping_count, uint32_t);
1285 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *count));
1287 if (*count < ctdb->tunable.recd_ping_failcount) {
1289 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1290 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1291 ctdb_recd_ping_timeout, ctdb);
1295 DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Shutting down ctdb daemon. (This can be caused if the cluster filesystem has hung)\n"));
1297 ctdb_stop_recoverd(ctdb);
1298 ctdb_stop_keepalive(ctdb);
1299 ctdb_stop_monitoring(ctdb);
1300 ctdb_release_all_ips(ctdb);
1301 if (ctdb->methods != NULL) {
1302 ctdb->methods->shutdown(ctdb);
1304 ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
1305 DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Daemon has been shut down.\n"));
1309 /* The recovery daemon will ping us at regular intervals.
1310 If we havent been pinged for a while we assume the recovery
1311 daemon is inoperable and we shut down.
1313 int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
1315 talloc_free(ctdb->recd_ping_count);
1317 ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
1318 CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
1320 if (ctdb->tunable.recd_ping_timeout != 0) {
1321 event_add_timed(ctdb->ev, ctdb->recd_ping_count,
1322 timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
1323 ctdb_recd_ping_timeout, ctdb);
1331 int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
1333 CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
1335 ctdb->recovery_master = ((uint32_t *)(&indata.dptr[0]))[0];
/* Private data handed to the 'stopped' event-script callback. */
struct stop_node_callback_state {
	struct ctdb_req_control *c;	/* control to reply to when done */
};
1345 called when the 'stopped' event script has finished
1347 static void ctdb_stop_node_callback(struct ctdb_context *ctdb, int status, void *p)
1349 struct stop_node_callback_state *state = talloc_get_type(p, struct stop_node_callback_state);
1352 DEBUG(DEBUG_ERR,(__location__ " stopped event script failed (status %d)\n", status));
1353 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;
1354 if (status == -ETIME) {
1355 ctdb_ban_self(ctdb);
1359 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
1363 int32_t ctdb_control_stop_node(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
1366 struct stop_node_callback_state *state;
1368 DEBUG(DEBUG_INFO,(__location__ " Stopping node\n"));
1370 state = talloc(ctdb, struct stop_node_callback_state);
1371 CTDB_NO_MEMORY(ctdb, state);
1373 state->c = talloc_steal(state, c);
1375 ctdb_disable_monitoring(ctdb);
1377 ret = ctdb_event_script_callback(ctdb, state,
1378 ctdb_stop_node_callback,
1380 CTDB_EVENT_STOPPED, "%s", "");
1383 ctdb_enable_monitoring(ctdb);
1385 DEBUG(DEBUG_ERR,(__location__ " Failed to stop node\n"));
1390 ctdb->nodes[ctdb->pnn]->flags |= NODE_FLAGS_STOPPED;
1392 *async_reply = true;
1397 int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
1399 DEBUG(DEBUG_INFO,(__location__ " Continue node\n"));
1400 ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_STOPPED;