s3: Avoid a thundering herd in g_lock_unlock
[obnox/samba-ctdb.git] / source3 / lib / g_lock.c
1 /*
2    Unix SMB/CIFS implementation.
3    global locks based on dbwrap and messaging
4    Copyright (C) 2009 by Volker Lendecke
5
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "includes.h"
21 #include "g_lock.h"
22
23 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
24                                     struct server_id pid);
25
/*
 * Handle for the global lock subsystem. Created by g_lock_ctx_init().
 */
struct g_lock_ctx {
	/* Database holding the lock records ("g_lock.tdb") */
	struct db_context *db;
	/* Messaging context used for MSG_DBWRAP_G_LOCK_RETRY messages */
	struct messaging_context *msg;
};
30
31 /*
32  * The "g_lock.tdb" file contains records, indexed by the 0-terminated
33  * lockname. The record contains an array of "struct g_lock_rec"
34  * structures. Waiters have the lock_type with G_LOCK_PENDING or'ed.
35  */
36
37 struct g_lock_rec {
38         enum g_lock_type lock_type;
39         struct server_id pid;
40 };
41
42 struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
43                                    struct messaging_context *msg)
44 {
45         struct g_lock_ctx *result;
46
47         result = talloc(mem_ctx, struct g_lock_ctx);
48         if (result == NULL) {
49                 return NULL;
50         }
51         result->msg = msg;
52
53         result->db = db_open(result, lock_path("g_lock.tdb"), 0,
54                              TDB_CLEAR_IF_FIRST, O_RDWR|O_CREAT, 0700);
55         if (result->db == NULL) {
56                 DEBUG(1, ("g_lock_init: Could not open g_lock.tdb"));
57                 TALLOC_FREE(result);
58                 return NULL;
59         }
60         return result;
61 }
62
63 static bool g_lock_conflicts(enum g_lock_type lock_type,
64                              const struct g_lock_rec *rec)
65 {
66         enum g_lock_type rec_lock = rec->lock_type;
67
68         if ((rec_lock & G_LOCK_PENDING) != 0) {
69                 return false;
70         }
71
72         /*
73          * Only tested write locks so far. Very likely this routine
74          * needs to be fixed for read locks....
75          */
76         if ((lock_type == G_LOCK_READ) && (rec_lock == G_LOCK_READ)) {
77                 return false;
78         }
79         return true;
80 }
81
82 static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
83                          int *pnum_locks, struct g_lock_rec **plocks)
84 {
85         int i, num_locks;
86         struct g_lock_rec *locks;
87
88         if ((data.dsize % sizeof(struct g_lock_rec)) != 0) {
89                 DEBUG(1, ("invalid lock record length %d\n", (int)data.dsize));
90                 return false;
91         }
92
93         num_locks = data.dsize / sizeof(struct g_lock_rec);
94         locks = talloc_array(mem_ctx, struct g_lock_rec, num_locks);
95         if (locks == NULL) {
96                 DEBUG(1, ("talloc failed\n"));
97                 return false;
98         }
99
100         memcpy(locks, data.dptr, data.dsize);
101
102         DEBUG(10, ("locks:\n"));
103         for (i=0; i<num_locks; i++) {
104                 DEBUGADD(10, ("%s: %s %s\n",
105                               procid_str(debug_ctx(), &locks[i].pid),
106                               ((locks[i].lock_type & 1) == G_LOCK_READ) ?
107                               "read" : "write",
108                               (locks[i].lock_type & G_LOCK_PENDING) ?
109                               "(pending)" : "(owner)"));
110
111                 if (((locks[i].lock_type & G_LOCK_PENDING) == 0)
112                     && !process_exists(locks[i].pid)) {
113
114                         DEBUGADD(10, ("lock owner %s died -- discarding\n",
115                                       procid_str(talloc_tos(),
116                                                  &locks[i].pid)));
117
118                         if (i < (num_locks-1)) {
119                                 locks[i] = locks[num_locks-1];
120                         }
121                         num_locks -= 1;
122                 }
123         }
124
125         *plocks = locks;
126         *pnum_locks = num_locks;
127         return true;
128 }
129
130 static void g_lock_cleanup(int *pnum_locks, struct g_lock_rec *locks)
131 {
132         int i, num_locks;
133
134         num_locks = *pnum_locks;
135
136         DEBUG(10, ("g_lock_cleanup: %d locks\n", num_locks));
137
138         for (i=0; i<num_locks; i++) {
139                 if (process_exists(locks[i].pid)) {
140                         continue;
141                 }
142                 DEBUGADD(10, ("%s does not exist -- discarding\n",
143                               procid_str(debug_ctx(), &locks[i].pid)));
144
145                 if (i < (num_locks-1)) {
146                         locks[i] = locks[num_locks-1];
147                 }
148                 num_locks -= 1;
149         }
150         *pnum_locks = num_locks;
151         return;
152 }
153
154 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
155                                         struct g_lock_rec *locks,
156                                         int *pnum_locks,
157                                         const struct server_id pid,
158                                         enum g_lock_type lock_type)
159 {
160         struct g_lock_rec *result;
161         int num_locks = *pnum_locks;
162
163         result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
164                                 num_locks+1);
165         if (result == NULL) {
166                 return NULL;
167         }
168
169         result[num_locks].pid = pid;
170         result[num_locks].lock_type = lock_type;
171         *pnum_locks += 1;
172         return result;
173 }
174
175 static void g_lock_got_retry(struct messaging_context *msg,
176                              void *private_data,
177                              uint32_t msg_type,
178                              struct server_id server_id,
179                              DATA_BLOB *data);
180 static void g_lock_timedout(struct tevent_context *ev,
181                             struct tevent_timer *te,
182                             struct timeval current_time,
183                             void *private_data);
184
185 static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
186                                enum g_lock_type lock_type)
187 {
188         struct db_record *rec = NULL;
189         struct g_lock_rec *locks = NULL;
190         int i, num_locks;
191         struct server_id self;
192         int our_index;
193         TDB_DATA data;
194         NTSTATUS status = NT_STATUS_OK;
195         NTSTATUS store_status;
196
197 again:
198         rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
199                                     string_term_tdb_data(name));
200         if (rec == NULL) {
201                 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
202                 status = NT_STATUS_LOCK_NOT_GRANTED;
203                 goto done;
204         }
205
206         if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
207                 DEBUG(10, ("g_lock_parse for %s failed\n", name));
208                 status = NT_STATUS_INTERNAL_ERROR;
209                 goto done;
210         }
211
212         self = procid_self();
213         our_index = -1;
214
215         for (i=0; i<num_locks; i++) {
216                 if (procid_equal(&self, &locks[i].pid)) {
217                         if (our_index != -1) {
218                                 DEBUG(1, ("g_lock_trylock: Added ourself "
219                                           "twice!\n"));
220                                 status = NT_STATUS_INTERNAL_ERROR;
221                                 goto done;
222                         }
223                         if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
224                                 DEBUG(1, ("g_lock_trylock: Found ourself not "
225                                           "pending!\n"));
226                                 status = NT_STATUS_INTERNAL_ERROR;
227                                 goto done;
228                         }
229
230                         our_index = i;
231
232                         /* never conflict with ourself */
233                         continue;
234                 }
235                 if (g_lock_conflicts(lock_type, &locks[i])) {
236                         struct server_id pid = locks[i].pid;
237
238                         if (!process_exists(pid)) {
239                                 TALLOC_FREE(locks);
240                                 TALLOC_FREE(rec);
241                                 status = g_lock_force_unlock(ctx, name, pid);
242                                 if (!NT_STATUS_IS_OK(status)) {
243                                         DEBUG(1, ("Could not unlock dead lock "
244                                                   "holder!\n"));
245                                         goto done;
246                                 }
247                                 goto again;
248                         }
249                         lock_type |= G_LOCK_PENDING;
250                 }
251         }
252
253         if (our_index == -1) {
254                 /* First round, add ourself */
255
256                 locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
257                                       self, lock_type);
258                 if (locks == NULL) {
259                         DEBUG(10, ("g_lock_addrec failed\n"));
260                         status = NT_STATUS_NO_MEMORY;
261                         goto done;
262                 }
263         } else {
264                 /*
265                  * Retry. We were pending last time. Overwrite the
266                  * stored lock_type with what we calculated, we might
267                  * have acquired the lock this time.
268                  */
269                 locks[our_index].lock_type = lock_type;
270         }
271
272         if (NT_STATUS_IS_OK(status) && ((lock_type & G_LOCK_PENDING) == 0)) {
273                 /*
274                  * Walk through the list of locks, search for dead entries
275                  */
276                 g_lock_cleanup(&num_locks, locks);
277         }
278
279         data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
280         store_status = rec->store(rec, data, 0);
281         if (!NT_STATUS_IS_OK(store_status)) {
282                 DEBUG(1, ("rec->store failed: %s\n",
283                           nt_errstr(store_status)));
284                 status = store_status;
285         }
286
287 done:
288         TALLOC_FREE(locks);
289         TALLOC_FREE(rec);
290
291         if (NT_STATUS_IS_OK(status) && (lock_type & G_LOCK_PENDING) != 0) {
292                 return STATUS_PENDING;
293         }
294
295         return NT_STATUS_OK;
296 }
297
298 NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
299                      enum g_lock_type lock_type, struct timeval timeout)
300 {
301         struct tevent_timer *te = NULL;
302         NTSTATUS status;
303         bool retry = false;
304         struct timeval timeout_end;
305         struct timeval timeout_remaining;
306         struct timeval time_now;
307
308         DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
309                    name));
310
311         if (lock_type & ~1) {
312                 DEBUG(1, ("Got invalid lock type %d for %s\n",
313                           (int)lock_type, name));
314                 return NT_STATUS_INVALID_PARAMETER;
315         }
316
317 #ifdef CLUSTER_SUPPORT
318         if (lp_clustering()) {
319                 status = ctdb_watch_us(messaging_ctdbd_connection());
320                 if (!NT_STATUS_IS_OK(status)) {
321                         DEBUG(10, ("could not register retry with ctdb: %s\n",
322                                    nt_errstr(status)));
323                         goto done;
324                 }
325         }
326 #endif
327
328         status = messaging_register(ctx->msg, &retry, MSG_DBWRAP_G_LOCK_RETRY,
329                                     g_lock_got_retry);
330         if (!NT_STATUS_IS_OK(status)) {
331                 DEBUG(10, ("messaging_register failed: %s\n",
332                            nt_errstr(status)));
333                 return status;
334         }
335
336         time_now = timeval_current();
337         timeout_end = timeval_sum(&time_now, &timeout);
338
339         while (true) {
340                 fd_set _r_fds;
341                 fd_set *r_fds = NULL;
342                 int max_fd = 0;
343                 int ret;
344
345                 status = g_lock_trylock(ctx, name, lock_type);
346                 if (NT_STATUS_IS_OK(status)) {
347                         DEBUG(10, ("Got lock %s\n", name));
348                         break;
349                 }
350                 if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
351                         DEBUG(10, ("g_lock_trylock failed: %s\n",
352                                    nt_errstr(status)));
353                         break;
354                 }
355
356                 DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
357
358                 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
359                  *             !!! HACK ALERT --- FIX ME !!!
360                  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
361                  * What we really want to do here is to react to
362                  * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
363                  * by a client doing g_lock_unlock or by ourselves when
364                  * we receive a CTDB_SRVID_SAMBA_NOTIFY or
365                  * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
366                  * either a client holding a lock or a complete node
367                  * has died.
368                  *
369                  * Doing this properly involves calling tevent_loop_once(),
370                  * but doing this here with the main ctdbd messaging context
371                  * creates a nested event loop when g_lock_lock() is called
372                  * from the main event loop, e.g. in a tcon_and_X where the
373                  * share_info.tdb needs to be initialized and is locked by
374                  * another process, or when the remore registry is accessed
375                  * for writing and some other process already holds a lock
376                  * on the registry.tdb.
377                  *
378                  * So as a quick fix, we act a little coarsely here: we do
379                  * a select on the ctdb connection fd and when it is readable
380                  * or we get EINTR, then we retry without actually parsing
381                  * any ctdb packages or dispatching messages. This means that
382                  * we retry more often than intended by design, but this does
383                  * not harm and it is unobtrusive. When we have finished,
384                  * the main loop will pick up all the messages and ctdb
385                  * packets. The only extra twist is that we cannot use timed
386                  * events here but have to handcode a timeout.
387                  */
388
389 #ifdef CLUSTER_SUPPORT
390                 if (lp_clustering()) {
391                         struct ctdbd_connection *conn = messaging_ctdbd_connection();
392
393                         r_fds = &_r_fds;
394                         FD_ZERO(r_fds);
395                         max_fd = ctdbd_conn_get_fd(conn);
396                         FD_SET(max_fd, r_fds);
397                 }
398 #endif
399
400                 time_now = timeval_current();
401                 timeout_remaining = timeval_until(&time_now, &timeout_end);
402
403                 ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
404                                  &timeout_remaining);
405
406                 if (ret == -1) {
407                         if (errno != EINTR) {
408                                 DEBUG(1, ("error calling select: %s\n",
409                                           strerror(errno)));
410                                 status = NT_STATUS_INTERNAL_ERROR;
411                                 break;
412                         }
413                         /*
414                          * errno == EINTR:
415                          * This means a signal was received.
416                          * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
417                          * ==> retry
418                          */
419                 } else if (ret == 0) {
420                         if (timeval_expired(&timeout_end)) {
421                                 DEBUG(10, ("g_lock_lock timed out\n"));
422                                 status = NT_STATUS_LOCK_NOT_GRANTED;
423                                 break;
424                         } else {
425                                 DEBUG(10, ("select returned 0 but timeout not "
426                                            "not expired: strange - retrying\n"));
427                         }
428                 } else if (ret != 1) {
429                         DEBUG(1, ("invalid return code of select: %d\n", ret));
430                         status = NT_STATUS_INTERNAL_ERROR;
431                         break;
432                 }
433                 /*
434                  * ret == 1:
435                  * This means ctdbd has sent us some data.
436                  * Might be a CTDB_SRVID_RECONFIGURE or a
437                  * CTDB_SRVID_SAMBA_NOTIFY message.
438                  * ==> retry
439                  */
440         }
441
442 done:
443
444         if (!NT_STATUS_IS_OK(status)) {
445                 NTSTATUS unlock_status;
446
447                 unlock_status = g_lock_unlock(ctx, name);
448
449                 if (!NT_STATUS_IS_OK(unlock_status)) {
450                         DEBUG(1, ("Could not remove ourself from the locking "
451                                   "db: %s\n", nt_errstr(status)));
452                 }
453         }
454
455         messaging_deregister(ctx->msg, MSG_DBWRAP_G_LOCK_RETRY, &retry);
456         TALLOC_FREE(te);
457
458         return status;
459 }
460
461 static void g_lock_got_retry(struct messaging_context *msg,
462                              void *private_data,
463                              uint32_t msg_type,
464                              struct server_id server_id,
465                              DATA_BLOB *data)
466 {
467         bool *pretry = (bool *)private_data;
468
469         DEBUG(10, ("Got retry message from pid %s\n",
470                    procid_str(talloc_tos(), &server_id)));
471
472         *pretry = true;
473 }
474
/*
 * Timer handler: flag the timeout in the "bool" passed as private_data
 * and free the timer event.
 */
static void g_lock_timedout(struct tevent_context *ev,
			    struct tevent_timer *te,
			    struct timeval current_time,
			    void *private_data)
{
	bool *timed_out_flag = (bool *)private_data;

	*timed_out_flag = true;
	TALLOC_FREE(te);
}
484
485 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
486                                     struct server_id pid)
487 {
488         struct db_record *rec = NULL;
489         struct g_lock_rec *locks = NULL;
490         int i, num_locks;
491         enum g_lock_type lock_type;
492         NTSTATUS status;
493
494         rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
495                                     string_term_tdb_data(name));
496         if (rec == NULL) {
497                 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
498                 status = NT_STATUS_INTERNAL_ERROR;
499                 goto done;
500         }
501
502         if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
503                 DEBUG(10, ("g_lock_parse for %s failed\n", name));
504                 status = NT_STATUS_INTERNAL_ERROR;
505                 goto done;
506         }
507
508         for (i=0; i<num_locks; i++) {
509                 if (procid_equal(&pid, &locks[i].pid)) {
510                         break;
511                 }
512         }
513
514         if (i == num_locks) {
515                 DEBUG(10, ("g_lock_force_unlock: Lock not found\n"));
516                 status = NT_STATUS_INTERNAL_ERROR;
517                 goto done;
518         }
519
520         lock_type = locks[i].lock_type;
521
522         if (i < (num_locks-1)) {
523                 locks[i] = locks[num_locks-1];
524         }
525         num_locks -= 1;
526
527         if (num_locks == 0) {
528                 status = rec->delete_rec(rec);
529         } else {
530                 TDB_DATA data;
531                 data = make_tdb_data((uint8_t *)locks,
532                                      sizeof(struct g_lock_rec) * num_locks);
533                 status = rec->store(rec, data, 0);
534         }
535
536         if (!NT_STATUS_IS_OK(status)) {
537                 DEBUG(1, ("g_lock_force_unlock: Could not store record: %s\n",
538                           nt_errstr(status)));
539                 goto done;
540         }
541
542         if ((lock_type & G_LOCK_PENDING) == 0) {
543                 int num_wakeups = 0;
544
545                 /*
546                  * We've been the lock holder. Others to retry. Don't
547                  * tell all others to avoid a thundering herd. In case
548                  * this leads to a complete stall because we miss some
549                  * processes, the loop in g_lock_lock tries at least
550                  * once a minute.
551                  */
552
553                 for (i=0; i<num_locks; i++) {
554                         if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
555                                 continue;
556                         }
557                         if (!process_exists(locks[i].pid)) {
558                                 continue;
559                         }
560
561                         /*
562                          * Ping all waiters to retry
563                          */
564                         status = messaging_send(ctx->msg, locks[i].pid,
565                                                 MSG_DBWRAP_G_LOCK_RETRY,
566                                                 &data_blob_null);
567                         if (!NT_STATUS_IS_OK(status)) {
568                                 DEBUG(1, ("sending retry to %s failed: %s\n",
569                                           procid_str(debug_ctx(),
570                                                      &locks[i].pid),
571                                           nt_errstr(status)));
572                         } else {
573                                 num_wakeups += 1;
574                         }
575                         if (num_wakeups > 5) {
576                                 break;
577                         }
578                 }
579         }
580 done:
581
582         TALLOC_FREE(locks);
583         TALLOC_FREE(rec);
584         return status;
585 }
586
587 NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name)
588 {
589         NTSTATUS status;
590
591         status = g_lock_force_unlock(ctx, name, procid_self());
592
593 #ifdef CLUSTER_SUPPORT
594         if (lp_clustering()) {
595                 ctdb_unwatch(messaging_ctdbd_connection());
596         }
597 #endif
598         return status;
599 }
600
/*
 * Traverse state of g_lock_locks(): the caller's callback together with
 * its opaque argument.
 */
struct g_lock_locks_state {
	/* Called with the name of each lock found in the database */
	int (*fn)(const char *name, void *private_data);
	/* Passed through to fn unchanged */
	void *private_data;
};
605
606 static int g_lock_locks_fn(struct db_record *rec, void *priv)
607 {
608         struct g_lock_locks_state *state = (struct g_lock_locks_state *)priv;
609
610         if ((rec->key.dsize == 0) || (rec->key.dptr[rec->key.dsize-1] != 0)) {
611                 DEBUG(1, ("invalid key in g_lock.tdb, ignoring\n"));
612                 return 0;
613         }
614         return state->fn((char *)rec->key.dptr, state->private_data);
615 }
616
617 int g_lock_locks(struct g_lock_ctx *ctx,
618                  int (*fn)(const char *name, void *private_data),
619                  void *private_data)
620 {
621         struct g_lock_locks_state state;
622
623         state.fn = fn;
624         state.private_data = private_data;
625
626         return ctx->db->traverse_read(ctx->db, g_lock_locks_fn, &state);
627 }
628
629 NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, const char *name,
630                      int (*fn)(struct server_id pid,
631                                enum g_lock_type lock_type,
632                                void *private_data),
633                      void *private_data)
634 {
635         TDB_DATA data;
636         int i, num_locks;
637         struct g_lock_rec *locks = NULL;
638         bool ret;
639
640         if (ctx->db->fetch(ctx->db, talloc_tos(), string_term_tdb_data(name),
641                            &data) != 0) {
642                 return NT_STATUS_NOT_FOUND;
643         }
644
645         if ((data.dsize == 0) || (data.dptr == NULL)) {
646                 return NT_STATUS_OK;
647         }
648
649         ret = g_lock_parse(talloc_tos(), data, &num_locks, &locks);
650
651         TALLOC_FREE(data.dptr);
652
653         if (!ret) {
654                 DEBUG(10, ("g_lock_parse for %s failed\n", name));
655                 return NT_STATUS_INTERNAL_ERROR;
656         }
657
658         for (i=0; i<num_locks; i++) {
659                 if (fn(locks[i].pid, locks[i].lock_type, private_data) != 0) {
660                         break;
661                 }
662         }
663         TALLOC_FREE(locks);
664         return NT_STATUS_OK;
665 }
666
/*
 * State shared between g_lock_get() and its g_lock_dump() callback.
 */
struct g_lock_get_state {
	/* Set to true once a lock holder has been seen */
	bool found;
	/* Where to store the holder's server_id */
	struct server_id *pid;
};
671
672 static int g_lock_get_fn(struct server_id pid, enum g_lock_type lock_type,
673                          void *priv)
674 {
675         struct g_lock_get_state *state = (struct g_lock_get_state *)priv;
676
677         if ((lock_type & G_LOCK_PENDING) != 0) {
678                 return 0;
679         }
680
681         state->found = true;
682         *state->pid = pid;
683         return 1;
684 }
685
686 NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
687                     struct server_id *pid)
688 {
689         struct g_lock_get_state state;
690         NTSTATUS status;
691
692         state.found = false;
693         state.pid = pid;
694
695         status = g_lock_dump(ctx, name, g_lock_get_fn, &state);
696         if (!NT_STATUS_IS_OK(status)) {
697                 return status;
698         }
699         if (!state.found) {
700                 return NT_STATUS_NOT_FOUND;
701         }
702         return NT_STATUS_OK;
703 }