42c0397189d707154376f2db6995a379cf565523
[abartlet/samba.git/.git] / source3 / lib / g_lock.c
1 /*
2    Unix SMB/CIFS implementation.
3    global locks based on dbwrap and messaging
4    Copyright (C) 2009 by Volker Lendecke
5
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "includes.h"
21 #include "g_lock.h"
22
23 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
24                                     struct server_id pid);
25
/*
 * Per-caller handle for the global lock subsystem. Created by
 * g_lock_ctx_init().
 */
struct g_lock_ctx {
        /* Database holding one record per lock name ("g_lock.tdb") */
        struct db_context *db;
        /* Messaging context used to send and receive retry messages */
        struct messaging_context *msg;
};
30
31 /*
32  * The "g_lock.tdb" file contains records, indexed by the 0-terminated
33  * lockname. The record contains an array of "struct g_lock_rec"
34  * structures. Waiters have the lock_type with G_LOCK_PENDING or'ed.
35  */
36
37 struct g_lock_rec {
38         enum g_lock_type lock_type;
39         struct server_id pid;
40 };
41
42 struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
43                                    struct messaging_context *msg)
44 {
45         struct g_lock_ctx *result;
46
47         result = talloc(mem_ctx, struct g_lock_ctx);
48         if (result == NULL) {
49                 return NULL;
50         }
51         result->msg = msg;
52
53         result->db = db_open(result, lock_path("g_lock.tdb"), 0,
54                              TDB_CLEAR_IF_FIRST, O_RDWR|O_CREAT, 0700);
55         if (result->db == NULL) {
56                 DEBUG(1, ("g_lock_init: Could not open g_lock.tdb"));
57                 TALLOC_FREE(result);
58                 return NULL;
59         }
60         return result;
61 }
62
63 static bool g_lock_conflicts(enum g_lock_type lock_type,
64                              const struct g_lock_rec *rec)
65 {
66         enum g_lock_type rec_lock = rec->lock_type;
67
68         if ((rec_lock & G_LOCK_PENDING) != 0) {
69                 return false;
70         }
71
72         /*
73          * Only tested write locks so far. Very likely this routine
74          * needs to be fixed for read locks....
75          */
76         if ((lock_type == G_LOCK_READ) && (rec_lock == G_LOCK_READ)) {
77                 return false;
78         }
79         return true;
80 }
81
82 static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
83                          int *pnum_locks, struct g_lock_rec **plocks)
84 {
85         int i, num_locks;
86         struct g_lock_rec *locks;
87
88         if ((data.dsize % sizeof(struct g_lock_rec)) != 0) {
89                 DEBUG(1, ("invalid lock record length %d\n", (int)data.dsize));
90                 return false;
91         }
92
93         num_locks = data.dsize / sizeof(struct g_lock_rec);
94         locks = talloc_array(mem_ctx, struct g_lock_rec, num_locks);
95         if (locks == NULL) {
96                 DEBUG(1, ("talloc failed\n"));
97                 return false;
98         }
99
100         memcpy(locks, data.dptr, data.dsize);
101
102         DEBUG(10, ("locks:\n"));
103         for (i=0; i<num_locks; i++) {
104                 DEBUGADD(10, ("%s: %s %s\n",
105                               procid_str(talloc_tos(), &locks[i].pid),
106                               ((locks[i].lock_type & 1) == G_LOCK_READ) ?
107                               "read" : "write",
108                               (locks[i].lock_type & G_LOCK_PENDING) ?
109                               "(pending)" : "(owner)"));
110
111                 if (process_exists(locks[i].pid)) {
112                         continue;
113                 }
114                 DEBUGADD(10, ("%s does not exist -- discarding\n",
115                               procid_str(talloc_tos(), &locks[i].pid)));
116
117                 if (i < (num_locks-1)) {
118                         locks[i] = locks[num_locks-1];
119                 }
120                 num_locks -= 1;
121         }
122
123         *plocks = locks;
124         *pnum_locks = num_locks;
125         return true;
126 }
127
128 static struct g_lock_rec *g_lock_addrec(TALLOC_CTX *mem_ctx,
129                                         struct g_lock_rec *locks,
130                                         int *pnum_locks,
131                                         const struct server_id pid,
132                                         enum g_lock_type lock_type)
133 {
134         struct g_lock_rec *result;
135         int num_locks = *pnum_locks;
136
137         result = talloc_realloc(mem_ctx, locks, struct g_lock_rec,
138                                 num_locks+1);
139         if (result == NULL) {
140                 return NULL;
141         }
142
143         result[num_locks].pid = pid;
144         result[num_locks].lock_type = lock_type;
145         *pnum_locks += 1;
146         return result;
147 }
148
149 static void g_lock_got_retry(struct messaging_context *msg,
150                              void *private_data,
151                              uint32_t msg_type,
152                              struct server_id server_id,
153                              DATA_BLOB *data);
154
155 static NTSTATUS g_lock_trylock(struct g_lock_ctx *ctx, const char *name,
156                                enum g_lock_type lock_type)
157 {
158         struct db_record *rec = NULL;
159         struct g_lock_rec *locks = NULL;
160         int i, num_locks;
161         struct server_id self;
162         int our_index;
163         TDB_DATA data;
164         NTSTATUS status = NT_STATUS_OK;
165         NTSTATUS store_status;
166
167 again:
168         rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
169                                     string_term_tdb_data(name));
170         if (rec == NULL) {
171                 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
172                 status = NT_STATUS_LOCK_NOT_GRANTED;
173                 goto done;
174         }
175
176         if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
177                 DEBUG(10, ("g_lock_parse for %s failed\n", name));
178                 status = NT_STATUS_INTERNAL_ERROR;
179                 goto done;
180         }
181
182         self = procid_self();
183         our_index = -1;
184
185         for (i=0; i<num_locks; i++) {
186                 if (procid_equal(&self, &locks[i].pid)) {
187                         if (our_index != -1) {
188                                 DEBUG(1, ("g_lock_trylock: Added ourself "
189                                           "twice!\n"));
190                                 status = NT_STATUS_INTERNAL_ERROR;
191                                 goto done;
192                         }
193                         if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
194                                 DEBUG(1, ("g_lock_trylock: Found ourself not "
195                                           "pending!\n"));
196                                 status = NT_STATUS_INTERNAL_ERROR;
197                                 goto done;
198                         }
199
200                         our_index = i;
201
202                         /* never conflict with ourself */
203                         continue;
204                 }
205                 if (g_lock_conflicts(lock_type, &locks[i])) {
206                         struct server_id pid = locks[i].pid;
207
208                         if (!process_exists(pid)) {
209                                 TALLOC_FREE(locks);
210                                 TALLOC_FREE(rec);
211                                 status = g_lock_force_unlock(ctx, name, pid);
212                                 if (!NT_STATUS_IS_OK(status)) {
213                                         DEBUG(1, ("Could not unlock dead lock "
214                                                   "holder!\n"));
215                                         goto done;
216                                 }
217                                 goto again;
218                         }
219                         lock_type |= G_LOCK_PENDING;
220                 }
221         }
222
223         if (our_index == -1) {
224                 /* First round, add ourself */
225
226                 locks = g_lock_addrec(talloc_tos(), locks, &num_locks,
227                                       self, lock_type);
228                 if (locks == NULL) {
229                         DEBUG(10, ("g_lock_addrec failed\n"));
230                         status = NT_STATUS_NO_MEMORY;
231                         goto done;
232                 }
233         } else {
234                 /*
235                  * Retry. We were pending last time. Overwrite the
236                  * stored lock_type with what we calculated, we might
237                  * have acquired the lock this time.
238                  */
239                 locks[our_index].lock_type = lock_type;
240         }
241
242         data = make_tdb_data((uint8_t *)locks, num_locks * sizeof(*locks));
243         store_status = rec->store(rec, data, 0);
244         if (!NT_STATUS_IS_OK(store_status)) {
245                 DEBUG(1, ("rec->store failed: %s\n",
246                           nt_errstr(store_status)));
247                 status = store_status;
248         }
249
250 done:
251         TALLOC_FREE(locks);
252         TALLOC_FREE(rec);
253
254         if (NT_STATUS_IS_OK(status) && (lock_type & G_LOCK_PENDING) != 0) {
255                 return STATUS_PENDING;
256         }
257
258         return NT_STATUS_OK;
259 }
260
261 NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, const char *name,
262                      enum g_lock_type lock_type, struct timeval timeout)
263 {
264         struct tevent_timer *te = NULL;
265         NTSTATUS status;
266         bool retry = false;
267         struct timeval timeout_end;
268         struct timeval timeout_remaining;
269         struct timeval time_now;
270
271         DEBUG(10, ("Trying to acquire lock %d for %s\n", (int)lock_type,
272                    name));
273
274         if (lock_type & ~1) {
275                 DEBUG(1, ("Got invalid lock type %d for %s\n",
276                           (int)lock_type, name));
277                 return NT_STATUS_INVALID_PARAMETER;
278         }
279
280 #ifdef CLUSTER_SUPPORT
281         if (lp_clustering()) {
282                 status = ctdb_watch_us(messaging_ctdbd_connection());
283                 if (!NT_STATUS_IS_OK(status)) {
284                         DEBUG(10, ("could not register retry with ctdb: %s\n",
285                                    nt_errstr(status)));
286                         goto done;
287                 }
288         }
289 #endif
290
291         status = messaging_register(ctx->msg, &retry, MSG_DBWRAP_G_LOCK_RETRY,
292                                     g_lock_got_retry);
293         if (!NT_STATUS_IS_OK(status)) {
294                 DEBUG(10, ("messaging_register failed: %s\n",
295                            nt_errstr(status)));
296                 return status;
297         }
298
299         time_now = timeval_current();
300         timeout_end = timeval_sum(&time_now, &timeout);
301
302         while (true) {
303 #ifdef CLUSTER_SUPPORT
304                 fd_set _r_fds;
305 #endif
306                 fd_set *r_fds = NULL;
307                 int max_fd = 0;
308                 int ret;
309
310                 status = g_lock_trylock(ctx, name, lock_type);
311                 if (NT_STATUS_IS_OK(status)) {
312                         DEBUG(10, ("Got lock %s\n", name));
313                         break;
314                 }
315                 if (!NT_STATUS_EQUAL(status, STATUS_PENDING)) {
316                         DEBUG(10, ("g_lock_trylock failed: %s\n",
317                                    nt_errstr(status)));
318                         break;
319                 }
320
321                 DEBUG(10, ("g_lock_trylock: Did not get lock, waiting...\n"));
322
323                 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
324                  *             !!! HACK ALERT --- FIX ME !!!
325                  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
326                  * What we really want to do here is to react to
327                  * MSG_DBWRAP_G_LOCK_RETRY messages that are either sent
328                  * by a client doing g_lock_unlock or by ourselves when
329                  * we receive a CTDB_SRVID_SAMBA_NOTIFY or
330                  * CTDB_SRVID_RECONFIGURE message from ctdbd, i.e. when
331                  * either a client holding a lock or a complete node
332                  * has died.
333                  *
334                  * Doing this properly involves calling tevent_loop_once(),
335                  * but doing this here with the main ctdbd messaging context
336                  * creates a nested event loop when g_lock_lock() is called
337                  * from the main event loop, e.g. in a tcon_and_X where the
338                  * share_info.tdb needs to be initialized and is locked by
339                  * another process, or when the remore registry is accessed
340                  * for writing and some other process already holds a lock
341                  * on the registry.tdb.
342                  *
343                  * So as a quick fix, we act a little coarsely here: we do
344                  * a select on the ctdb connection fd and when it is readable
345                  * or we get EINTR, then we retry without actually parsing
346                  * any ctdb packages or dispatching messages. This means that
347                  * we retry more often than intended by design, but this does
348                  * not harm and it is unobtrusive. When we have finished,
349                  * the main loop will pick up all the messages and ctdb
350                  * packets. The only extra twist is that we cannot use timed
351                  * events here but have to handcode a timeout.
352                  */
353
354 #ifdef CLUSTER_SUPPORT
355                 if (lp_clustering()) {
356                         struct ctdbd_connection *conn = messaging_ctdbd_connection();
357
358                         r_fds = &_r_fds;
359                         FD_ZERO(r_fds);
360                         max_fd = ctdbd_conn_get_fd(conn);
361                         FD_SET(max_fd, r_fds);
362                 }
363 #endif
364
365                 time_now = timeval_current();
366                 timeout_remaining = timeval_until(&time_now, &timeout_end);
367
368                 ret = sys_select(max_fd + 1, r_fds, NULL, NULL,
369                                  &timeout_remaining);
370
371                 if (ret == -1) {
372                         if (errno != EINTR) {
373                                 DEBUG(1, ("error calling select: %s\n",
374                                           strerror(errno)));
375                                 status = NT_STATUS_INTERNAL_ERROR;
376                                 break;
377                         }
378                         /*
379                          * errno == EINTR:
380                          * This means a signal was received.
381                          * It might have been a MSG_DBWRAP_G_LOCK_RETRY message.
382                          * ==> retry
383                          */
384                 } else if (ret == 0) {
385                         if (timeval_expired(&timeout_end)) {
386                                 DEBUG(10, ("g_lock_lock timed out\n"));
387                                 status = NT_STATUS_LOCK_NOT_GRANTED;
388                                 break;
389                         } else {
390                                 DEBUG(10, ("select returned 0 but timeout not "
391                                            "not expired: strange - retrying\n"));
392                         }
393                 } else if (ret != 1) {
394                         DEBUG(1, ("invalid return code of select: %d\n", ret));
395                         status = NT_STATUS_INTERNAL_ERROR;
396                         break;
397                 }
398                 /*
399                  * ret == 1:
400                  * This means ctdbd has sent us some data.
401                  * Might be a CTDB_SRVID_RECONFIGURE or a
402                  * CTDB_SRVID_SAMBA_NOTIFY message.
403                  * ==> retry
404                  */
405         }
406
407 #ifdef CLUSTER_SUPPORT
408 done:
409 #endif
410
411         if (!NT_STATUS_IS_OK(status)) {
412                 NTSTATUS unlock_status;
413
414                 unlock_status = g_lock_unlock(ctx, name);
415
416                 if (!NT_STATUS_IS_OK(unlock_status)) {
417                         DEBUG(1, ("Could not remove ourself from the locking "
418                                   "db: %s\n", nt_errstr(status)));
419                 }
420         }
421
422         messaging_deregister(ctx->msg, MSG_DBWRAP_G_LOCK_RETRY, &retry);
423         TALLOC_FREE(te);
424
425         return status;
426 }
427
428 static void g_lock_got_retry(struct messaging_context *msg,
429                              void *private_data,
430                              uint32_t msg_type,
431                              struct server_id server_id,
432                              DATA_BLOB *data)
433 {
434         bool *pretry = (bool *)private_data;
435
436         DEBUG(10, ("Got retry message from pid %s\n",
437                    procid_str(talloc_tos(), &server_id)));
438
439         *pretry = true;
440 }
441
442 static NTSTATUS g_lock_force_unlock(struct g_lock_ctx *ctx, const char *name,
443                                     struct server_id pid)
444 {
445         struct db_record *rec = NULL;
446         struct g_lock_rec *locks = NULL;
447         int i, num_locks;
448         enum g_lock_type lock_type;
449         NTSTATUS status;
450
451         rec = ctx->db->fetch_locked(ctx->db, talloc_tos(),
452                                     string_term_tdb_data(name));
453         if (rec == NULL) {
454                 DEBUG(10, ("fetch_locked(\"%s\") failed\n", name));
455                 status = NT_STATUS_INTERNAL_ERROR;
456                 goto done;
457         }
458
459         if (!g_lock_parse(talloc_tos(), rec->value, &num_locks, &locks)) {
460                 DEBUG(10, ("g_lock_parse for %s failed\n", name));
461                 status = NT_STATUS_INTERNAL_ERROR;
462                 goto done;
463         }
464
465         for (i=0; i<num_locks; i++) {
466                 if (procid_equal(&pid, &locks[i].pid)) {
467                         break;
468                 }
469         }
470
471         if (i == num_locks) {
472                 DEBUG(10, ("g_lock_force_unlock: Lock not found\n"));
473                 status = NT_STATUS_INTERNAL_ERROR;
474                 goto done;
475         }
476
477         lock_type = locks[i].lock_type;
478
479         if (i < (num_locks-1)) {
480                 locks[i] = locks[num_locks-1];
481         }
482         num_locks -= 1;
483
484         if (num_locks == 0) {
485                 status = rec->delete_rec(rec);
486         } else {
487                 TDB_DATA data;
488                 data = make_tdb_data((uint8_t *)locks,
489                                      sizeof(struct g_lock_rec) * num_locks);
490                 status = rec->store(rec, data, 0);
491         }
492
493         if (!NT_STATUS_IS_OK(status)) {
494                 DEBUG(1, ("g_lock_force_unlock: Could not store record: %s\n",
495                           nt_errstr(status)));
496                 goto done;
497         }
498
499         if ((lock_type & G_LOCK_PENDING) == 0) {
500                 /*
501                  * We've been the lock holder. Tell all others to retry.
502                  */
503                 for (i=0; i<num_locks; i++) {
504                         if ((locks[i].lock_type & G_LOCK_PENDING) == 0) {
505                                 continue;
506                         }
507
508                         /*
509                          * Ping all waiters to retry
510                          */
511                         status = messaging_send(ctx->msg, locks[i].pid,
512                                                 MSG_DBWRAP_G_LOCK_RETRY,
513                                                 &data_blob_null);
514                         if (!NT_STATUS_IS_OK(status)) {
515                                 DEBUG(1, ("sending retry to %s failed: %s\n",
516                                           procid_str(talloc_tos(),
517                                                      &locks[i].pid),
518                                           nt_errstr(status)));
519                         }
520                 }
521         }
522 done:
523
524         TALLOC_FREE(locks);
525         TALLOC_FREE(rec);
526         return status;
527 }
528
529 NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, const char *name)
530 {
531         NTSTATUS status;
532
533         status = g_lock_force_unlock(ctx, name, procid_self());
534
535 #ifdef CLUSTER_SUPPORT
536         if (lp_clustering()) {
537                 ctdb_unwatch(messaging_ctdbd_connection());
538         }
539 #endif
540         return status;
541 }
542
/*
 * Carries the caller's callback and its argument through the database
 * traverse started by g_lock_locks().
 */
struct g_lock_locks_state {
        /* Called once per lock name */
        int (*fn)(const char *name, void *private_data);
        /* Passed unchanged to fn */
        void *private_data;
};
547
548 static int g_lock_locks_fn(struct db_record *rec, void *priv)
549 {
550         struct g_lock_locks_state *state = (struct g_lock_locks_state *)priv;
551
552         if ((rec->key.dsize == 0) || (rec->key.dptr[rec->key.dsize-1] != 0)) {
553                 DEBUG(1, ("invalid key in g_lock.tdb, ignoring\n"));
554                 return 0;
555         }
556         return state->fn((char *)rec->key.dptr, state->private_data);
557 }
558
559 int g_lock_locks(struct g_lock_ctx *ctx,
560                  int (*fn)(const char *name, void *private_data),
561                  void *private_data)
562 {
563         struct g_lock_locks_state state;
564
565         state.fn = fn;
566         state.private_data = private_data;
567
568         return ctx->db->traverse_read(ctx->db, g_lock_locks_fn, &state);
569 }
570
571 NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, const char *name,
572                      int (*fn)(struct server_id pid,
573                                enum g_lock_type lock_type,
574                                void *private_data),
575                      void *private_data)
576 {
577         TDB_DATA data;
578         int i, num_locks;
579         struct g_lock_rec *locks = NULL;
580         bool ret;
581
582         if (ctx->db->fetch(ctx->db, talloc_tos(), string_term_tdb_data(name),
583                            &data) != 0) {
584                 return NT_STATUS_NOT_FOUND;
585         }
586
587         if ((data.dsize == 0) || (data.dptr == NULL)) {
588                 return NT_STATUS_OK;
589         }
590
591         ret = g_lock_parse(talloc_tos(), data, &num_locks, &locks);
592
593         TALLOC_FREE(data.dptr);
594
595         if (!ret) {
596                 DEBUG(10, ("g_lock_parse for %s failed\n", name));
597                 return NT_STATUS_INTERNAL_ERROR;
598         }
599
600         for (i=0; i<num_locks; i++) {
601                 if (fn(locks[i].pid, locks[i].lock_type, private_data) != 0) {
602                         break;
603                 }
604         }
605         TALLOC_FREE(locks);
606         return NT_STATUS_OK;
607 }
608
/*
 * Result carrier for g_lock_get(), filled in by g_lock_get_fn().
 */
struct g_lock_get_state {
        /* Set to true once a non-pending entry has been seen */
        bool found;
        /* Where to store the process id of that entry */
        struct server_id *pid;
};
613
614 static int g_lock_get_fn(struct server_id pid, enum g_lock_type lock_type,
615                          void *priv)
616 {
617         struct g_lock_get_state *state = (struct g_lock_get_state *)priv;
618
619         if ((lock_type & G_LOCK_PENDING) != 0) {
620                 return 0;
621         }
622
623         state->found = true;
624         *state->pid = pid;
625         return 1;
626 }
627
628 NTSTATUS g_lock_get(struct g_lock_ctx *ctx, const char *name,
629                     struct server_id *pid)
630 {
631         struct g_lock_get_state state;
632         NTSTATUS status;
633
634         state.found = false;
635         state.pid = pid;
636
637         status = g_lock_dump(ctx, name, g_lock_get_fn, &state);
638         if (!NT_STATUS_IS_OK(status)) {
639                 return status;
640         }
641         if (!state.found) {
642                 return NT_STATUS_NOT_FOUND;
643         }
644         return NT_STATUS_OK;
645 }