s3: Fix a winbind race leading to 100% CPU
[samba.git] / source3 / winbindd / winbindd_dual.c
index 904d3e4ed3d307fd1e7f4d9b8c88788de4fb1600..2c0633c3eaa2673bb7c3a5c1db02d623a0ba5534 100644 (file)
@@ -37,6 +37,8 @@
 extern bool override_logfile;
 extern struct winbindd_methods cache_methods;
 
+static struct winbindd_child *children = NULL;
+
 /* Read some data from a client connection */
 
 static NTSTATUS child_read_request(struct winbindd_cli_state *state)
@@ -134,7 +136,7 @@ static void wb_child_request_trigger(struct tevent_req *req,
                req, struct wb_child_request_state);
        struct tevent_req *subreq;
 
-       if ((state->child->pid == 0) && (!fork_domain_child(state->child))) {
+       if ((state->child->sock == -1) && (!fork_domain_child(state->child))) {
                tevent_req_error(req, errno);
                return;
        }
@@ -164,6 +166,13 @@ static void wb_child_request_done(struct tevent_req *subreq)
        ret = wb_simple_trans_recv(subreq, state, &state->response, &err);
        TALLOC_FREE(subreq);
        if (ret == -1) {
+               /*
+                * The basic parent/child communication broke, close
+                * our socket
+                */
+               close(state->child->sock);
+               state->child->sock = -1;
+               DLIST_REMOVE(children, state->child);
                tevent_req_error(req, err);
                return;
        }
@@ -367,85 +376,6 @@ int wb_domain_request_recv(struct tevent_req *req, TALLOC_CTX *mem_ctx,
        return 0;
 }
 
-/*
- * Machinery for async requests sent to children. You set up a
- * winbindd_request, select a child to query, and issue a async_request
- * call. When the request is completed, the callback function you specified is
- * called back with the private pointer you gave to async_request.
- */
-
-struct winbindd_async_request {
-       struct winbindd_async_request *next, *prev;
-       TALLOC_CTX *mem_ctx;
-       struct winbindd_child *child;
-       struct winbindd_response *response;
-       void (*continuation)(void *private_data, bool success);
-       struct timed_event *reply_timeout_event;
-       pid_t child_pid; /* pid of the child we're waiting on. Used to detect
-                           a restart of the child (child->pid != child_pid). */
-       void *private_data;
-};
-
-static void async_request_done(struct tevent_req *req);
-
-void async_request(TALLOC_CTX *mem_ctx, struct winbindd_child *child,
-                  struct winbindd_request *request,
-                  struct winbindd_response *response,
-                  void (*continuation)(void *private_data, bool success),
-                  void *private_data)
-{
-       struct winbindd_async_request *state;
-       struct tevent_req *req;
-
-       DEBUG(10, ("Sending request to child pid %d (domain=%s)\n",
-                  (int)child->pid,
-                  (child->domain != NULL) ? child->domain->name : "''"));
-
-       state = talloc(mem_ctx, struct winbindd_async_request);
-       if (state == NULL) {
-               DEBUG(0, ("talloc failed\n"));
-               continuation(private_data, False);
-               return;
-       }
-
-       state->mem_ctx = mem_ctx;
-       state->child = child;
-       state->reply_timeout_event = NULL;
-       state->response = response;
-       state->continuation = continuation;
-       state->private_data = private_data;
-
-       request->pid = child->pid;
-
-       req = wb_child_request_send(state, winbind_event_context(),
-                                          child, request);
-       if (req == NULL) {
-               DEBUG(0, ("wb_child_request_send failed\n"));
-                continuation(private_data, false);
-               return;
-        }
-       tevent_req_set_callback(req, async_request_done, state);
-}
-
-static void async_request_done(struct tevent_req *req)
-{
-       struct winbindd_async_request *state = tevent_req_callback_data(
-               req, struct winbindd_async_request);
-       struct winbindd_response *response;
-       int ret, err;
-
-       ret = wb_child_request_recv(req, state, &response, &err);
-       TALLOC_FREE(req);
-       if (ret == -1) {
-               DEBUG(2, ("wb_child_request_recv failed: %s\n",
-                         strerror(err)));
-               state->continuation(state->private_data, false);
-               return;
-       }
-       *state->response = *response;
-       state->continuation(state->private_data, true);
-}
-
 struct domain_request_state {
        struct winbindd_domain *domain;
        struct winbindd_request *request;
@@ -527,13 +457,6 @@ static void recvfrom_child(void *private_data_data, bool success)
        request_ok(state);
 }
 
-void sendto_child(struct winbindd_cli_state *state,
-                 struct winbindd_child *child)
-{
-       async_request(state->mem_ctx, child, state->request,
-                     state->response, recvfrom_child, state);
-}
-
 void sendto_domain(struct winbindd_cli_state *state,
                   struct winbindd_domain *domain)
 {
@@ -588,7 +511,8 @@ void setup_child(struct winbindd_domain *domain, struct winbindd_child *child,
                          "logname == NULL");
        }
 
-       child->domain = NULL;
+       child->sock = -1;
+       child->domain = domain;
        child->table = table;
        child->queue = tevent_queue_create(NULL, "winbind_child");
        SMB_ASSERT(child->queue != NULL);
@@ -596,8 +520,6 @@ void setup_child(struct winbindd_domain *domain, struct winbindd_child *child,
        SMB_ASSERT(child->rpccli != NULL);
 }
 
-struct winbindd_child *children = NULL;
-
 void winbind_child_died(pid_t pid)
 {
        struct winbindd_child *child;
@@ -616,9 +538,6 @@ void winbind_child_died(pid_t pid)
        /* This will be re-added in fork_domain_child() */
 
        DLIST_REMOVE(children, child);
-
-       close(child->sock);
-       child->sock = -1;
        child->pid = 0;
 }
 
@@ -830,7 +749,7 @@ void winbind_msg_onlinestatus(struct messaging_context *msg_ctx,
        TALLOC_CTX *mem_ctx;
        const char *message;
        struct server_id *sender;
-       
+
        DEBUG(5,("winbind_msg_onlinestatus received.\n"));
 
        if (!data->data) {
@@ -843,7 +762,7 @@ void winbind_msg_onlinestatus(struct messaging_context *msg_ctx,
        if (mem_ctx == NULL) {
                return;
        }
-       
+
        message = collect_onlinestatus(mem_ctx);
        if (message == NULL) {
                talloc_destroy(mem_ctx);
@@ -1051,6 +970,24 @@ static bool calculate_next_machine_pwd_change(const char *domain,
                DEBUG(10,("machine password still valid until: %s\n",
                        http_timestring(talloc_tos(), next_change)));
                *t = timeval_set(next_change, 0);
+
+               if (lp_clustering()) {
+                       uint8_t randbuf;
+                       /*
+                        * When having a cluster, we have several
+                        * winbinds racing for the password change. In
+                        * the machine_password_change_handler()
+                        * function we check if someone else was
+                        * faster when the event triggers. We add a
+                        * random delay of up to 255 seconds here, so
+                        * that we don't all try to change the password
+                        * at the exact same moment.
+                        */
+                       generate_random_buffer(&randbuf, sizeof(randbuf));
+                       DEBUG(10, ("adding %d seconds randomness\n",
+                                  (int)randbuf));
+                       t->tv_sec += randbuf;
+               }
                return true;
        }
 
@@ -1117,21 +1054,31 @@ static void machine_password_change_handler(struct event_context *ctx,
                   "trust_pw_find_change_and_store_it returned %s\n",
                   nt_errstr(result)));
 
+       if (NT_STATUS_EQUAL(result, NT_STATUS_ACCESS_DENIED) ) {
+               DEBUG(3,("machine_password_change_handler: password set returned "
+                        "ACCESS_DENIED.  Maybe the trust account "
+                        "password was changed and we didn't know it. "
+                        "Killing connections to domain %s\n",
+                        child->domain->name));
+               TALLOC_FREE(child->domain->conn.netlogon_pipe);
+       }
+
+       if (!calculate_next_machine_pwd_change(child->domain->name,
+                                              &next_change)) {
+               DEBUG(10, ("calculate_next_machine_pwd_change failed\n"));
+               return;
+       }
+
+       DEBUG(10, ("calculate_next_machine_pwd_change returned %s\n",
+                  timeval_string(talloc_tos(), &next_change, false)));
+
        if (!NT_STATUS_IS_OK(result)) {
-               DEBUG(10,("machine_password_change_handler: "
-                       "failed to change machine password: %s\n",
-                        nt_errstr(result)));
-               if (NT_STATUS_EQUAL(result, NT_STATUS_ACCESS_DENIED) ) {
-                       DEBUG(3,("machine_password_change_handler: password set returned "
-                               "ACCESS_DENIED.  Maybe the trust account "
-                               "password was changed and we didn't know it. "
-                               "Killing connections to domain %s\n",
-                               child->domain->name));
-                       TALLOC_FREE(child->domain->conn.netlogon_pipe);
-               }
-       } else {
-               DEBUG(10,("machine_password_change_handler: "
-                       "successfully changed machine password\n"));
+               struct timeval tmp;
+               /*
+                * In case of failure, give the DC a minute to recover
+                */
+               tmp = timeval_current_ofs(60, 0);
+               next_change = timeval_max(&next_change, &tmp);
        }
 
 done:
@@ -1273,6 +1220,9 @@ bool winbindd_reinit_after_fork(const char *logfilename)
                                            logfilename))
                return false;
 
+       /* Stop zombies in children */
+       CatchChild();
+
        /* Don't handle the same messages as our parent. */
        messaging_deregister(winbind_messaging_context(),
                             MSG_SMB_CONF_UPDATED, NULL);
@@ -1396,9 +1346,6 @@ static bool fork_domain_child(struct winbindd_child *child)
 
        DEBUG(10, ("Child process %d\n", (int)sys_getpid()));
 
-       /* Stop zombies in children */
-       CatchChild();
-
        state.sock = fdpair[0];
        close(fdpair[1]);
 
@@ -1518,9 +1465,24 @@ static bool fork_domain_child(struct winbindd_child *child)
 
                FD_ZERO(&r_fds);
                FD_ZERO(&w_fds);
+
+               if (state.sock < 0 || state.sock >= FD_SETSIZE) {
+                       TALLOC_FREE(frame);
+                       perror("EBADF");
+                       _exit(1);
+               }
+
                FD_SET(state.sock, &r_fds);
                maxfd = state.sock;
 
+               /*
+                * Initialize this high as event_add_to_select_args()
+                * uses a timeval_min() on this and next_event. Fix
+                * from Roel van Meer <rolek@alt001.com>.
+                */
+               t.tv_sec = 999999;
+               t.tv_usec = 0;
+
                event_add_to_select_args(winbind_event_context(), &now,
                                         &r_fds, &w_fds, &t, &maxfd);
                tp = get_timed_events_timeout(winbind_event_context(), &t);