IFACE handling. Assume links are always good on startup (they almost always are)
authorRonnie Sahlberg <ronniesahlberg@gmail.com>
Sun, 10 Apr 2011 19:56:14 +0000 (05:56 +1000)
committerRonnie Sahlberg <ronniesahlberg@gmail.com>
Sun, 10 Apr 2011 21:39:34 +0000 (07:39 +1000)
Simplify the handling of setting the links in the 10.interface eventscript
and remove the optimization to only call setifacelink on state change
to make the code simpler to read.

If a take ip event fails, flag the node as unhealthy.

Add a check to the interface script that verifies each interface still
exists and has not been deleted, so that we can detect the problem and
become UNHEALTHY if someone deletes an interface we are using to host
public addresses.

config/events.d/10.interface
server/ctdb_takeover.c

index 640d3dbad3bffe190b0ecb34ae2bcf800d42f43b..2ca8eaea0426b3c91e871e5afc4d7062f8e53639 100755 (executable)
@@ -38,13 +38,12 @@ monitor_interfaces()
        INTERFACES=`for IFACE in $INTERFACES ; do echo $IFACE ; done | sort | uniq`
 
        local fail=0
-       local force_fail=0
        local ok=0
        for IFACE in $INTERFACES ; do
 
-           local OLDLINK=`echo -n "$IFACES" | grep "^:$IFACE:" | cut -d ':' -f3 | xargs`
-           test -z "$OLDLINK" && {
-               force_fail=1
+           ip addr show $IFACE 2>/dev/null >/dev/null || {
+               echo Interface $IFACE does not exist but it is used by public addresses.
+               exit 1
            }
 
            # These interfaces are sometimes bond devices
@@ -55,44 +54,34 @@ monitor_interfaces()
                grep -q 'Currently Active Slave: None' /proc/net/bonding/$REALIFACE && {
                        echo "ERROR: No active slaves for bond device $REALIFACE"
                        fail=1
-                       test -n "$OLDLINK" && {
-                               ctdb setifacelink $IFACE down
-                       }
+                       ctdb setifacelink $IFACE down
                        continue;
                }
                grep -q '^MII Status: up' /proc/net/bonding/$REALIFACE || {
                        echo "ERROR: public network interface $REALIFACE is down"
                        fail=1
-                       test -n "$OLDLINK" && {
-                               ctdb setifacelink $IFACE down
-                       }
+                       ctdb setifacelink $IFACE down
                        continue;
                }
-               test -n "$OLDLINK" && {
-                       ok=1 # we only set ok for interfaces known to ctdbd
-                       ctdb setifacelink $IFACE up
-               }
+               ok=1 # we only set ok for interfaces known to ctdbd
+               ctdb setifacelink $IFACE up
                continue;
            }
 
            case $IFACE in
            lo*)
                # loopback is always working
-               test -n "$OLDLINK" && {
-                   ok=1 # we only set ok for interfaces known to ctdbd
-                   ctdb setifacelink $IFACE up
-               }
+               ok=1 # we only set ok for interfaces known to ctdbd
+               ctdb setifacelink $IFACE up
                ;;
            ib*)
                # we dont know how to test ib links
-               test -n "$OLDLINK" && {
-                   ok=1 # we only set ok for interfaces known to ctdbd
-                   ctdb setifacelink $IFACE up
-               }
+               ok=1 # we only set ok for interfaces known to ctdbd
+               ctdb setifacelink $IFACE up
                ;;
            *)
                [ -z "$IFACE" ] || {
-                   [ "$(basename $(readlink /sys/class/net/$IFACE/device/driver))" = virtio_net ] ||
+                   [ "$(basename $(readlink /sys/class/net/$IFACE/device/driver) 2>/dev/null)" = virtio_net ] ||
                    ethtool $IFACE | grep -q 'Link detected: yes' || {
                        # On some systems, this is not successful when a
                        # cable is plugged but the interface has not been
@@ -102,16 +91,12 @@ monitor_interfaces()
                        ethtool $IFACE | grep -q 'Link detected: yes' || {
                            echo "ERROR: No link on the public network interface $IFACE"
                            fail=1
-                           test -n "$OLDLINK" && {
-                               ctdb setifacelink $IFACE down
-                           }
+                           ctdb setifacelink $IFACE down
                            continue
                        }
                    }
-                   test -n "$OLDLINK" && {
-                       ok=1 # we only set ok for interfaces known to ctdbd
-                       ctdb setifacelink $IFACE up
-                   }
+                   ok=1 # we only set ok for interfaces known to ctdbd
+                   ctdb setifacelink $IFACE up
                }
                ;;
            esac
@@ -122,10 +107,6 @@ monitor_interfaces()
                return 0;
        }
 
-       test x"$force_fail" != x"0" && {
-               return 1;
-       }
-
        test x"$ok" = x"1" && {
                return 2;
        }
@@ -148,6 +129,13 @@ case "$1" in
      # called after ctdbd has done its initial recovery
      # and we start the services to become healthy
      startup)
+       # Assume all links are good initially; monitor_interfaces below re-checks each one
+       INTERFACES=`for IFACE in $INTERFACES ; do echo $IFACE ; done | sort | uniq`
+
+       for IFACE in $INTERFACES ; do
+               ctdb setifacelink $IFACE up
+       done
+       
        monitor_interfaces
 
        ;;
index 3dac1f7fdd1b268a762d8082ca3c9eb550f332ee..bed2ab9f670a718a8b623c0119a339a29c8fc4c1 100644 (file)
@@ -334,6 +334,8 @@ static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
        TDB_DATA data;
 
        if (status != 0) {
+               struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+       
                if (status == -ETIME) {
                        ctdb_ban_self(ctdb);
                }
@@ -341,6 +343,8 @@ static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
                                 ctdb_addr_to_str(&state->vnn->public_address),
                                 ctdb_vnn_iface_string(state->vnn)));
                ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+
+               node->flags |= NODE_FLAGS_UNHEALTHY;
                talloc_free(state);
                return;
        }