Rework restart logic, introduce --max-errors.
authorAndre Noll <maan@tuebingen.mpg.de>
Fri, 12 Dec 2014 14:05:21 +0000 (15:05 +0100)
committerAndre Noll <maan@tuebingen.mpg.de>
Tue, 27 Jan 2015 09:47:54 +0000 (10:47 +0100)
It has happened several times in the past that dss made no progress
because the underlying rsync command terminates with exit code 13
(Errors with program diagnostics). Currently dss special cases this
exit code as a non-fatal error, i.e. it does not terminate but restarts
the rsync command after 60 seconds. If the problem is permanent,
no new snapshots will be created, but the exit hook is not called
either, which is unfortunate.

This commit tries to improve on this. With this patch applied, the
only non-fatal exit code from rsync is 24 (Partial transfer due
to vanished source files), which is actually considered success.
All other non-zero exit codes cause dss to restart the rsync command,
but only at most N times, where N is the argument given to the new
--max-rsync-errors option.

dss.c
dss.ggo
err.h

diff --git a/dss.c b/dss.c
index 006cd27..95c4c03 100644 (file)
--- a/dss.c
+++ b/dss.c
@@ -45,6 +45,8 @@ static int signal_pipe;
 static pid_t create_pid;
 /** Whether the pre-create-hook/rsync/post-create-hook is currently stopped. */
 static int create_process_stopped;
+/** How many times in a row the rsync command failed. */
+static int num_consecutive_rsync_errors;
 /** Process id of current pre-remove/rm/post-remove process. */
 static pid_t remove_pid;
 /** When the next snapshot is due. */
@@ -125,6 +127,7 @@ static void dump_dss_config(const char *msg)
                "reference_snapshot: %s\n"
                "snapshot_creation_status: %s\n"
                "snapshot_removal_status: %s\n"
+               "num_consecutive_rsync_errors: %d\n"
                ,
                (int) getpid(),
                logfile? conf.logfile_arg : "stderr",
@@ -135,7 +138,8 @@ static void dump_dss_config(const char *msg)
                name_of_reference_snapshot?
                        name_of_reference_snapshot : "(none)",
                hook_status_description[snapshot_creation_status],
-               hook_status_description[snapshot_removal_status]
+               hook_status_description[snapshot_removal_status],
+               num_consecutive_rsync_errors
        );
        if (create_pid != 0)
                fprintf(log,
@@ -844,23 +848,23 @@ static int handle_rsync_exit(int status)
        es = WEXITSTATUS(status);
        /*
         * Restart rsync on non-fatal errors:
-        * 12: Error in rsync protocol data stream
-        * 13: Errors with program diagnostics
+        * 24: Partial transfer due to vanished source files
         */
-       if (es == 12 || es == 13) {
-               DSS_WARNING_LOG(("rsync process %d returned %d -- restarting\n",
-                       (int)create_pid, es));
+       if (es != 0 && es != 24) {
+               DSS_WARNING_LOG(("rsync exit code %d, error count %d\n",
+                       es, ++num_consecutive_rsync_errors));
+               if (num_consecutive_rsync_errors > conf.max_rsync_errors_arg) {
+                       ret = -E_TOO_MANY_RSYNC_ERRORS;
+                       snapshot_creation_status = HS_READY;
+                       goto out;
+               }
+               DSS_WARNING_LOG(("restarting rsync process\n"));
                snapshot_creation_status = HS_NEEDS_RESTART;
                next_snapshot_time = get_current_time() + 60;
                ret = 1;
                goto out;
        }
-       if (es != 0 && es != 23 && es != 24) {
-               DSS_ERROR_LOG(("rsync process %d returned %d\n", (int)create_pid, es));
-               ret = -E_BAD_EXIT_CODE;
-               snapshot_creation_status = HS_READY;
-               goto out;
-       }
+       num_consecutive_rsync_errors = 0;
        ret = rename_incomplete_snapshot(current_snapshot_creation_time);
        if (ret < 0)
                goto out;
diff --git a/dss.ggo b/dss.ggo
index db11901..1152995 100644 (file)
--- a/dss.ggo
+++ b/dss.ggo
@@ -244,6 +244,26 @@ details="
                --rsync-option --exclude --rsync-option /proc
 "
 
+option "max-rsync-errors" -
+"Terminate after this many rsync failures"
+int typestr="count"
+default="10"
+optional
+details="
+       Only relevant when operating in --run mode (see above). If
+       the rsync process exits with a fatal error, dss restarts
+       the command in the hope that the problem is transient
+       and subsequent rsync runs succeed. After the given number
+       of consecutive rsync error exits, however, dss gives up,
+       executes the exit hook and terminates. Set this to zero if
+       dss should exit immediately on the first rsync error.
+
+       The only non-fatal error is when rsync exits with code 24. This
+       indicates a partial transfer due to vanished source files
+       and happens frequently when snapshotting a directory which
+       is concurrently being modified.
+"
+
 ###################
 section "Intervals"
 ###################
diff --git a/err.h b/err.h
index e651d9c..e7aced0 100644 (file)
--- a/err.h
+++ b/err.h
@@ -55,7 +55,8 @@ static inline char *dss_strerror(int num)
        DSS_ERROR(SIGNAL_SIG_ERR, "signal() returned SIG_ERR"), \
        DSS_ERROR(SIGNAL, "caught terminating signal"), \
        DSS_ERROR(BUG, "values of beta might cause dom!"), \
-       DSS_ERROR(NOT_RUNNING, "dss not running")
+       DSS_ERROR(NOT_RUNNING, "dss not running"), \
+       DSS_ERROR(TOO_MANY_RSYNC_ERRORS, "too many consecutive rsync errors")
 
 /**
  * This is temporarily defined to expand to its first argument (prefixed by