]> git.tuebingen.mpg.de Git - dss.git/commitdiff
Merge branch 'refs/heads/t/max-errors'
authorAndre Noll <maan@tuebingen.mpg.de>
Wed, 25 Feb 2015 12:23:56 +0000 (13:23 +0100)
committerAndre Noll <maan@tuebingen.mpg.de>
Wed, 25 Feb 2015 12:24:22 +0000 (13:24 +0100)
The topic branch was cooking for about a week, and it was tested with
no problems on several multi-terabyte file systems.

* Rework restart logic, introduce --max-errors.
* Fix typo in help text of --daemon.

NEWS
dss.c
dss.ggo
err.h

diff --git a/NEWS b/NEWS
index db77b4591b3775b458bb8b86a0a882cae5b4d432..444c12eef89103900bf3aaa14fcb6bfe747d4d86 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -2,8 +2,13 @@
 0.1.6 (to be announced)
 -----------------------
 
-- New option --min-complete
-- New home page URL, email address
+ - New option --min-complete to specify the minimal number of snapshots
+   to keep.
+
+ - Improved handling of rsync errors. The new --max-rsync-errors option
+ tells dss to terminate after the given number of rsync failures.
+
+ - New home page URL, email address
 
 ------------------
 0.1.5 (2014-01-14)
diff --git a/dss.c b/dss.c
index c64156188bc753f7ee264f7434db1e254bbbc8f2..07a60425170ddab063ee6071d4901993a9db8c62 100644 (file)
--- a/dss.c
+++ b/dss.c
@@ -45,6 +45,8 @@ static int signal_pipe;
 static pid_t create_pid;
 /** Whether the pre-create-hook/rsync/post-create-hook is currently stopped. */
 static int create_process_stopped;
+/** How many times in a row the rsync command failed. */
+static int num_consecutive_rsync_errors;
 /** Process id of current pre-remove/rm/post-remove process. */
 static pid_t remove_pid;
 /** When the next snapshot is due. */
@@ -125,6 +127,7 @@ static void dump_dss_config(const char *msg)
                "reference_snapshot: %s\n"
                "snapshot_creation_status: %s\n"
                "snapshot_removal_status: %s\n"
+               "num_consecutive_rsync_errors: %d\n"
                ,
                (int) getpid(),
                logfile? conf.logfile_arg : "stderr",
@@ -135,7 +138,8 @@ static void dump_dss_config(const char *msg)
                name_of_reference_snapshot?
                        name_of_reference_snapshot : "(none)",
                hook_status_description[snapshot_creation_status],
-               hook_status_description[snapshot_removal_status]
+               hook_status_description[snapshot_removal_status],
+               num_consecutive_rsync_errors
        );
        if (create_pid != 0)
                fprintf(log,
@@ -844,23 +848,23 @@ static int handle_rsync_exit(int status)
        es = WEXITSTATUS(status);
        /*
         * Restart rsync on non-fatal errors:
-        * 12: Error in rsync protocol data stream
-        * 13: Errors with program diagnostics
+        * 24: Partial transfer due to vanished source files
         */
-       if (es == 12 || es == 13) {
-               DSS_WARNING_LOG(("rsync process %d returned %d -- restarting\n",
-                       (int)create_pid, es));
+       if (es != 0 && es != 24) {
+               DSS_WARNING_LOG(("rsync exit code %d, error count %d\n",
+                       es, ++num_consecutive_rsync_errors));
+               if (num_consecutive_rsync_errors > conf.max_rsync_errors_arg) {
+                       ret = -E_TOO_MANY_RSYNC_ERRORS;
+                       snapshot_creation_status = HS_READY;
+                       goto out;
+               }
+               DSS_WARNING_LOG(("restarting rsync process\n"));
                snapshot_creation_status = HS_NEEDS_RESTART;
                next_snapshot_time = get_current_time() + 60;
                ret = 1;
                goto out;
        }
-       if (es != 0 && es != 23 && es != 24) {
-               DSS_ERROR_LOG(("rsync process %d returned %d\n", (int)create_pid, es));
-               ret = -E_BAD_EXIT_CODE;
-               snapshot_creation_status = HS_READY;
-               goto out;
-       }
+       num_consecutive_rsync_errors = 0;
        ret = rename_incomplete_snapshot(current_snapshot_creation_time);
        if (ret < 0)
                goto out;
diff --git a/dss.ggo b/dss.ggo
index db119015e9a64c49f0be1edcc4cc88d4ff5c3816..11529956891db25e5110e03009b88ff5ca36c0c6 100644 (file)
--- a/dss.ggo
+++ b/dss.ggo
@@ -244,6 +244,26 @@ details="
                --rsync-option --exclude --rsync-option /proc
 "
 
+option "max-rsync-errors" -
+"Terminate after this many rsync failures"
+int typestr="count"
+default="10"
+optional
+details="
+       Only relevant when operating in --run mode (see above). If
+       the rsync process exits with a fatal error, dss restarts
+       the command in the hope that the problem is transient
+       and subsequent rsync runs succeed. After the given number
+       of consecutive rsync error exits, however, dss gives up,
+       executes the exit hook and terminates. Set this to zero if
+       dss should exit immediately on the first rsync error.
+
+       The only non-fatal error is when rsync exits with code 24. This
+       indicates a partial transfer due to vanished source files
+       and happens frequently when snapshotting a directory which
+       is concurrently being modified.
+"
+
 ###################
 section "Intervals"
 ###################
diff --git a/err.h b/err.h
index e651d9ca06f082eff6eee583ada6ee73ffacdf61..e7aced084542cb8d05fec76d49839e6d25b4cef8 100644 (file)
--- a/err.h
+++ b/err.h
@@ -55,7 +55,8 @@ static inline char *dss_strerror(int num)
        DSS_ERROR(SIGNAL_SIG_ERR, "signal() returned SIG_ERR"), \
        DSS_ERROR(SIGNAL, "caught terminating signal"), \
        DSS_ERROR(BUG, "values of beta might cause dom!"), \
-       DSS_ERROR(NOT_RUNNING, "dss not running")
+       DSS_ERROR(NOT_RUNNING, "dss not running"), \
+       DSS_ERROR(TOO_MANY_RSYNC_ERRORS, "too many consecutive rsync errors")
 
 /**
  * This is temporarily defined to expand to its first argument (prefixed by