git.tuebingen.mpg.de Git - dss.git/commitdiff
Merge branch 'refs/heads/t/max-errors'
author: Andre Noll <maan@tuebingen.mpg.de>
Wed, 25 Feb 2015 12:23:56 +0000 (13:23 +0100)
committer: Andre Noll <maan@tuebingen.mpg.de>
Wed, 25 Feb 2015 12:24:22 +0000 (13:24 +0100)
The topic branch was cooking for about a week, and it was tested with
no problems on several multi-terabyte file systems.

* Rework restart logic, introduce --max-rsync-errors.
* Fix typo in help text of --daemon.

1  2 
NEWS
dss.c

diff --combined NEWS
index db77b4591b3775b458bb8b86a0a882cae5b4d432,db77b4591b3775b458bb8b86a0a882cae5b4d432..444c12eef89103900bf3aaa14fcb6bfe747d4d86
--- 1/NEWS
--- 2/NEWS
+++ b/NEWS
@@@ -2,8 -2,8 +2,13 @@@
  0.1.6 (to be announced)
  -----------------------
  
--- New option --min-complete
--- New home page URL, email address
++ - New option --min-complete to specify the minimal number of snapshots
++   to keep.
++
++ - Improved handling of rsync errors. The new --max-rsync-errors option
++ tells dss to terminate after the given number of rsync failures.
++
++ - New home page URL, email address
  
  ------------------
  0.1.5 (2014-01-14)
diff --combined dss.c
index c64156188bc753f7ee264f7434db1e254bbbc8f2,95c4c03ab1c3467aa33507b35f7e57cc81d4aea9..07a60425170ddab063ee6071d4901993a9db8c62
--- 1/dss.c
--- 2/dss.c
+++ b/dss.c
@@@ -45,6 -45,8 +45,8 @@@ static int signal_pipe
  static pid_t create_pid;
  /** Whether the pre-create-hook/rsync/post-create-hook is currently stopped. */
  static int create_process_stopped;
+ /** How many times in a row the rsync command failed. */
+ static int num_consecutive_rsync_errors;
  /** Process id of current pre-remove/rm/post-remove process. */
  static pid_t remove_pid;
  /** When the next snapshot is due. */
@@@ -125,6 -127,7 +127,7 @@@ static void dump_dss_config(const char 
                "reference_snapshot: %s\n"
                "snapshot_creation_status: %s\n"
                "snapshot_removal_status: %s\n"
+               "num_consecutive_rsync_errors: %d\n"
                ,
                (int) getpid(),
                logfile? conf.logfile_arg : "stderr",
                name_of_reference_snapshot?
                        name_of_reference_snapshot : "(none)",
                hook_status_description[snapshot_creation_status],
-               hook_status_description[snapshot_removal_status]
+               hook_status_description[snapshot_removal_status],
+               num_consecutive_rsync_errors
        );
        if (create_pid != 0)
                fprintf(log,
@@@ -844,23 -848,23 +848,23 @@@ static int handle_rsync_exit(int status
        es = WEXITSTATUS(status);
        /*
         * Restart rsync on non-fatal errors:
-        * 12: Error in rsync protocol data stream
-        * 13: Errors with program diagnostics
+        * 24: Partial transfer due to vanished source files
         */
-       if (es == 12 || es == 13) {
-               DSS_WARNING_LOG(("rsync process %d returned %d -- restarting\n",
-                       (int)create_pid, es));
+       if (es != 0 && es != 24) {
+               DSS_WARNING_LOG(("rsync exit code %d, error count %d\n",
+                       es, ++num_consecutive_rsync_errors));
+               if (num_consecutive_rsync_errors > conf.max_rsync_errors_arg) {
+                       ret = -E_TOO_MANY_RSYNC_ERRORS;
+                       snapshot_creation_status = HS_READY;
+                       goto out;
+               }
+               DSS_WARNING_LOG(("restarting rsync process\n"));
                snapshot_creation_status = HS_NEEDS_RESTART;
                next_snapshot_time = get_current_time() + 60;
                ret = 1;
                goto out;
        }
-       if (es != 0 && es != 23 && es != 24) {
-               DSS_ERROR_LOG(("rsync process %d returned %d\n", (int)create_pid, es));
-               ret = -E_BAD_EXIT_CODE;
-               snapshot_creation_status = HS_READY;
-               goto out;
-       }
+       num_consecutive_rsync_errors = 0;
        ret = rename_incomplete_snapshot(current_snapshot_creation_time);
        if (ret < 0)
                goto out;
@@@ -1520,7 -1524,6 +1524,7 @@@ int main(int argc, char **argv
        if (ret < 0)
                goto out;
        ret = call_command_handler();
 +      signal_shutdown();
  out:
        if (ret < 0)
                DSS_EMERG_LOG(("%s\n", dss_strerror(-ret)));