X-Git-Url: http://git.tuebingen.mpg.de/?p=dss.git;a=blobdiff_plain;f=dss.c;h=03ddaf89c1493c10bef79e7c0f36c9180d2410e5;hp=354bdb847095d0dd2b6a2346b9bdceaae7977d0d;hb=e9e6450e1a8ece2ff879523516fefd6576a76521;hpb=42263e3bce7826703a531d0113d706fe3f3536a4 diff --git a/dss.c b/dss.c index 354bdb8..03ddaf8 100644 --- a/dss.c +++ b/dss.c @@ -1,16 +1,18 @@ /* - * Copyright (C) 2008-2011 Andre Noll + * Copyright (C) 2008-2011 Andre Noll * * Licensed under the GPL v2. For licencing details see COPYING. */ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -45,6 +47,8 @@ static int signal_pipe; static pid_t create_pid; /** Whether the pre-create-hook/rsync/post-create-hook is currently stopped. */ static int create_process_stopped; +/** How many times in a row the rsync command failed. */ +static int num_consecutive_rsync_errors; /** Process id of current pre-remove/rm/post-remove process. */ static pid_t remove_pid; /** When the next snapshot is due. */ @@ -125,6 +129,7 @@ static void dump_dss_config(const char *msg) "reference_snapshot: %s\n" "snapshot_creation_status: %s\n" "snapshot_removal_status: %s\n" + "num_consecutive_rsync_errors: %d\n" , (int) getpid(), logfile? conf.logfile_arg : "stderr", @@ -135,7 +140,8 @@ static void dump_dss_config(const char *msg) name_of_reference_snapshot? 
name_of_reference_snapshot : "(none)", hook_status_description[snapshot_creation_status], - hook_status_description[snapshot_removal_status] + hook_status_description[snapshot_removal_status], + num_consecutive_rsync_errors ); if (create_pid != 0) fprintf(log, @@ -299,7 +305,7 @@ static int64_t compute_next_snapshot_time(void) int64_t x = 0, now = get_current_time(), unit_interval = 24 * 3600 * conf.unit_interval_arg, ret; unsigned wanted = desired_number_of_snapshots(0, conf.num_intervals_arg), - num_complete_snapshots = 0; + num_complete = 0; int i; struct snapshot *s = NULL; struct snapshot_list sl; @@ -308,15 +314,15 @@ static int64_t compute_next_snapshot_time(void) FOR_EACH_SNAPSHOT(s, i, &sl) { if (!(s->flags & SS_COMPLETE)) continue; - num_complete_snapshots++; + num_complete++; x += s->completion_time - s->creation_time; } assert(x >= 0); ret = now; - if (num_complete_snapshots == 0) + if (num_complete == 0) goto out; - x /= num_complete_snapshots; /* avg time to create one snapshot */ + x /= num_complete; /* avg time to create one snapshot */ if (unit_interval < x * wanted) /* oops, no sleep at all */ goto out; ret = s->completion_time + unit_interval / wanted - x; @@ -531,17 +537,25 @@ static struct snapshot *find_outdated_snapshot(struct snapshot_list *sl) static struct snapshot *find_oldest_removable_snapshot(struct snapshot_list *sl) { - int i; - struct snapshot *s; + int i, num_complete; + struct snapshot *s, *ref = NULL; + + num_complete = num_complete_snapshots(sl); + if (num_complete <= conf.min_complete_arg) + return NULL; FOR_EACH_SNAPSHOT(s, i, sl) { if (snapshot_is_being_created(s)) continue; - if (is_reference_snapshot(s)) + if (is_reference_snapshot(s)) { /* avoid this one */ + ref = s; continue; + } DSS_INFO_LOG(("oldest removable snapshot: %s\n", s->name)); return s; } - return NULL; + assert(ref); + DSS_WARNING_LOG(("removing reference snapshot %s\n", ref->name)); + return ref; } static int rename_incomplete_snapshot(int64_t start) @@ 
-594,9 +608,18 @@ static int try_to_free_disk_space(void) if (next_snapshot_is_due()) return 0; } + /* + * Idle and --keep_redundant not given, or low disk space. Look at + * existing snapshots. + */ dss_get_snapshot_list(&sl); ret = 0; - if (!low_disk_space && sl.num_snapshots <= 1) + /* + * Don't remove anything if there is free space and we have fewer + * snapshots than configured, plus one. This way there is always one + * snapshot that can be recycled. + */ + if (!low_disk_space && sl.num_snapshots <= 1 << conf.num_intervals_arg) goto out; why = "outdated"; victim = find_outdated_snapshot(&sl); @@ -606,13 +629,13 @@ static int try_to_free_disk_space(void) victim = find_redundant_snapshot(&sl); if (victim) goto remove; - /* try harder only if disk space is low */ - if (!low_disk_space) - goto out; why = "orphaned"; victim = find_orphaned_snapshot(&sl); if (victim) goto remove; + /* try harder only if disk space is low */ + if (!low_disk_space) + goto out; DSS_WARNING_LOG(("disk space low and nothing obvious to remove\n")); victim = find_oldest_removable_snapshot(&sl); if (victim) @@ -836,23 +859,27 @@ static int handle_rsync_exit(int status) es = WEXITSTATUS(status); /* * Restart rsync on non-fatal errors: - * 12: Error in rsync protocol data stream - * 13: Errors with program diagnostics + * 24: Partial transfer due to vanished source files */ - if (es == 12 || es == 13) { - DSS_WARNING_LOG(("rsync process %d returned %d -- restarting\n", - (int)create_pid, es)); + if (es != 0 && es != 24) { + DSS_WARNING_LOG(("rsync exit code %d, error count %d\n", + es, ++num_consecutive_rsync_errors)); + if (conf.create_given) { + ret = -E_BAD_EXIT_CODE; + goto out; + } + if (num_consecutive_rsync_errors > conf.max_rsync_errors_arg) { + ret = -E_TOO_MANY_RSYNC_ERRORS; + snapshot_creation_status = HS_READY; + goto out; + } + DSS_WARNING_LOG(("restarting rsync process\n")); snapshot_creation_status = HS_NEEDS_RESTART; next_snapshot_time = get_current_time() + 60; ret = 1; 
goto out; } - if (es != 0 && es != 23 && es != 24) { - DSS_ERROR_LOG(("rsync process %d returned %d\n", (int)create_pid, es)); - ret = -E_BAD_EXIT_CODE; - snapshot_creation_status = HS_READY; - goto out; - } + num_consecutive_rsync_errors = 0; ret = rename_incomplete_snapshot(current_snapshot_creation_time); if (ret < 0) goto out; @@ -939,8 +966,9 @@ static int check_config(void) return -E_INVALID_NUMBER; } DSS_DEBUG_LOG(("unit interval: %i day(s)\n", conf.unit_interval_arg)); - if (conf.num_intervals_arg <= 0) { - DSS_ERROR_LOG(("bad number of intervals %i\n", conf.num_intervals_arg)); + if (conf.num_intervals_arg <= 0 || conf.num_intervals_arg > 30) { + DSS_ERROR_LOG(("bad number of intervals: %i\n", + conf.num_intervals_arg)); return -E_INVALID_NUMBER; } DSS_DEBUG_LOG(("number of intervals: %i\n", conf.num_intervals_arg)); @@ -951,7 +979,7 @@ static int check_config(void) * Returns < 0 on errors, 0 if no config file is given and > 0 if the config * file was read successfully. */ -static int parse_config_file(int override) +static int parse_config_file(bool sighup) { int ret, config_file_exists; char *config_file = get_config_file_name(); @@ -959,7 +987,7 @@ static int parse_config_file(int override) char *old_logfile_arg = NULL; int old_daemon_given = 0; - if (override) { /* SIGHUP */ + if (sighup) { if (conf.logfile_given) old_logfile_arg = dss_strdup(conf.logfile_arg); old_daemon_given = conf.daemon_given; @@ -973,12 +1001,12 @@ static int parse_config_file(int override) } if (config_file_exists) { struct cmdline_parser_params params; - params.override = override; + params.override = sighup; params.initialize = 0; params.check_required = 1; params.check_ambiguity = 0; params.print_errors = 1; - if (override) { /* invalidate all rsync options */ + if (sighup) { /* invalidate all rsync options */ int i; for (i = 0; i < conf.rsync_option_given; i++) { @@ -992,7 +1020,7 @@ static int parse_config_file(int override) ret = check_config(); if (ret < 0) goto out; - if 
(override) { + if (sighup) { /* don't change daemon mode on SIGHUP */ conf.daemon_given = old_daemon_given; close_log(logfile); @@ -1029,7 +1057,7 @@ static int handle_sighup(void) DSS_NOTICE_LOG(("SIGHUP, re-reading config\n")); dump_dss_config("old"); - ret = parse_config_file(1); + ret = parse_config_file(true /* SIGHUP */); if (ret < 0) return ret; dump_dss_config("reloaded"); @@ -1037,6 +1065,13 @@ static int handle_sighup(void) return change_to_dest_dir(); } +static void kill_children(void) +{ + restart_create_process(); + dss_kill(create_pid, SIGTERM, NULL); + dss_kill(remove_pid, SIGTERM, NULL); +} + static int handle_signal(void) { int sig, ret = next_signal(); @@ -1047,9 +1082,7 @@ static int handle_signal(void) switch (sig) { case SIGINT: case SIGTERM: - restart_create_process(); - dss_kill(create_pid, SIGTERM, NULL); - dss_kill(remove_pid, SIGTERM, NULL); + kill_children(); ret = -E_SIGNAL; break; case SIGHUP: @@ -1147,7 +1180,7 @@ static void create_rsync_argv(char ***argv, int64_t *num) *argv = dss_malloc((15 + conf.rsync_option_given) * sizeof(char *)); (*argv)[i++] = dss_strdup("rsync"); - (*argv)[i++] = dss_strdup("-aq"); + (*argv)[i++] = dss_strdup("-a"); (*argv)[i++] = dss_strdup("--delete"); for (j = 0; j < conf.rsync_option_given; j++) (*argv)[i++] = dss_strdup(conf.rsync_option_arg[j]); @@ -1318,6 +1351,7 @@ static int com_run(void) ret = select_loop(); if (ret >= 0) /* impossible */ ret = -E_BUG; + kill_children(); exit_hook(ret); return ret; } @@ -1441,7 +1475,7 @@ static int com_ls(void) if (s->flags & SS_COMPLETE) d = (s->completion_time - s->creation_time) / 60; dss_msg("%u\t%s\t%3" PRId64 ":%02" PRId64 "\n", s->interval, s->name, d/60, d%60); - }; + } free_snapshot_list(&sl); return 1; } @@ -1461,12 +1495,6 @@ static int setup_signal_handling(void) return install_sighandler(SIGCHLD); } -/** - * The main function of dss. - * - * \param argc Usual argument count. - * \param argv Usual argument vector. 
- */ int main(int argc, char **argv) { int ret; @@ -1480,6 +1508,7 @@ int main(int argc, char **argv) cmdline_parser_ext(argc, argv, &conf, &params); /* aborts on errors */ ret = parse_config_file(0); + ret = parse_config_file(false /* no SIGHUP */); if (ret < 0) goto out; if (ret == 0) { /* no config file given */ @@ -1505,6 +1534,7 @@ int main(int argc, char **argv) if (ret < 0) goto out; ret = call_command_handler(); + signal_shutdown(); out: if (ret < 0) DSS_EMERG_LOG(("%s\n", dss_strerror(-ret)));