3 This patch adds the --link-by-hash=DIR option, which hard links received files
4 in a link farm arranged by MD4 or MD5 file hash. The result is that the system
5 will only store one copy of the unique contents of each file, regardless of the file's name.
8 To use this patch, run these commands for a successful build:
10 patch -p1 <patches/link-by-hash.diff
15 based-on: 1c82a1e1e54eb585cd37c875604193f5b977d24e
16 diff --git a/Makefile.in b/Makefile.in
19 @@ -40,7 +40,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
20 util.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
21 OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
22 fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
23 -OBJS3=progress.o pipe.o
24 +OBJS3=progress.o pipe.o hashlink.o
25 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
26 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
27 popt/popthelp.o popt/poptparse.o
28 diff --git a/checksum.c b/checksum.c
35 +extern int checksum_len;
36 extern int checksum_seed;
37 extern int protocol_version;
38 extern int proper_seed_order;
39 +extern char *link_by_hash_dir;
40 +extern char link_by_hash_extra_sum[MAX_DIGEST_LEN];
41 extern char *checksum_choice;
44 @@ -250,7 +253,7 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
47 static int32 sumresidue;
48 -static md_context md;
49 +static md_context md, md2;
50 static int cursum_type;
52 void sum_init(int csum_type, int seed)
53 @@ -264,6 +267,8 @@ void sum_init(int csum_type, int seed)
57 + if (link_by_hash_dir)
62 @@ -297,6 +302,8 @@ void sum_update(const char *p, int32 len)
63 switch (cursum_type) {
65 md5_update(&md, (uchar *)p, len);
66 + if (link_by_hash_dir)
67 + md5_update(&md2, (uchar *)p, len);
71 @@ -342,6 +349,8 @@ int sum_end(char *sum)
72 switch (cursum_type) {
74 md5_result(&md, (uchar *)sum);
75 + if (link_by_hash_dir)
76 + md5_result(&md2, (uchar *)link_by_hash_extra_sum);
80 diff --git a/clientserver.c b/clientserver.c
83 @@ -50,6 +50,7 @@ extern int logfile_format_has_i;
84 extern int logfile_format_has_o_or_i;
85 extern char *bind_address;
86 extern char *config_file;
87 +extern char *link_by_hash_dir;
88 extern char *logfile_format;
89 extern char *files_from;
91 @@ -543,6 +544,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
95 + if (*lp_link_by_hash_dir(i))
96 + link_by_hash_dir = lp_link_by_hash_dir(i);
98 if (am_daemon && am_server) {
99 rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
101 diff --git a/compat.c b/compat.c
104 @@ -57,6 +57,7 @@ extern char *partial_dir;
105 extern char *dest_option;
106 extern char *files_from;
107 extern char *filesfrom_host;
108 +extern char *link_by_hash_dir;
109 extern filter_rule_list filter_list;
110 extern int need_unsorted_flist;
112 diff --git a/hashlink.c b/hashlink.c
118 + Copyright (C) Cronosys, LLC 2004
120 + This program is free software; you can redistribute it and/or modify
121 + it under the terms of the GNU General Public License as published by
122 + the Free Software Foundation; either version 2 of the License, or
123 + (at your option) any later version.
125 + This program is distributed in the hope that it will be useful,
126 + but WITHOUT ANY WARRANTY; without even the implied warranty of
127 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
128 + GNU General Public License for more details.
130 + You should have received a copy of the GNU General Public License
131 + along with this program; if not, write to the Free Software
132 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
135 +/* This file contains code used by the --link-by-hash option. */
140 +extern int protocol_version;
141 +extern char *link_by_hash_dir;
142 +extern char sender_file_sum[MAX_DIGEST_LEN];
144 +char link_by_hash_extra_sum[MAX_DIGEST_LEN]; /* Only used when md4 sums are in the transfer */
148 +/* This function is always called after a file is received, so the
149 + * sender_file_sum buffer has whatever the last checksum was for the
150 + * transferred file. */
151 +void link_by_hash(const char *fname, const char *fnametmp, struct file_struct *file)
154 + char *hashname, *last_slash, *num_str;
158 + /* We don't bother to hard-link 0-length files. */
159 + if (F_LENGTH(file) == 0)
162 + hex = sum_as_hex(5, protocol_version >= 30 ? sender_file_sum : link_by_hash_extra_sum, 0);
163 + if (asprintf(&hashname, "%s/%.3s/%.3s/%.3s/%s.%s.000000",
164 + link_by_hash_dir, hex, hex+3, hex+6, hex+9, big_num(F_LENGTH(file))) < 0)
166 + out_of_memory("make_hash_name");
169 + last_slash = strrchr(hashname, '/');
170 + num_str = strrchr(last_slash, '.') + 1;
173 + if (num >= 999999) { /* Surely we'll never reach this... */
174 + if (DEBUG_GTE(HASHLINK, 1))
175 + rprintf(FINFO, "link-by-hash: giving up after \"%s\".\n", hashname);
178 + if (num > 0 && DEBUG_GTE(HASHLINK, 1))
179 + rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", hashname);
181 + snprintf(num_str, 7, "%d", num++);
182 + if (do_stat(hashname, &st) < 0)
185 + if (do_link(hashname, fnametmp) < 0) {
186 + if (errno == EMLINK)
188 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", hashname, full_fname(fname));
190 + if (DEBUG_GTE(HASHLINK, 2))
191 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", hashname, full_fname(fname));
192 + robust_rename(fnametmp, fname, NULL, 0644);
198 + if (DEBUG_GTE(HASHLINK, 2))
199 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname), hashname);
201 + if (do_link(fname, hashname) < 0
202 + && (errno != ENOENT || make_path(hashname, MKP_DROP_NAME) < 0 || do_link(fname, hashname) < 0))
203 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", full_fname(fname), hashname);
209 diff --git a/loadparm.c b/loadparm.c
212 @@ -130,6 +130,7 @@ typedef struct {
215 char *incoming_chmod;
216 + char *link_by_hash_dir;
220 @@ -158,6 +159,7 @@ typedef struct {
222 BOOL include_from_EXP;
223 BOOL incoming_chmod_EXP;
224 + BOOL link_by_hash_dir_EXP;
228 @@ -244,6 +246,7 @@ static const all_vars Defaults = {
230 /* include_from; */ NULL,
231 /* incoming_chmod; */ NULL,
232 + /* link_by_hash_dir; */ NULL,
233 /* lock_file; */ DEFAULT_LOCK_FILE,
234 /* log_file; */ NULL,
235 /* log_format; */ "%o %h [%a] %m (%u) %f %l",
236 @@ -271,6 +274,7 @@ static const all_vars Defaults = {
237 /* include_EXP; */ False,
238 /* include_from_EXP; */ False,
239 /* incoming_chmod_EXP; */ False,
240 + /* link_by_hash_dir_EXP; */ False,
241 /* lock_file_EXP; */ False,
242 /* log_file_EXP; */ False,
243 /* log_format_EXP; */ False,
244 @@ -416,6 +420,7 @@ static struct parm_struct parm_table[] =
245 {"include from", P_STRING, P_LOCAL, &Vars.l.include_from, NULL,0},
246 {"include", P_STRING, P_LOCAL, &Vars.l.include, NULL,0},
247 {"incoming chmod", P_STRING, P_LOCAL, &Vars.l.incoming_chmod, NULL,0},
248 + {"link by hash dir", P_STRING, P_LOCAL, &Vars.l.link_by_hash_dir, NULL,0},
249 {"list", P_BOOL, P_LOCAL, &Vars.l.list, NULL,0},
250 {"lock file", P_STRING, P_LOCAL, &Vars.l.lock_file, NULL,0},
251 {"log file", P_STRING, P_LOCAL, &Vars.l.log_file, NULL,0},
252 @@ -551,6 +556,7 @@ FN_LOCAL_STRING(lp_hosts_deny, hosts_deny)
253 FN_LOCAL_STRING(lp_include, include)
254 FN_LOCAL_STRING(lp_include_from, include_from)
255 FN_LOCAL_STRING(lp_incoming_chmod, incoming_chmod)
256 +FN_LOCAL_STRING(lp_link_by_hash_dir, link_by_hash_dir)
257 FN_LOCAL_STRING(lp_lock_file, lock_file)
258 FN_LOCAL_STRING(lp_log_file, log_file)
259 FN_LOCAL_STRING(lp_log_format, log_format)
260 diff --git a/options.c b/options.c
263 @@ -163,6 +163,7 @@ char *backup_suffix = NULL;
265 char *partial_dir = NULL;
266 char *basis_dir[MAX_BASIS_DIRS+1];
267 +char *link_by_hash_dir = NULL;
268 char *config_file = NULL;
269 char *shell_cmd = NULL;
270 char *logfile_name = NULL;
271 @@ -213,7 +214,7 @@ static const char *debug_verbosity[] = {
272 /*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
273 /*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
274 /*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
275 - /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
276 + /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
279 #define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
280 @@ -283,6 +284,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
281 DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
282 DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
283 DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
284 + DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
285 DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
286 DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
287 DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
288 @@ -767,6 +769,7 @@ void usage(enum logcode F)
289 rprintf(F," --compare-dest=DIR also compare destination files relative to DIR\n");
290 rprintf(F," --copy-dest=DIR ... and include copies of unchanged files\n");
291 rprintf(F," --link-dest=DIR hardlink to files in DIR when unchanged\n");
292 + rprintf(F," --link-by-hash=DIR create hardlinks by hash into DIR\n");
293 rprintf(F," -z, --compress compress file data during the transfer\n");
294 rprintf(F," --compress-level=NUM explicitly set compression level\n");
295 rprintf(F," --skip-compress=LIST skip compressing files with a suffix in LIST\n");
296 @@ -824,7 +827,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
297 OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
298 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
299 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
300 - OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG,
301 + OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_LINK_BY_HASH,
302 OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT,
303 OPT_SERVER, OPT_REFUSED_BASE = 9000};
305 @@ -971,6 +974,7 @@ static struct poptOption long_options[] = {
306 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
307 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
308 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
309 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
310 {"fuzzy", 'y', POPT_ARG_NONE, 0, 'y', 0, 0 },
311 {"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
312 {"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
313 @@ -1343,6 +1347,9 @@ int parse_arguments(int *argc_p, const char ***argv_p)
314 iconv_opt = strdup(arg);
317 + if (*lp_link_by_hash_dir(module_id))
318 + set_refuse_options("link-by-hash");
320 /* TODO: Call poptReadDefaultConfig; handle errors. */
322 /* The context leaks in case of an error, but if there's a
323 @@ -1821,6 +1828,21 @@ int parse_arguments(int *argc_p, const char ***argv_p)
327 + case OPT_LINK_BY_HASH:
329 + arg = poptGetOptArg(pc);
330 + if (sanitize_paths)
331 + arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
332 + link_by_hash_dir = (char *)arg;
335 + snprintf(err_buf, sizeof err_buf,
336 + "hard links are not supported on this %s\n",
337 + am_server ? "server" : "client");
338 + rprintf(FERROR, "ERROR: %s", err_buf);
343 /* A large opt value means that set_refuse_options()
344 * turned this option off. */
345 @@ -2143,6 +2165,8 @@ int parse_arguments(int *argc_p, const char ***argv_p)
346 tmpdir = sanitize_path(NULL, tmpdir, NULL, 0, SP_DEFAULT);
348 backup_dir = sanitize_path(NULL, backup_dir, NULL, 0, SP_DEFAULT);
349 + if (link_by_hash_dir)
350 + link_by_hash_dir = sanitize_path(NULL, link_by_hash_dir, NULL, 0, SP_DEFAULT);
352 if (daemon_filter_list.head && !am_sender) {
353 filter_rule_list *elp = &daemon_filter_list;
354 @@ -2803,6 +2827,12 @@ void server_options(char **args, int *argc_p)
356 args[ac++] = "--inplace";
358 + if (link_by_hash_dir && am_sender) {
359 + args[ac++] = "--link-by-hash";
360 + args[ac++] = link_by_hash_dir;
361 + link_by_hash_dir = NULL; /* optimize sending-side checksums */
364 if (files_from && (!am_sender || filesfrom_host)) {
365 if (filesfrom_host) {
366 args[ac++] = "--files-from";
367 diff --git a/rsync.c b/rsync.c
370 @@ -50,6 +50,7 @@ extern int flist_eof;
371 extern int file_old_total;
372 extern int keep_dirlinks;
373 extern int make_backups;
374 +extern char *link_by_hash_dir;
375 extern int sanitize_paths;
376 extern struct file_list *cur_flist, *first_flist, *dir_flist;
377 extern struct chmod_mode_struct *daemon_chmod_modes;
378 @@ -693,6 +694,10 @@ int finish_transfer(const char *fname, const char *fnametmp,
381 /* The file was moved into place (not copied), so it's done. */
383 + if (link_by_hash_dir)
384 + link_by_hash(fname, fnametmp, file);
388 /* The file was copied, so tweak the perms of the copied file. If it
389 diff --git a/rsync.h b/rsync.h
392 @@ -1280,7 +1280,8 @@ extern short info_levels[], debug_levels[];
393 #define DEBUG_FUZZY (DEBUG_FLIST+1)
394 #define DEBUG_GENR (DEBUG_FUZZY+1)
395 #define DEBUG_HASH (DEBUG_GENR+1)
396 -#define DEBUG_HLINK (DEBUG_HASH+1)
397 +#define DEBUG_HASHLINK (DEBUG_HASH+1)
398 +#define DEBUG_HLINK (DEBUG_HASHLINK+1)
399 #define DEBUG_ICONV (DEBUG_HLINK+1)
400 #define DEBUG_IO (DEBUG_ICONV+1)
401 #define DEBUG_OWN (DEBUG_IO+1)
402 diff --git a/rsync.yo b/rsync.yo
405 @@ -427,6 +427,7 @@ to the detailed description below for a complete description. verb(
406 --compare-dest=DIR also compare received files relative to DIR
407 --copy-dest=DIR ... and include copies of unchanged files
408 --link-dest=DIR hardlink to files in DIR when unchanged
409 + --link-by-hash=DIR create hardlinks by hash into DIR
410 -z, --compress compress file data during the transfer
411 --compress-level=NUM explicitly set compression level
412 --skip-compress=LIST skip compressing files with suffix in LIST
413 @@ -2007,6 +2008,48 @@ bf(--link-dest) from working properly for a non-super-user when bf(-o) was
414 specified (or implied by bf(-a)). You can work-around this bug by avoiding
415 the bf(-o) option when sending to an old rsync.
417 +dit(bf(--link-by-hash=DIR)) This option hard links the destination files into
418 +em(DIR), a link farm arranged by MD5 file hash. The result is that the system
419 +will only store (usually) one copy of the unique contents of each file,
420 +regardless of the file's name (it will use extra files if the links overflow
421 +the available maximum).
423 +This patch does not take into account file permissions, extended attributes,
424 +or ACLs when linking things together, so you should only use this if you
425 +don't care about preserving those extra file attributes (or if they are
426 +always the same for identical files).
428 +The DIR is relative to the destination directory, so either specify a full
429 +path to the hash hierarchy, or specify a relative path that puts the links
430 +outside the destination (e.g. "../links").
432 +Keep in mind that the hierarchy is never pruned, so if you need to reclaim
433 +space, you should remove any files that have just one link (since they are not
434 +linked into any destination dirs anymore):
436 + find $DIR -links 1 -delete
438 +The link farm's directory hierarchy is determined by the file's (32-char) MD5
439 +hash and the file-length. The hash is split up into directory shards. For
440 +example, if a file is 54321 bytes long, it could be stored like this:
442 + $DIR/123/456/789/01234567890123456789012.54321.0
444 +Note that the directory layout in this patch was modified for version 3.1.0,
445 +so anyone using an older version of this patch should move their existing
446 +link hierarchy out of the way and then use the newer rsync to copy the saved
447 +hierarchy into its new layout. Assuming that no files have overflowed their
448 +link limits, this would work:
451 + rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
455 +If some of your files are at their link limit, you'd be better off using a
456 +script to calculate the md5 sum of each file in the hierarchy and move it
457 +to its new location.
459 dit(bf(-z, --compress)) With this option, rsync compresses the file data
460 as it is sent to the destination machine, which reduces the amount of data
461 being transmitted -- something that is useful over a slow connection.
462 diff --git a/rsyncd.conf.yo b/rsyncd.conf.yo
465 @@ -297,6 +297,21 @@ message telling them to try later. The default is 0, which means no limit.
466 A negative value disables the module.
467 See also the "lock file" parameter.
469 +dit(bf(link by hash dir)) When the "link by hash dir" parameter is set to a
470 +non-empty string, received files will be hard linked into em(DIR), a link farm
471 +arranged by MD5 file hash. See the bf(--link-by-hash) option for a full explanation.
474 +The em(DIR) must be accessible inside any chroot restrictions for the module,
475 +but can exist outside the transfer location if there is an inside-the-chroot
476 +path to the module (see "use chroot"). Note that a user-specified option does
477 +not allow this outside-the-transfer-area placement.
479 +If this parameter is set, it will disable the bf(--link-by-hash) command-line
480 +option for copies into the module.
482 +The default is for this parameter to be unset.
484 dit(bf(log file)) When the "log file" parameter is set to a non-empty
485 string, the rsync daemon will log messages to the indicated file rather
486 than using syslog. This is particularly useful on systems (such as AIX)