3 This patch adds the --link-by-hash=DIR option, which hard links received files
4 into a link farm arranged by MD4 or MD5 file hash. The result is that the system
5 will only store one copy of the unique contents of each file, regardless of the
8 To use this patch, run these commands for a successful build:
10 patch -p1 <patches/link-by-hash.diff
15 based-on: 6c8ca91c731b7bf2b081694bda85b7dadc2b7aff
16 diff --git a/Makefile.in b/Makefile.in
19 @@ -47,7 +47,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
20 util1.o util2.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
21 OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
22 usage.o fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
23 -OBJS3=progress.o pipe.o @MD5_ASM@ @ROLL_SIMD@ @ROLL_ASM@
24 +OBJS3=progress.o pipe.o hashlink.o @MD5_ASM@ @ROLL_SIMD@ @ROLL_ASM@
25 DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
26 popt_OBJS=popt/findme.o popt/popt.o popt/poptconfig.o \
27 popt/popthelp.o popt/poptparse.o
28 diff --git a/checksum.c b/checksum.c
31 @@ -40,6 +40,8 @@ extern int whole_file;
32 extern int checksum_seed;
33 extern int protocol_version;
34 extern int proper_seed_order;
35 +extern char *link_by_hash_dir;
36 +extern char link_by_hash_extra_sum[MAX_DIGEST_LEN];
37 extern const char *checksum_choice;
39 #define NNI_BUILTIN (1<<0)
40 @@ -539,7 +541,7 @@ void file_checksum(const char *fname, const STRUCT_STAT *st_p, char *sum)
43 static int32 sumresidue;
44 -static md_context ctx_md;
45 +static md_context ctx_md, ctx2_md;
47 static XXH64_state_t* xxh64_state;
49 @@ -597,6 +599,8 @@ int sum_init(struct name_num_item *nni, int seed)
53 + if (link_by_hash_dir)
54 + md5_begin(&ctx2_md);
57 mdfour_begin(&ctx_md);
58 @@ -643,6 +647,8 @@ void sum_update(const char *p, int32 len)
61 md5_update(&ctx_md, (uchar *)p, len);
62 + if (link_by_hash_dir)
63 + md5_update(&ctx2_md, (uchar *)p, len);
67 @@ -709,6 +715,8 @@ void sum_end(char *sum)
70 md5_result(&ctx_md, (uchar *)sum);
71 + if (link_by_hash_dir)
72 + md5_result(&ctx2_md, (uchar *)link_by_hash_extra_sum);
76 diff --git a/clientserver.c b/clientserver.c
79 @@ -53,6 +53,7 @@ extern int logfile_format_has_i;
80 extern int logfile_format_has_o_or_i;
81 extern char *bind_address;
82 extern char *config_file;
83 +extern char *link_by_hash_dir;
84 extern char *logfile_format;
85 extern char *files_from;
87 @@ -736,6 +737,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
91 + if (*lp_link_by_hash_dir(i))
92 + link_by_hash_dir = lp_link_by_hash_dir(i);
95 rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
97 diff --git a/daemon-parm.txt b/daemon-parm.txt
100 @@ -29,6 +29,7 @@ STRING hosts_deny NULL
102 STRING include_from NULL
103 STRING incoming_chmod NULL
104 +STRING link_by_hash_dir NULL
105 STRING lock_file DEFAULT_LOCK_FILE
107 STRING log_format "%o %h [%a] %m (%u) %f %l"
108 diff --git a/hashlink.c b/hashlink.c
114 + Copyright (C) Cronosys, LLC 2004
116 + This program is free software; you can redistribute it and/or modify
117 + it under the terms of the GNU General Public License as published by
118 + the Free Software Foundation; either version 2 of the License, or
119 + (at your option) any later version.
121 + This program is distributed in the hope that it will be useful,
122 + but WITHOUT ANY WARRANTY; without even the implied warranty of
123 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
124 + GNU General Public License for more details.
126 + You should have received a copy of the GNU General Public License
127 + along with this program; if not, write to the Free Software
128 + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
131 +/* This file contains code used by the --link-by-hash option. */
136 +extern int protocol_version;
137 +extern char *link_by_hash_dir;
138 +extern char sender_file_sum[MAX_DIGEST_LEN];
140 +char link_by_hash_extra_sum[MAX_DIGEST_LEN]; /* Only used when md4 sums are in the transfer */
144 +/* This function is always called after a file is received, so the
145 + * sender_file_sum buffer has whatever the last checksum was for the
146 + * transferred file. */
147 +void link_by_hash(const char *fname, const char *fnametmp, struct file_struct *file)
150 + char *hashname, *last_slash, *num_str;
154 + /* We don't bother to hard-link 0-length files. */
155 + if (F_LENGTH(file) == 0)
158 + hex = sum_as_hex(5, protocol_version >= 30 ? sender_file_sum : link_by_hash_extra_sum, 0);
159 + if (asprintf(&hashname, "%s/%.3s/%.3s/%.3s/%s.%s.000000",
160 + link_by_hash_dir, hex, hex+3, hex+6, hex+9, big_num(F_LENGTH(file))) < 0)
162 + out_of_memory("make_hash_name");
165 + last_slash = strrchr(hashname, '/');
166 + num_str = strrchr(last_slash, '.') + 1;
169 + if (num >= 999999) { /* Surely we'll never reach this... */
170 + if (DEBUG_GTE(HASHLINK, 1))
171 + rprintf(FINFO, "link-by-hash: giving up after \"%s\".\n", hashname);
174 + if (num > 0 && DEBUG_GTE(HASHLINK, 1))
175 + rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", hashname);
177 + snprintf(num_str, 7, "%d", num++);
178 + if (do_stat(hashname, &st) < 0)
181 + if (do_link(hashname, fnametmp) < 0) {
182 + if (errno == EMLINK)
184 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", hashname, full_fname(fname));
186 + if (DEBUG_GTE(HASHLINK, 2))
187 + rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", hashname, full_fname(fname));
188 + robust_rename(fnametmp, fname, NULL, 0644);
194 + if (DEBUG_GTE(HASHLINK, 2))
195 + rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname), hashname);
197 + if (do_link(fname, hashname) < 0
198 + && (errno != ENOENT || make_path(hashname, MKP_DROP_NAME) < 0 || do_link(fname, hashname) < 0))
199 + rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"", full_fname(fname), hashname);
205 diff --git a/options.c b/options.c
208 @@ -173,6 +173,7 @@ char *backup_suffix = NULL;
210 char *partial_dir = NULL;
211 char *basis_dir[MAX_BASIS_DIRS+1];
212 +char *link_by_hash_dir = NULL;
213 char *config_file = NULL;
214 char *shell_cmd = NULL;
215 char *logfile_name = NULL;
216 @@ -231,7 +232,7 @@ static const char *debug_verbosity[] = {
217 /*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
218 /*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
219 /*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
220 - /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
221 + /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
224 #define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
225 @@ -302,6 +303,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
226 DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
227 DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
228 DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
229 + DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
230 DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
231 DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
232 DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
233 @@ -582,7 +584,7 @@ enum {OPT_SERVER = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
234 OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
235 OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
236 OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_BLOCK_SIZE,
237 - OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR,
238 + OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT, OPT_STDERR, OPT_LINK_BY_HASH,
239 OPT_OLD_COMPRESS, OPT_NEW_COMPRESS, OPT_NO_COMPRESS, OPT_OLD_ARGS,
240 OPT_STOP_AFTER, OPT_STOP_AT,
241 OPT_REFUSED_BASE = 9000};
242 @@ -743,6 +745,7 @@ static struct poptOption long_options[] = {
243 {"compare-dest", 0, POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
244 {"copy-dest", 0, POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
245 {"link-dest", 0, POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
246 + {"link-by-hash", 0, POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
247 {"fuzzy", 'y', POPT_ARG_NONE, 0, 'y', 0, 0 },
248 {"no-fuzzy", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
249 {"no-y", 0, POPT_ARG_VAL, &fuzzy_basis, 0, 0, 0 },
250 @@ -990,6 +993,9 @@ static void set_refuse_options(void)
254 + if (*lp_link_by_hash_dir(module_id))
255 + parse_one_refuse_match(0, "link-by-hash", list_end);
259 if (!*lp_charset(module_id))
260 @@ -1867,6 +1873,20 @@ int parse_arguments(int *argc_p, const char ***argv_p)
264 + case OPT_LINK_BY_HASH:
266 + arg = poptGetOptArg(pc);
267 + if (sanitize_paths)
268 + arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
269 + link_by_hash_dir = (char *)arg;
272 + snprintf(err_buf, sizeof err_buf,
273 + "hard links are not supported on this %s\n",
274 + am_server ? "server" : "client");
278 case OPT_STOP_AFTER: {
280 arg = poptGetOptArg(pc);
281 @@ -2252,6 +2272,8 @@ int parse_arguments(int *argc_p, const char ***argv_p)
282 tmpdir = sanitize_path(NULL, tmpdir, NULL, 0, SP_DEFAULT);
284 backup_dir = sanitize_path(NULL, backup_dir, NULL, 0, SP_DEFAULT);
285 + if (link_by_hash_dir)
286 + link_by_hash_dir = sanitize_path(NULL, link_by_hash_dir, NULL, 0, SP_DEFAULT);
288 if (daemon_filter_list.head && !am_sender) {
289 filter_rule_list *elp = &daemon_filter_list;
290 @@ -2941,6 +2963,12 @@ void server_options(char **args, int *argc_p)
291 args[ac++] = "--no-W";
294 + if (link_by_hash_dir && am_sender) {
295 + args[ac++] = "--link-by-hash";
296 + args[ac++] = link_by_hash_dir;
297 + link_by_hash_dir = NULL; /* optimize sending-side checksums */
300 if (files_from && (!am_sender || filesfrom_host)) {
301 if (filesfrom_host) {
302 args[ac++] = "--files-from";
303 diff --git a/rsync.1.md b/rsync.1.md
306 @@ -510,6 +510,7 @@ has its own detailed description later in this manpage.
307 --compare-dest=DIR also compare destination files relative to DIR
308 --copy-dest=DIR ... and include copies of unchanged files
309 --link-dest=DIR hardlink to files in DIR when unchanged
310 +--link-by-hash=DIR create hardlinks by hash into DIR
311 --compress, -z compress file data during the transfer
312 --compress-choice=STR choose the compression algorithm (aka --zc)
313 --compress-level=NUM explicitly set compression level (aka --zl)
314 @@ -2720,6 +2721,50 @@ expand it.
315 this bug by avoiding the `-o` option (or using `--no-o`) when sending to an
318 +0. `--link-by-hash=DIR`
320 + This option hard links the destination files into _DIR_, a link farm
321 + arranged by MD5 file hash. The result is that the system will (usually)
322 + store only one copy of the unique contents of each file, regardless of the
323 + file's name (it will use extra files if the links overflow the available
326 + This option does not take into account file permissions, extended
327 + attributes, or ACLs when linking things together, so you should only use
328 + this if you don't care about preserving those extra file attributes (or if
329 + they are always the same for identical files).
331 + The _DIR_ is relative to the destination directory, so either specify a full
332 + path to the hash hierarchy, or specify a relative path that puts the links
333 + outside the destination (e.g. "../links").
335 + Keep in mind that the hierarchy is never pruned, so if you need to reclaim
336 + space, you should remove any files that have just one link (since they are
337 + not linked into any destination dirs anymore):
339 + > find $DIR -links 1 -delete
341 + The link farm's directory hierarchy is determined by the file's (32-char)
342 + MD5 hash and the file-length. The hash is split up into directory shards.
343 + For example, if a file is 54321 bytes long, it could be stored like this:
345 + > $DIR/123/456/789/01234567890123456789012.54321.0
347 + Note that the directory layout in this patch was modified for version
348 + 3.1.0, so anyone using an older version of this patch should move their
349 + existing link hierarchy out of the way and then use the newer rsync to copy
350 + the saved hierarchy into its new layout. Assuming that no files have
351 + overflowed their link limits, this would work:
354 + > rsync -aiv --link-by-hash=$DIR $DIR.old/ $DIR.tmp/
358 + If some of your files are at their link limit, you'd be better off using a
359 + script to calculate the md5 sum of each file in the hierarchy and move it
360 + to its new location.
362 0. `--compress`, `-z`
364 With this option, rsync compresses the file data as it is sent to the
365 diff --git a/rsync.c b/rsync.c
368 @@ -52,6 +52,7 @@ extern int flist_eof;
369 extern int file_old_total;
370 extern int keep_dirlinks;
371 extern int make_backups;
372 +extern char *link_by_hash_dir;
373 extern int sanitize_paths;
374 extern struct file_list *cur_flist, *first_flist, *dir_flist;
375 extern struct chmod_mode_struct *daemon_chmod_modes;
376 @@ -760,6 +761,10 @@ int finish_transfer(const char *fname, const char *fnametmp,
379 /* The file was moved into place (not copied), so it's done. */
381 + if (link_by_hash_dir)
382 + link_by_hash(fname, fnametmp, file);
386 /* The file was copied, so tweak the perms of the copied file. If it
387 diff --git a/rsync.h b/rsync.h
390 @@ -1446,7 +1446,8 @@ extern short info_levels[], debug_levels[];
391 #define DEBUG_FUZZY (DEBUG_FLIST+1)
392 #define DEBUG_GENR (DEBUG_FUZZY+1)
393 #define DEBUG_HASH (DEBUG_GENR+1)
394 -#define DEBUG_HLINK (DEBUG_HASH+1)
395 +#define DEBUG_HASHLINK (DEBUG_HASH+1)
396 +#define DEBUG_HLINK (DEBUG_HASHLINK+1)
397 #define DEBUG_ICONV (DEBUG_HLINK+1)
398 #define DEBUG_IO (DEBUG_ICONV+1)
399 #define DEBUG_NSTR (DEBUG_IO+1)
400 diff --git a/rsyncd.conf.5.md b/rsyncd.conf.5.md
401 --- a/rsyncd.conf.5.md
402 +++ b/rsyncd.conf.5.md
403 @@ -388,6 +388,23 @@ in the values of parameters. See that section for details.
404 is 0, which means no limit. A negative value disables the module. See
405 also the "[lock file](#)" parameter.
407 +0. `link by hash dir`
409 + When the "link by hash dir" parameter is set to a non-empty string,
410 + received files will be hard linked into **DIR**, a link farm arranged by
411 + MD5 file hash. See the `--link-by-hash` option for a full explanation.
413 + The **DIR** must be accessible inside any chroot restrictions for the
414 + module, but can exist outside the transfer location if there is an
415 + inside-the-chroot path to the module (see "use chroot"). Note that a
416 + user-specified option does not allow this outside-the-transfer-area
419 + If this parameter is set, it will disable the `--link-by-hash` command-line
420 + option for copies into the module.
422 + The default is for this parameter to be unset.
426 When the "log file" parameter is set to a non-empty string, the rsync