Applied to the main code as option --outbuf=N|L|B.
[rsync-patches.git] / link-by-hash.diff
1 Jason M. Felice wrote:
2
3 This patch adds the --link-by-hash=DIR option, which hard links received files
4 in a link farm arranged by MD4 or MD5 file hash.  The result is that the system
5 will only store one copy of the unique contents of each file, regardless of the
6 file's name.
7
8 To use this patch, run these commands for a successful build:
9
10     patch -p1 <patches/link-by-hash.diff
11     ./prepare-source
12     ./configure
13     make
14
15 based-on: 60ef39705797c9df7069297eb4ed5feab5e88f29
16 diff --git a/Makefile.in b/Makefile.in
17 --- a/Makefile.in
18 +++ b/Makefile.in
19 @@ -37,7 +37,7 @@ OBJS1=flist.o rsync.o generator.o receiver.o cleanup.o sender.o exclude.o \
20         util.o main.o checksum.o match.o syscall.o log.o backup.o delete.o
21  OBJS2=options.o io.o compat.o hlink.o token.o uidlist.o socket.o hashtable.o \
22         fileio.o batch.o clientname.o chmod.o acls.o xattrs.o
23 -OBJS3=progress.o pipe.o
24 +OBJS3=progress.o pipe.o hashlink.o
25  DAEMON_OBJ = params.o loadparm.o clientserver.o access.o connection.o authenticate.o
26  popt_OBJS=popt/findme.o  popt/popt.o  popt/poptconfig.o \
27         popt/popthelp.o popt/poptparse.o
28 diff --git a/checksum.c b/checksum.c
29 --- a/checksum.c
30 +++ b/checksum.c
31 @@ -21,6 +21,7 @@
32  
33  #include "rsync.h"
34  
35 +extern int checksum_len;
36  extern int checksum_seed;
37  extern int protocol_version;
38  
39 @@ -221,3 +222,24 @@ int sum_end(char *sum)
40  
41         return MD4_DIGEST_LEN;
42  }
43 +
44 +const char *sum_as_hex(const char *sum) /* format the first checksum_len bytes of sum as lowercase hex */
45 +{
46 +       static char buf[MAX_DIGEST_LEN*2+1]; /* static storage: each call overwrites the previous result */
47 +       int i, x1, x2;
48 +       char *c = buf + checksum_len*2; /* start at the terminator position; digits are filled in backwards */
49 +
50 +       assert(c - buf < (int)sizeof buf); /* two digits per byte plus the NUL must fit in buf */
51 +
52 +       *c = '\0';
53 +
54 +       for (i = checksum_len; --i >= 0; ) { /* walk the sum from its last byte down to byte 0 */
55 +               x1 = CVAL(sum, i);
56 +               x2 = x1 >> 4; /* high nibble */
57 +               x1 &= 0xF; /* low nibble */
58 +               *--c = x1 <= 9 ? x1 + '0' : x1 + 'a' - 10; /* low digit is stored first because c moves backwards */
59 +               *--c = x2 <= 9 ? x2 + '0' : x2 + 'a' - 10;
60 +       }
61 +
62 +       return buf; /* c is back at buf here, so the string starts at the first digit */
63 +}
64 diff --git a/clientserver.c b/clientserver.c
65 --- a/clientserver.c
66 +++ b/clientserver.c
67 @@ -50,6 +50,7 @@ extern int logfile_format_has_i;
68  extern int logfile_format_has_o_or_i;
69  extern char *bind_address;
70  extern char *config_file;
71 +extern char *link_by_hash_dir;
72  extern char *logfile_format;
73  extern char *files_from;
74  extern char *tmpdir;
75 @@ -551,6 +552,9 @@ static int rsync_module(int f_in, int f_out, int i, const char *addr, const char
76                 return -1;
77         }
78  
79 +       if (*lp_link_by_hash_dir(i))
80 +               link_by_hash_dir = lp_link_by_hash_dir(i);
81 +
82         if (am_daemon && am_server) {
83                 rprintf(FLOG, "rsync allowed access on module %s from %s (%s)\n",
84                         name, host, addr);
85 diff --git a/compat.c b/compat.c
86 --- a/compat.c
87 +++ b/compat.c
88 @@ -55,6 +55,7 @@ extern char *partial_dir;
89  extern char *dest_option;
90  extern char *files_from;
91  extern char *filesfrom_host;
92 +extern char *link_by_hash_dir;
93  extern filter_rule_list filter_list;
94  extern int need_unsorted_flist;
95  #ifdef ICONV_OPTION
96 @@ -328,4 +329,8 @@ void setup_protocol(int f_out,int f_in)
97         } else {
98                 checksum_seed = read_int(f_in);
99         }
100 +       if (!am_sender && link_by_hash_dir && protocol_version < 30 && checksum_seed != 1) {
101 +               rprintf(FERROR, "You must specify --checksum-seed=1 when using --link-by-hash with an old version of rsync.\n");
102 +               exit_cleanup(RERR_PROTOCOL);
103 +       }
104  }
105 diff --git a/hashlink.c b/hashlink.c
106 new file mode 100644
107 --- /dev/null
108 +++ b/hashlink.c
109 @@ -0,0 +1,334 @@
110 +/*
111 +   Copyright (C) Cronosys, LLC 2004
112 +
113 +   This program is free software; you can redistribute it and/or modify
114 +   it under the terms of the GNU General Public License as published by
115 +   the Free Software Foundation; either version 2 of the License, or
116 +   (at your option) any later version.
117 +
118 +   This program is distributed in the hope that it will be useful,
119 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
120 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
121 +   GNU General Public License for more details.
122 +
123 +   You should have received a copy of the GNU General Public License
124 +   along with this program; if not, write to the Free Software
125 +   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
126 +*/
127 +
128 +/* This file contains code used by the --link-by-hash option. */
129 +
130 +#include "rsync.h"
131 +
132 +extern char *link_by_hash_dir;
133 +extern char sender_file_sum[MAX_DIGEST_LEN];
134 +
135 +#ifdef HAVE_LINK
136 +
137 +/* Return "<link_by_hash_dir>/<first 8 hex digits>/<remaining hex digits>" for the
138 + * sum in sender_file_sum.  This is always called after a file is received, so that
139 + * buffer holds the transferred file's checksum.  The caller must free() the result. */
140 +static char *make_hash_name(void)
141 +{
142 +       const char *hex = sum_as_hex(sender_file_sum);
143 +       char *dst;
144 +
145 +       if (asprintf(&dst, "%s/%.8s/%s", link_by_hash_dir, hex, hex+8) < 0) /* assumes hex has at least 8 digits -- TODO confirm */
146 +               out_of_memory("make_hash_name");
147 +
148 +       return dst;
149 +}
150 +
151 +/* Release one candidate entry: its name string, its descriptor and the node itself. */
152 +static void kill_hashfile(struct hashfile_struct *hashfile)
153 +{
154 +       if (!hashfile) /* NULL is accepted and ignored */
155 +               return;
156 +       free(hashfile->name);
157 +       close(hashfile->fd); /* NOTE(review): unconditional, so callers must have assigned fd before calling */
158 +       free(hashfile); /* neighbours' next/prev are not touched; unlinking is the caller's job */
159 +}
160 +
161 +/* Release every entry of the circular list headed by hashfiles (NULL means an empty list). */
162 +static void kill_hashfiles(struct hashfile_struct *hashfiles)
163 +{
164 +       struct hashfile_struct *iter, *next;
165 +       if ((iter = hashfiles) != NULL) {
166 +               do {
167 +                       next = iter->next; /* fetch the successor before iter is freed */
168 +                       kill_hashfile(iter);
169 +                       iter = next;
170 +               } while (iter != hashfiles); /* circular list: done once we are back at the head */
171 +       }
172 +}
173 +
174 +/* Scan directory hashname for same-sized candidates; returns a circular list of opened entries, or NULL. */
175 +static struct hashfile_struct *find_hashfiles(char *hashname, int64 size, long *fnbr)
176 +{
177 +       DIR *d;
178 +       struct dirent *di;
179 +       struct hashfile_struct *hashfiles = NULL, *hashfile;
180 +       STRUCT_STAT st;
181 +       long this_fnbr;
182 +
183 +       *fnbr = 0;
184 +
185 +       /* Build a list of potential candidates and open them.  *fnbr ends up as
186 +        * the largest numeric entry name seen (0 if none), whatever its size. */
187 +       if ((d = opendir(hashname)) == NULL) {
188 +               rsyserr(FERROR, errno, "opendir failed: \"%s\"", hashname);
189 +               /* Do not free hashname here: link_by_hash() still uses it and frees it. */
190 +               return NULL;
191 +       }
192 +       while ((di = readdir(d)) != NULL) {
193 +               if (!strcmp(di->d_name,".") || !strcmp(di->d_name,".."))
194 +                       continue;
195 +
196 +               /* Track the largest fnbr in case we need to store a new file. */
197 +               this_fnbr = atol(di->d_name);
198 +               if (this_fnbr > *fnbr)
199 +                       *fnbr = this_fnbr;
200 +
201 +               if (!(hashfile = new_array(struct hashfile_struct, 1)))
202 +                       out_of_memory("find_hashfiles");
203 +               hashfile->fd = -1; /* kill_hashfile() always close()s fd; never hand it garbage */
204 +               if (asprintf(&hashfile->name,"%s/%s",hashname, di->d_name) < 0)
205 +                       out_of_memory("find_hashfiles");
206 +               if (do_stat(hashfile->name,&st) == -1) {
207 +                       rsyserr(FERROR, errno, "stat failed: %s", hashfile->name);
208 +                       kill_hashfile(hashfile);
209 +                       continue;
210 +               }
211 +               if (st.st_size != size) {
212 +                       kill_hashfile(hashfile);
213 +                       continue;
214 +               }
215 +               hashfile->nlink = st.st_nlink;
216 +               hashfile->fd = open(hashfile->name,O_RDONLY|O_BINARY);
217 +               if (hashfile->fd == -1) {
218 +                       rsyserr(FERROR, errno, "open failed: %s", hashfile->name);
219 +                       kill_hashfile(hashfile);
220 +                       continue;
221 +               }
222 +               if (hashfiles == NULL)
223 +                       hashfiles = hashfile->next = hashfile->prev = hashfile;
224 +               else {
225 +                       hashfile->next = hashfiles;
226 +                       hashfile->prev = hashfiles->prev;
227 +                       hashfile->next->prev = hashfile;
228 +                       hashfile->prev->next = hashfile;
229 +               }
230 +       }
231 +       closedir(d);
232 +
233 +       return hashfiles;
234 +}
235 +
236 +/* Compare fd's contents with every candidate in files; return one identical candidate or NULL, freeing the rest. */
237 +static struct hashfile_struct *compare_hashfiles(int fd,struct hashfile_struct *files)
238 +{
239 +       int amt, hamt;
240 +       char buffer[BUFSIZ], cmpbuffer[BUFSIZ];
241 +       struct hashfile_struct *iter, *next, *best;
242 +       uint32 nlink;
243 +
244 +       if (!files)
245 +               return NULL;
246 +
247 +       iter = files; /* in case files are 0 bytes */
248 +       while ((amt = read(fd, buffer, BUFSIZ)) > 0) { /* one chunk of the new file per pass */
249 +               iter = files;
250 +               do {
251 +                       /* Icky bit to resync when we steal the first node. */
252 +                       if (!files)
253 +                               files = iter;
254 +
255 +                       next = iter->next;
256 +
257 +                       hamt = read(iter->fd, cmpbuffer, BUFSIZ); /* the candidate's next chunk */
258 +                       if (amt != hamt || memcmp(buffer, cmpbuffer, amt)) { /* chunk differs: drop this candidate */
259 +                               if (iter == files) {
260 +                                       files = files->prev; /* the head is being removed; the tail becomes the reference node */
261 +                               }
262 +                               if (iter->next == iter) { /* iter was the only node left */
263 +                                       files = next = NULL;
264 +                               } else {
265 +                                       next = iter->next;
266 +                                       if (iter == files) { /* NOTE(review): files was moved to files->prev above, so this looks unreachable -- confirm */
267 +                                               /* So we know to resync */
268 +                                               files = NULL;
269 +                                       }
270 +                               }
271 +                               iter->next->prev = iter->prev; /* unlink iter from the circular list */
272 +                               iter->prev->next = iter->next;
273 +                               kill_hashfile(iter);
274 +                       }
275 +
276 +                       iter = next;
277 +               } while (iter != files); /* NOTE(review): after a head removal files is the old tail, so the loop appears to stop before reading it for this chunk -- confirm */
278 +
279 +               if (iter == NULL && files == NULL) {
280 +                       /* There are no matches. */
281 +                       return NULL;
282 +               }
283 +       }
284 +
285 +       if (amt == -1) { /* reading fd itself failed: give up and release the survivors */
286 +               rsyserr(FERROR, errno, "read failed in compare_hashfiles()");
287 +               kill_hashfiles(files);
288 +               return NULL;
289 +       }
290 +
291 +       /* If we only have one file left, use it. */
292 +       if (files == files->next) {
293 +               return files; /* ownership passes to the caller, which releases it with kill_hashfile() */
294 +       }
295 +
296 +       /* All files which remain in the list are identical and should have
297 +        * the same size.  We pick the one with the lowest link count (we
298 +        * may have rolled over because we hit the maximum link count for
299 +        * the filesystem). */
300 +       best = iter = files;
301 +       nlink = iter->nlink;
302 +       do {
303 +               if (iter->nlink < nlink) {
304 +                       nlink = iter->nlink;
305 +                       best = iter;
306 +               }
307 +               iter = iter->next;
308 +       } while (iter != files);
309 +
310 +       best->next->prev = best->prev; /* take best out of the list ... */
311 +       best->prev->next = best->next;
312 +       if (files == best)
313 +               files = files->next; /* ... keep a valid head for the remainder ... */
314 +       kill_hashfiles(files); /* ... and release every other survivor */
315 +       return best;
316 +}
317 +
318 +/* Install fnametmp as fname, sharing storage with an identical file in the link_by_hash_dir farm when one exists. */
319 +int link_by_hash(const char *fnametmp, const char *fname, struct file_struct *file)
320 +{
321 +       STRUCT_STAT st;
322 +       char *hashname;
323 +       int first = 0, rc;
324 +       char *linkname;
325 +       long last_fnbr;
326 +
327 +       if (F_LENGTH(file) == 0) /* zero-length files are just renamed, never hash-linked */
328 +               return robust_rename(fnametmp, fname, NULL, 0644);
329 +       hashname = make_hash_name(); /* allocated only after the early return above so it cannot leak */
330 +       if (do_stat(hashname, &st) == -1) {
331 +               char *dirname;
332 +               /* Directory does not exist: create its parent, then the directory itself. */
333 +               if (!(dirname = strdup(hashname)))
334 +                       out_of_memory("link_by_hash");
335 +               *strrchr(dirname,'/') = 0;
336 +               if (do_mkdir(dirname, 0755) == -1 && errno != EEXIST) {
337 +                       rsyserr(FERROR, errno, "mkdir failed: %s", dirname);
338 +                       free(hashname);
339 +                       free(dirname);
340 +                       return robust_rename(fnametmp, fname, NULL, 0644);
341 +               }
342 +               free(dirname);
343 +
344 +               if (do_mkdir(hashname, 0755) == -1 && errno != EEXIST) {
345 +                       rsyserr(FERROR, errno, "mkdir failed: %s", hashname);
346 +                       free(hashname);
347 +                       return robust_rename(fnametmp, fname, NULL, 0644);
348 +               }
349 +
350 +               first = 1;
351 +               if (asprintf(&linkname,"%s/0",hashname) < 0)
352 +                       out_of_memory("link_by_hash");
353 +               if (DEBUG_GTE(HASHLINK, 2))
354 +                       rprintf(FINFO, "(1) linkname = %s\n", linkname);
355 +       } else {
356 +               struct hashfile_struct *hashfiles, *hashfile;
357 +
358 +               if (do_stat(fnametmp,&st) == -1) {
359 +                       rsyserr(FERROR, errno, "stat failed: %s", fnametmp);
360 +                       free(hashname);
361 +                       return -1;
362 +               }
363 +               hashfiles = find_hashfiles(hashname, st.st_size, &last_fnbr);
364 +               if (hashfiles == NULL) {
365 +                       first = 1;
366 +                       if (asprintf(&linkname,"%s/0",hashname) < 0)
367 +                               out_of_memory("link_by_hash");
368 +                       if (DEBUG_GTE(HASHLINK, 2))
369 +                               rprintf(FINFO, "(2) linkname = %s\n", linkname);
370 +               } else {
371 +                       int fd;
372 +                       /* Search for one identical to us. */
373 +                       if ((fd = open(fnametmp,O_RDONLY|O_BINARY)) == -1) {
374 +                               rsyserr(FERROR, errno, "open failed: %s", fnametmp);
375 +                               kill_hashfiles(hashfiles);
376 +                               free(hashname);
377 +                               return -1;
378 +                       }
379 +                       hashfile = compare_hashfiles(fd, hashfiles); /* consumes the candidate list */
380 +                       close(fd);
381 +                       if (hashfile) {
382 +                               first = 0;
383 +                               if (!(linkname = strdup(hashfile->name)))
384 +                                       out_of_memory("link_by_hash");
385 +                               if (DEBUG_GTE(HASHLINK, 2))
386 +                                       rprintf(FINFO, "(3) linkname = %s\n", linkname);
387 +                               kill_hashfile(hashfile);
388 +                       } else {
389 +                               first = 1;
390 +                               if (asprintf(&linkname, "%s/%ld", hashname, last_fnbr + 1) < 0)
391 +                                       out_of_memory("link_by_hash");
392 +                               if (DEBUG_GTE(HASHLINK, 2))
393 +                                       rprintf(FINFO, "(4) linkname = %s\n", linkname);
394 +                       }
395 +               }
396 +       }
397 +
398 +       if (!first) {
399 +               if (DEBUG_GTE(HASHLINK, 2))
400 +                       rprintf(FINFO, "link-by-hash (existing): \"%s\" -> %s\n", linkname, full_fname(fname));
401 +               robust_unlink(fname);
402 +               rc = do_link(linkname, fname);
403 +               if (rc == -1) {
404 +                       if (errno == EMLINK) {
405 +                               first = 1;
406 +                               free(linkname);
407 +                               if (asprintf(&linkname,"%s/%ld",hashname, last_fnbr + 1) < 0)
408 +                                       out_of_memory("link_by_hash");
409 +                               if (DEBUG_GTE(HASHLINK, 2))
410 +                                       rprintf(FINFO, "(5) linkname = %s\n", linkname);
411 +                               if (DEBUG_GTE(HASHLINK, 1))
412 +                                       rprintf(FINFO, "link-by-hash: max link count exceeded, starting new file \"%s\".\n", linkname);
413 +                       } else {
414 +                               rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
415 +                                       linkname, full_fname(fname));
416 +                               rc = robust_rename(fnametmp, fname, NULL, 0644);
417 +                       }
418 +               } else {
419 +                       do_unlink(fnametmp);
420 +               }
421 +       }
422 +
423 +       if (first) {
424 +               if (DEBUG_GTE(HASHLINK, 2))
425 +                       rprintf(FINFO, "link-by-hash (new): %s -> \"%s\"\n", full_fname(fname),linkname);
426 +
427 +               /* Link fname into the farm only once the rename has worked; a negative
428 +                * rename result is returned as-is instead of being masked by do_link(). */
429 +               rc = robust_rename(fnametmp, fname, NULL, 0644);
430 +               if (rc < 0) {
431 +                       rsyserr(FERROR, errno, "rename \"%s\" -> \"%s\"",
432 +                               full_fname(fnametmp), full_fname(fname));
433 +               } else if ((rc = do_link(fname,linkname)) != 0) {
434 +                       rsyserr(FERROR, errno, "link \"%s\" -> \"%s\"",
435 +                               full_fname(fname), linkname);
436 +               }
437 +       }
438 +
439 +       free(linkname);
440 +       free(hashname);
441 +       return rc;
442 +}
443 +#endif
444 diff --git a/loadparm.c b/loadparm.c
445 --- a/loadparm.c
446 +++ b/loadparm.c
447 @@ -119,6 +119,7 @@ typedef struct {
448         char *include;
449         char *include_from;
450         char *incoming_chmod;
451 +       char *link_by_hash_dir;
452         char *lock_file;
453         char *log_file;
454         char *log_format;
455 @@ -195,6 +196,7 @@ static const all_vars Defaults = {
456   /* include; */                        NULL,
457   /* include_from; */           NULL,
458   /* incoming_chmod; */         NULL,
459 + /* link_by_hash_dir; */       NULL,
460   /* lock_file; */              DEFAULT_LOCK_FILE,
461   /* log_file; */               NULL,
462   /* log_format; */             "%o %h [%a] %m (%u) %f %l",
463 @@ -336,6 +338,7 @@ static struct parm_struct parm_table[] =
464   {"include from",      P_STRING, P_LOCAL, &Vars.l.include_from,        NULL,0},
465   {"include",           P_STRING, P_LOCAL, &Vars.l.include,             NULL,0},
466   {"incoming chmod",    P_STRING, P_LOCAL, &Vars.l.incoming_chmod,      NULL,0},
467 + {"link by hash dir",  P_STRING, P_LOCAL, &Vars.l.link_by_hash_dir,    NULL,0},
468   {"list",              P_BOOL,   P_LOCAL, &Vars.l.list,                NULL,0},
469   {"lock file",         P_STRING, P_LOCAL, &Vars.l.lock_file,           NULL,0},
470   {"log file",          P_STRING, P_LOCAL, &Vars.l.log_file,            NULL,0},
471 @@ -464,6 +467,7 @@ FN_LOCAL_STRING(lp_hosts_deny, hosts_deny)
472  FN_LOCAL_STRING(lp_include, include)
473  FN_LOCAL_STRING(lp_include_from, include_from)
474  FN_LOCAL_STRING(lp_incoming_chmod, incoming_chmod)
475 +FN_LOCAL_STRING(lp_link_by_hash_dir, link_by_hash_dir)
476  FN_LOCAL_STRING(lp_lock_file, lock_file)
477  FN_LOCAL_STRING(lp_log_file, log_file)
478  FN_LOCAL_STRING(lp_log_format, log_format)
479 diff --git a/log.c b/log.c
480 --- a/log.c
481 +++ b/log.c
482 @@ -683,23 +683,14 @@ static void log_formatted(enum logcode code, const char *format, const char *op,
483                         if (protocol_version >= 30
484                          && (iflags & ITEM_TRANSFER
485                           || (always_checksum && S_ISREG(file->mode)))) {
486 -                               int i, x1, x2;
487                                 const char *sum = iflags & ITEM_TRANSFER
488                                                 ? sender_file_sum : F_SUM(file);
489 -                               c = buf2 + checksum_len*2;
490 -                               *c = '\0';
491 -                               for (i = checksum_len; --i >= 0; ) {
492 -                                       x1 = CVAL(sum, i);
493 -                                       x2 = x1 >> 4;
494 -                                       x1 &= 0xF;
495 -                                       *--c = x1 <= 9 ? x1 + '0' : x1 + 'a' - 10;
496 -                                       *--c = x2 <= 9 ? x2 + '0' : x2 + 'a' - 10;
497 -                               }
498 +                               n = sum_as_hex(sum);
499                         } else {
500                                 memset(buf2, ' ', checksum_len*2);
501                                 buf2[checksum_len*2] = '\0';
502 +                               n = buf2;
503                         }
504 -                       n = buf2;
505                         break;
506                 case 'i':
507                         if (iflags & ITEM_DELETED) {
508 diff --git a/options.c b/options.c
509 --- a/options.c
510 +++ b/options.c
511 @@ -159,6 +159,7 @@ char *backup_suffix = NULL;
512  char *tmpdir = NULL;
513  char *partial_dir = NULL;
514  char *basis_dir[MAX_BASIS_DIRS+1];
515 +char *link_by_hash_dir = NULL;
516  char *config_file = NULL;
517  char *shell_cmd = NULL;
518  char *logfile_name = NULL;
519 @@ -208,7 +209,7 @@ static const char *debug_verbosity[] = {
520         /*2*/ "BIND,CMD,CONNECT,DEL,DELTASUM,DUP,FILTER,FLIST,ICONV",
521         /*3*/ "ACL,BACKUP,CONNECT2,DELTASUM2,DEL2,EXIT,FILTER2,FLIST2,FUZZY,GENR,OWN,RECV,SEND,TIME",
522         /*4*/ "CMD2,DELTASUM3,DEL3,EXIT2,FLIST3,ICONV2,OWN2,PROTO,TIME2",
523 -       /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HLINK",
524 +       /*5*/ "CHDIR,DELTASUM4,FLIST4,FUZZY2,HASH,HASHLINK,HLINK",
525  };
526  
527  #define MAX_VERBOSITY ((int)(sizeof debug_verbosity / sizeof debug_verbosity[0]) - 1)
528 @@ -278,6 +279,7 @@ static struct output_struct debug_words[COUNT_DEBUG+1] = {
529         DEBUG_WORD(FUZZY, W_REC, "Debug fuzzy scoring (levels 1-2)"),
530         DEBUG_WORD(GENR, W_REC, "Debug generator functions"),
531         DEBUG_WORD(HASH, W_SND|W_REC, "Debug hashtable code"),
532 +       DEBUG_WORD(HASHLINK, W_REC, "Debug hashlink code (levels 1-2)"),
533         DEBUG_WORD(HLINK, W_SND|W_REC, "Debug hard-link actions (levels 1-3)"),
534         DEBUG_WORD(ICONV, W_CLI|W_SRV, "Debug iconv character conversions (levels 1-2)"),
535         DEBUG_WORD(IO, W_CLI|W_SRV, "Debug I/O routines (levels 1-4)"),
536 @@ -759,6 +761,7 @@ void usage(enum logcode F)
537    rprintf(F,"     --compare-dest=DIR      also compare destination files relative to DIR\n");
538    rprintf(F,"     --copy-dest=DIR         ... and include copies of unchanged files\n");
539    rprintf(F,"     --link-dest=DIR         hardlink to files in DIR when unchanged\n");
540 +  rprintf(F,"     --link-by-hash=DIR      create hardlinks by hash into DIR\n");
541    rprintf(F," -z, --compress              compress file data during the transfer\n");
542    rprintf(F,"     --compress-level=NUM    explicitly set compression level\n");
543    rprintf(F,"     --skip-compress=LIST    skip compressing files with a suffix in LIST\n");
544 @@ -811,7 +814,7 @@ enum {OPT_VERSION = 1000, OPT_DAEMON, OPT_SENDER, OPT_EXCLUDE, OPT_EXCLUDE_FROM,
545        OPT_FILTER, OPT_COMPARE_DEST, OPT_COPY_DEST, OPT_LINK_DEST, OPT_HELP,
546        OPT_INCLUDE, OPT_INCLUDE_FROM, OPT_MODIFY_WINDOW, OPT_MIN_SIZE, OPT_CHMOD,
547        OPT_READ_BATCH, OPT_WRITE_BATCH, OPT_ONLY_WRITE_BATCH, OPT_MAX_SIZE,
548 -      OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG,
549 +      OPT_NO_D, OPT_APPEND, OPT_NO_ICONV, OPT_INFO, OPT_DEBUG, OPT_LINK_BY_HASH,
550        OPT_USERMAP, OPT_GROUPMAP, OPT_CHOWN, OPT_BWLIMIT,
551        OPT_SERVER, OPT_REFUSED_BASE = 9000};
552  
553 @@ -955,6 +958,7 @@ static struct poptOption long_options[] = {
554    {"compare-dest",     0,  POPT_ARG_STRING, 0, OPT_COMPARE_DEST, 0, 0 },
555    {"copy-dest",        0,  POPT_ARG_STRING, 0, OPT_COPY_DEST, 0, 0 },
556    {"link-dest",        0,  POPT_ARG_STRING, 0, OPT_LINK_DEST, 0, 0 },
557 +  {"link-by-hash",     0,  POPT_ARG_STRING, 0, OPT_LINK_BY_HASH, 0, 0},
558    {"fuzzy",           'y', POPT_ARG_NONE,   0, 'y', 0, 0 },
559    {"no-fuzzy",         0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
560    {"no-y",             0,  POPT_ARG_VAL,    &fuzzy_basis, 0, 0, 0 },
561 @@ -1299,6 +1303,9 @@ int parse_arguments(int *argc_p, const char ***argv_p)
562                 iconv_opt = strdup(arg);
563  #endif
564  
565 +       if (*lp_link_by_hash_dir(module_id))
566 +               set_refuse_options("link-by-hash");
567 +
568         /* TODO: Call poptReadDefaultConfig; handle errors. */
569  
570         /* The context leaks in case of an error, but if there's a
571 @@ -1785,6 +1792,21 @@ int parse_arguments(int *argc_p, const char ***argv_p)
572                         return 0;
573  #endif
574  
575 +                case OPT_LINK_BY_HASH:
576 +#ifdef HAVE_LINK
577 +                       arg = poptGetOptArg(pc);
578 +                       if (sanitize_paths)
579 +                               arg = sanitize_path(NULL, arg, NULL, 0, SP_DEFAULT);
580 +                       link_by_hash_dir = (char *)arg;
581 +                       break;
582 +#else
583 +                       snprintf(err_buf, sizeof err_buf,
584 +                                "hard links are not supported on this %s\n",
585 +                                am_server ? "server" : "client");
586 +                       rprintf(FERROR, "ERROR: %s", err_buf);
587 +                       return 0;
588 +#endif
589 +
590                 default:
591                         /* A large opt value means that set_refuse_options()
592                          * turned this option off. */
593 @@ -2675,6 +2697,11 @@ void server_options(char **args, int *argc_p)
594         } else if (inplace)
595                 args[ac++] = "--inplace";
596  
597 +       if (link_by_hash_dir && am_sender) {
598 +               args[ac++] = "--link-by-hash";
599 +               args[ac++] = link_by_hash_dir;
600 +       }
601 +
602         if (files_from && (!am_sender || filesfrom_host)) {
603                 if (filesfrom_host) {
604                         args[ac++] = "--files-from";
605 diff --git a/rsync.c b/rsync.c
606 --- a/rsync.c
607 +++ b/rsync.c
608 @@ -47,6 +47,7 @@ extern int flist_eof;
609  extern int file_old_total;
610  extern int keep_dirlinks;
611  extern int make_backups;
612 +extern char *link_by_hash_dir;
613  extern struct file_list *cur_flist, *first_flist, *dir_flist;
614  extern struct chmod_mode_struct *daemon_chmod_modes;
615  #ifdef ICONV_OPTION
616 @@ -648,7 +649,12 @@ int finish_transfer(const char *fname, const char *fnametmp,
617         /* move tmp file over real file */
618         if (DEBUG_GTE(RECV, 1))
619                 rprintf(FINFO, "renaming %s to %s\n", fnametmp, fname);
620 -       ret = robust_rename(fnametmp, fname, temp_copy_name, file->mode);
621 +#ifdef HAVE_LINK
622 +       if (link_by_hash_dir)
623 +               ret = link_by_hash(fnametmp, fname, file);
624 +       else
625 +#endif
626 +               ret = robust_rename(fnametmp, fname, temp_copy_name, file->mode);
627         if (ret < 0) {
628                 rsyserr(FERROR_XFER, errno, "%s %s -> \"%s\"",
629                         ret == -2 ? "copy" : "rename",
630 diff --git a/rsync.h b/rsync.h
631 --- a/rsync.h
632 +++ b/rsync.h
633 @@ -909,6 +909,14 @@ struct stats {
634         int xferred_files;
635  };
636  
637 +struct hashfile_struct { /* one candidate file found in a link-by-hash directory */
638 +       struct hashfile_struct *next; /* circular doubly-linked list: a lone node points at itself */
639 +       struct hashfile_struct *prev;
640 +       char *name; /* path built with asprintf(); freed by kill_hashfile() */
641 +       int fd; /* read-only descriptor for name; closed by kill_hashfile() */
642 +       uint32 nlink; /* st_nlink as seen when the directory was scanned */
643 +};
644 +
645  struct chmod_mode_struct;
646  
647  struct flist_ndx_item {
648 @@ -1263,7 +1271,8 @@ extern short info_levels[], debug_levels[];
649  #define DEBUG_FUZZY (DEBUG_FLIST+1)
650  #define DEBUG_GENR (DEBUG_FUZZY+1)
651  #define DEBUG_HASH (DEBUG_GENR+1)
652 -#define DEBUG_HLINK (DEBUG_HASH+1)
653 +#define DEBUG_HASHLINK (DEBUG_HASH+1)
654 +#define DEBUG_HLINK (DEBUG_HASHLINK+1)
655  #define DEBUG_ICONV (DEBUG_HLINK+1)
656  #define DEBUG_IO (DEBUG_ICONV+1)
657  #define DEBUG_OWN (DEBUG_IO+1)
658 diff --git a/rsync.yo b/rsync.yo
659 --- a/rsync.yo
660 +++ b/rsync.yo
661 @@ -416,6 +416,7 @@ to the detailed description below for a complete description.  verb(
662       --compare-dest=DIR      also compare received files relative to DIR
663       --copy-dest=DIR         ... and include copies of unchanged files
664       --link-dest=DIR         hardlink to files in DIR when unchanged
665 +     --link-by-hash=DIR      create hardlinks by hash into DIR
666   -z, --compress              compress file data during the transfer
667       --compress-level=NUM    explicitly set compression level
668       --skip-compress=LIST    skip compressing files with suffix in LIST
669 @@ -1830,6 +1831,19 @@ bf(--link-dest) from working properly for a non-super-user when bf(-o) was
670  specified (or implied by bf(-a)).  You can work-around this bug by avoiding
671  the bf(-o) option when sending to an old rsync.
672  
673 +dit(bf(--link-by-hash=DIR)) This option hard links the destination files into
674 +em(DIR), a link farm arranged by MD5 file hash (or sometimes MD4). The result
675 +is that the system will only store one copy of the unique contents of each
676 +file, regardless of the file's name.
677 +
678 +For a modern rsync (3.0.0 and newer), the link farm's directory hierarchy is
679 +determined by the file's MD5 hash.  It is recommended that you don't use this
680 +option with any rsync older than that.  However, if you really need to be able
681 +to interact with an older rsync on the sending side, you can use the options
682 +bf(--checksum-seed=1) and bf(--protocol=29) to force a consistent MD4 file
683 +checksum that will be used instead of MD5.  Note that this MD4 checksum is not
684 +compatible with older versions of this patch (prior to 3.1.0).
685 +
686  dit(bf(-z, --compress)) With this option, rsync compresses the file data
687  as it is sent to the destination machine, which reduces the amount of data
688  being transmitted -- something that is useful over a slow connection.
689 diff --git a/rsyncd.conf.yo b/rsyncd.conf.yo
690 --- a/rsyncd.conf.yo
691 +++ b/rsyncd.conf.yo
692 @@ -283,6 +283,13 @@ message telling them to try later.  The default is 0, which means no limit.
693  A negative value disables the module.
694  See also the "lock file" parameter.
695  
696 +dit(bf(link by hash dir)) When the "link by hash dir" parameter is set to a
697 +non-empty string, received files will be hard linked into em(DIR), a link farm
698 +arranged by MD5 file hash (or sometimes MD4). See the bf(--link-by-hash) option
699 +for a full explanation.  If this parameter is set it will disable the
700 +bf(--link-by-hash) command-line option. The default is for this parameter to be
701 +unset.
702 +
703  dit(bf(log file)) When the "log file" parameter is set to a non-empty
704  string, the rsync daemon will log messages to the indicated file rather
705  than using syslog. This is particularly useful on systems (such as AIX)