1 /* Dupmerge - Reclaim disk space by linking identical files together
3 This is a utility that scans a UNIX directory tree looking for pairs of
4 distinct files with identical content. When it finds such files, it
5 deletes one file to reclaim its disk space and then recreates its path
6 name as a link to the other copy.
7 My first version of this program circa 1993 worked by computing MD5
8 hashes of every file, sorting the hashes and then looking for duplicates.
9 This worked, but it was unnecessarily slow. The comparison function I use
10 now stops comparing two files as soon as it determines their lengths are
11 different, which is a win when you have many large files with unique lengths.
13 * This program reads from standard input a list of files (such
14 * as that generated by "find . -print") and discovers which files are
15 * identical. Dupmerge unlinks one file of each identical pair and
16 * recreates its path name as a link to the other.
18 * Non-plain files in the input (directories, pipes, devices, etc)
19 * are ignored. Identical files must be on the same file system to be linked.
21 * Dupmerge prefers to keep the older of two identical files, as the older
22 * timestamp is more likely to be the correct one given that many
23 * copy utilities (e.g., 'cp') do not by default preserve modification
26 * Dupmerge works by quicksorting a list of path names, with the
27 * actual unlinking and relinking steps performed as side effects of
28 * the comparison function. The results of the sort are discarded.
30 * Command line arguments:
31 * -n Suppress the actual unlinking and relinking
32 * -q Operate in quiet mode (otherwise, relinks are displayed on stdout)
34 * 12 February 1998 Phil Karn, karn@ka9q.ampr.org
35 * Copyright Phil Karn. May be used under the terms of the GNU Public License.
37 --------------------------------------------------------------------------------
39 Added swap macro, invers mode, no linking of zero length files or files with no disk usage (sparse files),
40 void casts for semantic checkers, version number, sorting of equal size files due to dev and ino and name,
41 switched from newline terminated file names to zero terminated file names ...
43 Tested under SuSE 9.2 and Debian (both Kernel 2.6), tested with 4 and 7 Gigabyte files, coreutils sources,
44 md5 collisions, kernel sources, my archive with CD and DVD collections in an encrypted partition (200 GB,
45 1/2 Million files, 5 hours run time, about 30 GB reclaimed by 100,000 hard links).
46 Changed to reading/writing/comparing 8 byte blocks (instead of 1 byte blocks): 2 times faster now.
47 Changed inverse mode from O(n*ln(n)) to O(n).
48 Added sparse and desparse mode and combo flag.
49 Dr. Rolf Freitag a.k.a. Dr. No, 2004.
51 For files bigger than 2 GiB, > 2^31 Byte, you should use the compiler option
52 "-D_FILE_OFFSET_BITS=64".
54 Example for compilation (and striping + prelinking) on i686 ("PC" with Pentium III or higher or Athlon)
55 with shell function c (in ~/.bashrc):
57 gcc -Wall -I. -O3 -D_GNU_SOURCE -D__SMP__ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT \
58 -pthread -mcpu=i686 -fexpensive-optimizations -DCPU=686 -ffast-math -m486 -lm -lz -o $1 $1.c && strip $1 \
59 && prelink -fmRv ./$1 && chmod +t $1
62 Example usage of this shell function (in ~/.bashrc): c dupmerge
64 But a simple "gcc -D_GNU_SOURCE -O2 -o dupmerge dupmerge.c" also does the job.
66 The inverse mode expands the hard links to simple files if used without the option -n.
67 With option -n hard links are only reported.
68 The reverse mode can be used e. g. for expanding backups, which have been shrinked with
69 with the default mode.
71 Caution: If there is not enough space for expanding the hard links (in inverse mode) hard links
72 will be lost! It's also a bad idea to kill dupmerge with signal 9 before it finishes because
73 files can be lost or damaged or new temporary and maybe big files can remain.
76 find ./ -type f -print0 | dupmerge 2>&1 | tee /tmp/user0/dupmerge.out
79 - progress meter with approx. percent
80 - change from comparing signed to unsigned numbers for clearer ordering
81 - inverse mode optimization: expand with the same file attributes (copy most of struct stat)
82 - option m <number> for minimum block size; (not file size because of sparse files).
83 smaller files will not be hard linked
84 - ANSI-C (C99) comppatible error handling of the comparison functions
85 - for option n (nodo): normal mode: show the number of blocks which can be saved,
86 - refactorisation: separate help function and bit fiels +macros instead of legacy variables
87 - error message +advice when wrong parameters are used
88 - option f for using a soft link as second try when a hard link could not be made and the reverse for
89 inverse mode (inverse mode: expand hard and soft links)
90 - option z for not excluding zero length files from linking/expanding/erasing
91 - more user friendly output with e. g. freed block + saved space in kB instead of kiB or blocks (see IEC 60027-2),
92 extra explanation in output when -n is used,
93 - tests with ntfs partitions for MS-Win-users
94 - correkt block counting (sparse files are yet not counted correct)
96 - autoconf/automake + rpm and deb package
97 - An option for scheduling in parallel with 2..N pthreads e. g. for SMP boxes with ramdisk.
98 - other features like a gziped cvs file with the file attributes from lstat of the files which were
100 - an extra option for creating the tmpfiles on a specified partition for speedup with a ramdisk
101 - an option for calling dupmerge with find to shorten the command line the user has to do
102 - check if the user has enough permissions to replace files (in not nodo mode)
103 - self-calling, "user-friendly", mode with extra option which does calls dupmerge with find via the system call
104 - test with files which can be deleted only via the inode (http://www.cyberciti.biz/tips/delete-remove-files-with-inode-number.html)
105 - An additional option for doing deletion irreversible via shred or wipe or ... instead of unlink or rm.
106 - Checking for and reporting of filesystems where there are no links possible; e. g. no hard links with FAT.
109 Version 1.73, 2008-03-01
115 #include <string.h> // strndup,
116 #include <sys/types.h>
117 #include <sys/stat.h>
118 #include <unistd.h> // Nessisary for unlink, gtopt, link, fcmp.
119 #include <iso646.h> // for and, bitand, or, bitor ...
120 #include <stdbool.h> // _Bool
121 #include <signal.h> // signals
122 #include <limits.h> // UCHAR_MAX
123 #include <sys/mman.h> // mlockall
124 #include <sys/types.h> // vfork
125 #include <sys/wait.h> // vfork
126 #include <time.h> // localtime
129 #define mc_MIN(a, b) ((a) < (b) ? (a) : (b))
130 #define mc_MAX(a, b) ((a) > (b) ? (a) : (b))
133 // Swap two items a, b of size i_size, use (static) swap variable c for performance.
134 #define mc_SWAP_ITEMS(a, b, c, i_size) { memcpy (&(c), &(a), (i_size));\
135 memcpy (&(a), &(b), (i_size));\
136 memcpy (&(b), &(c), (i_size)); }
139 #define mc_ADVICE if (argc > 1 && 0 == strncmp (argv[1], "-advice", 8) ) \
141 (void)printf ("Don't Panic!\n"); \
143 } /* 42: The meaning of life, the universe, and everything ;-) */
146 // For registering the signal handler void sig_handler (int sig). This macro saves 32 bytes ;-)
147 #define mc_register_sig_handler \
150 for(i=0; i<=UCHAR_MAX; i++)\
151 signal (i, sig_handler);\
155 // union for copy and compare
163 int cmp (const void *, const void *); // function for qsort
164 int cmp0 (const void *, const void *); // function for qsort
165 int fcmp (const void *, const void *); // function for qsort
166 int fcmp0 (const void *, const void *); // function for qsort
167 int fcmp1 (const void *);
168 int fcmp2 (const void *, const _Bool);
169 _Bool different_files (const void *, const void *);
171 const double d_version = 1.73; // very simple version number of %1.2f format
172 struct stat s_swap; // swap variable
173 int Nodo = 0; // for nodo mode (only report)
174 int Quiet = 0; // for quiet mode
175 int i_soft = 0; // for using always soft links instead of the default hard links
176 _Bool b_inv = false; // invers mode indicator
177 _Bool b_sparse = false; // flag for sparseing
178 _Bool b_comb = false; // combo flag for normal + sparse mode
179 int i_minlinks = 1; // minimum of found hard links
180 int i_maxlinks = 1; // maximum of found hard links
181 long int li_minblocks = LONG_MAX; // minimum of saved blocks by sparse copying
182 long int li_maxblocks = 0; // maximum of saved blocks by sparse copying
183 int Files_deleted = 0;
184 int Blocks_reclaimed = 0;
185 int i_files_expanded = 0;
186 int i_blocks_declaimed = 0;
187 volatile int i_exit_flag = 0; // flag for exiting soon (not immediately) and controlled after a signal to terminate
188 char **a_names = NULL; // array of file names
189 char *a_flags = NULL; // Array with the flags of the files in a_names. 1 = marked for deletion because another file the the same content was found,
190 int nfiles = 0; // number of files
194 sig_handler (const int sig) // signal handler
196 if ((SIGINT == sig) or (SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig) or (SIGTERM == sig))
198 i_exit_flag = 1; // set the flag for a graceful exit
199 // if ((SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig))
201 // (void) printf ("\a\a\a\a\a\a\a Signal %d, program exiting... \r\n\a\a\a", sig);
209 // initialisation for main
214 Blocks_reclaimed = 0;
215 i_files_expanded = 0;
216 i_blocks_declaimed = 0;
219 li_minblocks = LONG_MAX;
224 // delete the files (of name a_name[0 ... i_files] which are marked for deletion
226 delete_marked (const int i_files)
231 for (i = 0; i < i_files; i++)
235 if (a_flags[i] bitand 1) // delete
237 if (-1 == lstat (a_names[i], &sa)) // or ! S_ISREG (sa.st_mode))
239 fprintf (stderr, "lstat(%s) failed\n", a_names[i]);
245 printf ("deleting %s, freeing %lld blocks\n", a_names[i], ((1 == sa.st_nlink) ? sa.st_blocks : 0));
249 if (1 == sa.st_nlink) // file a is no (hard) link
251 Blocks_reclaimed += sa.st_blocks;
260 main (const int argc, const char *const argv[])
262 char **names, buf[BUFSIZ]; //, *cp; // BUFSIZE is the default buffer size in stdio.h, usually 8192
263 int i, j, i1, i_mode = 0, i_ferror = 0, i_ret = 0; // integers, mode: 0: common, 1: deletion, i_ferror: for ferror
264 FILE *tmp; // file pointer for tmpfile
265 const char c_zero = '\0'; // for terminating strings
266 time_t now; // for actual time
267 struct tm *ptr_tm = (time (&now), localtime (&now)); // set actual time, convert to localtime
270 // read parameters for quiet mode and suppressing the actual unlinking and relinking. Rolf Freitag
271 while ((i = getopt (argc, (char *const *) argv, "cdhinqsSV")) != EOF)
279 i_mode = 1; // deletion mode
288 fprintf (stdout, "This program can reclaim disk space by linking identical files together.\n");
289 fprintf (stdout, "It can also expand all hard links and reads the list of files from standard input.\n");
290 fprintf (stdout, "Example usage: find ./ -type f -print0 | dupmerge 2>&1 | tee ../dupmerge_log.txt\n");
291 fprintf (stdout, "Options:\n-h \t Show this help and exit.\n-V \t Show version number and exit.\n");
292 fprintf (stdout, "-d \t delete multiple files and hard links. Default: Preserve the alphabetically first file name.\n");
293 //fprintf (stdout, " \t Switch for ascessing/decessing order in deletion mode.\n");
294 fprintf (stdout, "-q \t Quiet mode.\n-n \t Nodo mode / read-only mode.\n");
295 fprintf (stdout, "-i \t Inverse switch: Expand all hard links in normal mode, replace files by their desparsed version if it is bigger.\n");
296 fprintf (stdout, "-s \t Flag for soft linking (default is hard linking). This option is beta because for linking of all ");
297 fprintf (stdout, "equal files more than one run of dupmerge is necessary and the inverse (expanding of soft links) is untested.\n");
298 fprintf (stdout, "-S \t Flag for Sparse mode: Replace files by their sparse version if it is smaller.\n");
299 fprintf (stdout, "-c \t Combo mode: Default mode +sparse mode. With -i it means inverse mode with unlinking and desparsing.\n");
307 fprintf (stdout, "dupmerge version %1.2f\n", d_version);
323 (void) printf ("%s started at %d-%s%d-%s%d %s%d:%s%d:%s%d\n", argv[0],
324 ptr_tm->tm_year + 1900, ptr_tm->tm_mon + 1 > 9 ? "" : "0", ptr_tm->tm_mon + 1,
325 ptr_tm->tm_mday > 9 ? "" : "0", ptr_tm->tm_mday,
326 ptr_tm->tm_hour > 9 ? "" : "0", ptr_tm->tm_hour,
327 ptr_tm->tm_min > 9 ? "" : "0", ptr_tm->tm_min, ptr_tm->tm_sec > 9 ? "" : "0", ptr_tm->tm_sec);
331 mlockall (MCL_CURRENT bitor MCL_FUTURE); // be cached
333 // Read list of file names into tmp file and count
337 (void) fprintf (stderr, "could not open temporary file, exiting\n");
343 (void) printf ("tmpfile (pointer %p) created, processing ...\n", tmp);
346 while (!feof (stdin)) // while not end of input
348 buf[0] = '\0'; // set strnlen to 0 so that strnlen (buf, BUFSIZ) is 0 at EOF
349 for (i = 0; i < BUFSIZ; i++) // read till the sequence operator \0 is found
351 fread (&buf[i], 1, 1, stdin);
352 i_ferror = ferror (stdin);
355 (void) fprintf (stderr, "stdin: error %d.\n", i_ferror);
358 if ('\0' == buf[i]) // termination found
361 if ('\0' not_eq buf[sizeof (buf) - 1]) // no termination at buffer end
362 buf[sizeof (buf) - 1] = '\0';
363 if (strnlen (buf, BUFSIZ))
364 nfiles++; // new file: increment file counter
365 if ((EOF == fputs (buf, tmp)) or (not fwrite (&c_zero, 1, 1, tmp))) // store new file name in tmp file
367 (void) fprintf (stderr, "could not write to temporary file, exiting\n");
371 // Now that we know how many there are, printf, allocate space and re-read.
374 (void) printf ("Input: %d files, processing ...\n", nfiles);
377 if (0 == nfiles) // no input, nothing to do
380 names = (char **) calloc (nfiles, sizeof (char *));
381 a_names = (char **) calloc (nfiles, sizeof (char *));
382 a_flags = (char *) calloc (nfiles, sizeof (char));
383 if ((NULL == names) or (NULL == a_flags) or (NULL == a_names))
385 (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
388 for (i = 0; i < nfiles; i++) // get the file names from the tmp file
390 for (j = 0; j < BUFSIZ; j++) // read till the sequence operator \0 is found
392 fread (&buf[j], 1, 1, tmp);
393 i_ferror = ferror (stdin);
396 (void) fprintf (stderr, "tmpfile: error %d.\n", i_ferror);
399 if ('\0' == buf[j]) // termination found
402 if ((NULL == (names[i] = strndup (buf, BUFSIZ))) or (NULL == (a_names[i] = strndup (buf, BUFSIZ))))
403 { // The command "gcc -o dupmerge dupmerge.c" prduces the Warning "dupmerge.c:191: warning: assignment
404 // makes pointer from integer without a cast" but this is a warning bug; in the if statement above there is no int!
405 // To get rid of that warning you can cast strndup to (char *) or equivalent (typeof(names[i])) or use -O3. [gcc version 3.3.4]
406 // For later versions of gcc you should use the command line option -D_GNU_SOURCE, but that's not necessary under cygwin.
407 (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
412 mc_register_sig_handler; // register signal handler before critical sections
413 qsort (a_names, nfiles, sizeof (char *), cmp0); // sort only the (pointers to the) file names in a_names
416 case 0: // no deletion
417 switch (b_comb and b_inv) // For desparse mode after nomal inverted mode. Otherwise a second run would be nessisary.
419 case 0: // not combo mode and inverse switch
420 if (b_sparse) // sparse/desparse files first because sparsing/desparsing expands hard links (changes inode)
422 // no sorted list of files at this point
424 if (b_comb and Nodo) // combo mode and read-only (Nodo): do nodo sparsing/desparsing during sorting with output here
429 Nodo = 1; // for only sorting the names
430 qsort (names, nfiles, sizeof (char *), fcmp); // sort only the (pointers to the) file names
431 Nodo = i; // restore Nodo
432 Quiet = i1; // restore Quiet
434 main_init1 (); // start with new init
435 // sparse or desparse
436 for (i = 0; i < nfiles; i++)
438 // only different files, call fcmp2/3 only for the last file in a block of equal hard links
439 if ((nfiles - 1 == i) or different_files (names + i, names + i + 1))
440 (void) fcmp2 (names + i, not b_inv);
446 ("Files%s %ssparsed: %d of %d, Disk blocks%s %sclaimed: %d\n",
447 (Nodo ? " which can be" : ""), (b_inv ? "de" : ""),
448 (b_inv ? i_files_expanded : Files_deleted), nfiles,
449 (Nodo ? " can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
452 ("Minimum of disk blocks which could be %s by %s copying: %ld, Maximum: %ld.\n",
453 (b_inv ? "declaimed" : "reclaimed"), (b_inv ? "desparse" : "sparse"), li_minblocks, li_maxblocks);
456 if ((b_comb and Nodo) or (b_inv)) // all work done
458 } // if (b_sparse), no break
460 main_init1 (); // start with new init
461 if ((b_sparse and b_comb) or (not b_sparse)) // sparse mode +combo flag or no sparse mode: normal mode, maybe with inversion
462 { // Without the combo flag and with sparse mode there is no normal (maybe inverted) mode!
465 qsort (names, nfiles, sizeof (char *), fcmp);
468 (void) printf ("Scanning for more dups ...\n");
471 for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
472 (void) fcmp (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
478 (void) printf ("Scanning for hard links ...\n");
481 for (i = 0; i < nfiles; i++)
482 (void) fcmp1 (names + i);
488 ("Files%s %s: %d of %d, Disk blocks%s %sclaimed: %d\n",
489 (Nodo ? " which can be" : ""),
490 (b_inv ? "expanded" : "linked"),
491 (b_inv ? i_files_expanded : Files_deleted), nfiles,
492 (Nodo ? " which can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
493 (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
496 } // if ((b_sparse and b_comb) or (not b_sparse))
497 if (b_comb and b_inv) // do desparsing in inverse mode
501 (void) printf ("Unexpected switch error ...\n");
507 // no sorted list of files at this point
508 qsort (names, nfiles, sizeof (char *), fcmp0); // sort only the file names
510 for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
511 (void) fcmp0 (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
512 delete_marked (nfiles);
517 ("Duplicate files (%s deleted): %d of %d, Disk blocks%s reclaimed: %d\n",
518 (Nodo ? "which can be" : "which have been"), (b_inv ? i_files_expanded : Files_deleted), nfiles, (Nodo ? " which can be" : ""), Blocks_reclaimed);
519 (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
524 (void) printf ("Unexpected switch error ...\n");
532 // For alphabetic ordering/searching.
534 cmp (const void *p0, const void *p1)
537 char **p2 = (char **) p0;
538 char **p3 = (char **) p1;
539 i = strncmp (*p2, *p3, BUFSIZ); // compare the two strings
544 // For alphabetic ordering/searching. With global inversion flag for inverse mode.
546 cmp0 (const void *p0, const void *p1)
549 char **p2 = (char **) p0;
550 char **p3 = (char **) p1;
551 i = strncmp (*p2, *p3, BUFSIZ); // compare the two strings
558 // This is the comparison function called by qsort, where the real work
559 // is done as a side effect. Due to ANSI-C the current version is not
560 // completely correct because if the same objects are passed more than once to the
561 // comparison function the results must be consistent with another; see 7.20.5 in C99.
562 // If errors like lstat failed do occur, this is not fullfilled yet. With fcmp1 it's the same.
563 // sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
565 fcmp (const void *a, const void *b)
569 int i1, i2, i_nlinka, i_nlinkb, i_fa, i_fb;
571 const char *filea, *fileb;
572 union u_tag u_buffer1 = {
577 void *buffer1 = (void *) &u_buffer1;
578 void *buffer2 = (void *) &u_buffer2;
579 _Bool b_swap = false;
583 // Nonexistent or non-plain files are less than any other file
586 filea = *(const char **) a;
587 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
589 fprintf (stderr, "lstat(%s) failed\n", filea);
595 fileb = *(const char **) b;
596 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
598 fprintf (stderr, "lstat(%s) failed\n", fileb);
602 i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
603 i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
604 // Smaller files are "less"
605 if (sa.st_size < sb.st_size)
607 if (sa.st_size > sb.st_size)
609 // We now know both files exist, are plain files, are the same size
610 // if both files are hard linked or zero length: sort by name
611 if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) or (0 == sa.st_size))
612 return (strncmp (filea, fileb, BUFSIZ));
613 // We now know both files exist, are plain files, are the same size > 0,
614 // and are not already linked, so compare them alphabetical, if they are on the same device
615 if (sa.st_dev != sb.st_dev)
616 return ((sa.st_dev < sb.st_dev) ? -1 : 1);
617 if (NULL == (fa = fopen (filea, "r")))
618 return (-1); // Unreadable files are "less than"
619 if (NULL == (fb = fopen (fileb, "r")))
624 // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
625 // On big endian machines it's alphabetic ordering, on little andian machines it's
626 // alphabetic ordering with 64 bit 'characters'.
627 // Du to C99, 7.19.8.1 read must be done with chars.
628 while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
630 // Mask the unsused 1..7 bytes, because the standard says nothing about them.
631 memset (&(u_buffer1.c[i1]), 0, 8 - i1); // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
632 memset (&(u_buffer2.c[i2]), 0, 8 - i2);
633 // check for file errors
636 if (i_fa or i_fb) // check for file errors
640 (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
641 rval = -1; // error file: smaller
646 (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
651 if (u_buffer1.i64 != u_buffer2.i64) // compare
653 rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
659 if (rval) // unequal files of same size
662 if ((sa.st_blocks) or (sb.st_blocks)) // avoid linking sparse files with no blocks allocated
664 // The two files have same content, size > 0, blocks allocated > 0 and are on the same device, so link them.
665 // We prefer to keep the one with less blocks, or if they're the
666 // same the older one, or if they're the same, the one with more (hard) links.
667 if ((sb.st_blocks < sa.st_blocks) or (sb.st_mtime > sa.st_mtime) or (sb.st_nlink < sa.st_nlink))
669 mc_SWAP_ITEMS (sa, sb, s_swap, sizeof (struct stat)); // swap items to keep original sb
670 mc_SWAP_ITEMS (filea, fileb, s_swap, sizeof (char *)); // swap file name pointers
673 // before linking: store number of hard links of each file
674 i_nlinka = sa.st_nlink;
675 i_nlinkb = sb.st_nlink;
676 if (1 == sa.st_nlink) // file a is no (hard) link
679 Blocks_reclaimed += sa.st_blocks;
681 if (!Nodo and (unlink (filea)))
683 (void) fprintf (stderr, "unlink(%s) failed\n", filea);
689 if (!Nodo and (-1 == link (fileb, filea)))
691 (void) fprintf (stderr, "link(%s,%s) failed\n", fileb, filea);
698 if (!Nodo and (-1 == symlink (fileb, filea)))
700 (void) fprintf (stderr, "symlink(%s,%s) failed\n", filea, fileb);
707 (void) printf ("ln %s %s %s: %d, %d -> %d, freed +%llu blocks\n", i_soft ? "-s" : "", fileb, filea, i_nlinkb, i_nlinka, i_nlinka + i_nlinkb,
712 return ((b_swap) ? strncmp (fileb, filea, BUFSIZ) : strncmp (filea, fileb, BUFSIZ)); //
716 // return the index of file with name a_name
718 nindex (const char *a_name)
720 void *p_v = bsearch (&a_name, a_names, nfiles, sizeof (char *), cmp0);
721 int i_ret = (int) (((((void *) p_v) - ((void *) a_names))) / (sizeof (char *)));
724 printf ("Error: bsearch(%s, %p, %d, %d, %p) returned NULL.\n", a_name, a_names, nfiles, sizeof (char *), cmp);
725 return (-1); // error
731 // This is the comparison function called by qsort, for marking all duplicate files with that flag
732 // Due to ANSI-C the current version is not
733 // completely correct because if the same objects are passed more than one to the
734 // comparison function the results must be consistent with another, 7.20.5 C99.
735 // If errors like lstat failed do occur, this is not fullfilled yet.
736 // sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
738 fcmp0 (const void *a, const void *b)
743 int rval = 0, ja = 0, jb = 0, i_fa, i_fb;
744 const char *filea, *fileb;
745 union u_tag u_buffer1 = {
750 void *buffer1 = (void *) &u_buffer1;
751 void *buffer2 = (void *) &u_buffer2;
755 // Nonexistent or non-plain files are less than any other file
758 filea = *(const char **) a;
759 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
761 fprintf (stderr, "lstat(%s) failed\n", filea);
767 fileb = *(const char **) b;
768 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
770 fprintf (stderr, "lstat(%s) failed\n", fileb);
774 i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
775 i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
776 // Smaller files are "less"
777 if (sa.st_size < sb.st_size)
779 if (sa.st_size > sb.st_size)
781 // We now know both files exist, are plain files, are the same size
782 // if both files are hard linked: sort by name
783 if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)))
786 // set ja, jb to the index of a, b
789 if ((ja >= 0) and (a_flags[ja] bitand 1))
790 return (-1); // marked file first
791 if ((jb >= 0) and (a_flags[jb] bitand 1))
792 return (1); // marked file first
793 // now both files are not marked
794 if ((ja >= 0) and (jb >= 0))
796 if ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ) <= 0) // alphabetic ordering
801 (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", filea, fileb);
804 return (-1); // marked file first
811 (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", fileb, filea);
814 return (1); // marked file first
817 return ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ));
819 // We now know both files exist, are plain files, are the same size > 0,
820 // and are not already linked, so compare them alphabetical
821 if (NULL == (fa = fopen (filea, "r")))
822 return (-1); // Unreadable files are "less than"
823 if (NULL == (fb = fopen (fileb, "r")))
828 // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
829 // On big endian machines it's alphabetic ordering, on little andian machines it's
830 // alphabetic ordering with 64 bit 'characters'.
831 while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
833 // Mask the unsused 1..7 bytes, because the standard says nothing about them.
834 memset (&(u_buffer1.c[i1]), 0, 8 - i1); // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
835 memset (&(u_buffer2.c[i2]), 0, 8 - i2);
836 // check for file errors
839 if (i_fa or i_fb) // check for file errors
843 (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
844 rval = -1; // error file: smaller
849 (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
854 if (u_buffer1.i64 != u_buffer2.i64) // compare
856 rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
862 if (rval) // unequal files of same size
869 // This is the hard link expansion function for the inverse mode. Needs only one file name as argument.
871 fcmp1 (const void *a)
875 int i1, i, i_b, i_fa, i_fb, i_ret = 0;
876 char a_cd[BUFSIZ] = { '\0' }; // for directory
877 char a_cf[BUFSIZ] = { '\0' }; // for tempfile
878 const char *filea, *fileb = a_cf; // file names, fileb after mkstemp
879 union u_tag u_buffer1 = {
882 void *buffer1 = (void *) &u_buffer1;
886 // Nonexistent or non-plain files are less than any other file
889 filea = *(const char **) a;
890 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
892 fprintf (stderr, "lstat(%s) failed\n", filea);
896 i_minlinks = mc_MIN (i_minlinks, sa.st_nlink);
897 i_maxlinks = mc_MAX (i_maxlinks, sa.st_nlink);
898 // check for hard links
899 if (sa.st_nlink > 1) // hard link found
902 i_blocks_declaimed += sa.st_blocks;
903 if (!Nodo) // expand the hard link
905 strncpy (a_cd, filea, BUFSIZ);
906 // extract the directory from the file name: delete from end to last "/"
907 for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
914 // now the actual directory is in a_c, create temporary file there
915 strncpy (a_cf, a_cd, BUFSIZ);
916 strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2); // template for mkstemp
917 i_b = mkstemp (a_cf);
920 (void) fprintf (stderr, "Could not create temporary file.\n");
921 perror ("file_create");
924 if ((NULL == (fa = fopen (filea, "r"))) or (NULL == (fb = fdopen (i_b, "w")))) // open file and tmpfile
926 (void) fprintf (stderr, "Expanding the hard link(%s) failed.\n", filea);
931 while ((i1 = fread (buffer1, 1, 8, fa)) != 0) // copy fa to fb in 8 byte blocks
933 (void) fwrite (buffer1, 1, i1, fb);
937 if (i_fa or i_fb) // check for file errors
940 (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
942 (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
950 (void) fprintf (stderr, "unlink(%s) failed, tempfile %s remains\n", filea, fileb);
954 // rename to original name
955 if (rename (fileb, filea))
957 (void) fprintf (stderr, "rename(%s, %s) failed, tempfile %s remains\n", fileb, filea, fileb);
964 (void) printf ("expand %s: %d -> %d, unfreed %llu blocks\n", filea, sa.st_nlink, sa.st_nlink - 1, sa.st_blocks); // sa has the old cached value
967 } // if (sa.st_ino > 1)
972 // This is the (de)sparsing function.
973 // Copying sparse/desparse can reduce/increase st_blocks while st_size remains constant.
975 fcmp2 (const void *a, const _Bool b_S)
979 int i, i_b, status = 0, i_pid, i_ret = 0;
981 char a_cd[BUFSIZ] = { '\0' }; // for directory, temporary string
982 char a_cf[BUFSIZ] = { '\0' }; // for tempfile
983 char a_ca[BUFSIZ] = { '\0' }; // for tempfile
984 const char *filea = a_ca, *fileb = a_cf; // file names
988 // Nonexistent or non-plain files are less than any other file
991 strncpy (a_ca, *(const char **) a, BUFSIZ - 1);
992 // filea = *(const char **) a;
993 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
995 fprintf (stderr, "lstat(%s) failed\n", filea);
999 // get tmpname: mkstemp, unlink
1000 strncpy (a_cd, filea, BUFSIZ);
1001 // extract the directory from the file name: delete from end to last "/"
1002 for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
1009 // now the actual directory is in a_c, create temporary file there
1010 strncpy (a_cf, a_cd, BUFSIZ);
1011 strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2); // template for mkstemp
1012 i_b = mkstemp (a_cf);
1015 (void) fprintf (stderr, "Could not create temporary file.\n");
1019 // dummy open and close to close the tmpfile
1020 if (NULL == (fb = fdopen (i_b, "w"))) // open tmpfile
1022 (void) fprintf (stderr, "fdopen (%d) failed.\n", i_b);
1027 if (unlink (fileb)) // delete file because we do need only the name
1029 (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
1033 switch (i_pid = vfork ())
1035 case -1: // parent with fork error
1037 fprintf (stderr, "vfork failed.\n");
1040 memset (a_cd, '\0', sizeof (a_cd));
1041 strncpy (a_cd, (b_S ? "--sparse=always" : "--sparse=never"), BUFSIZ);
1042 i = execl ("/bin/cp", "cp", a_cd, filea, fileb, NULL);
1045 fprintf (stderr, "execlp(cp %s %s %s) failed, return value %d\n", a_cd, filea, fileb, i);
1050 default: // parent with no error: do proceed
1053 i = waitpid (i_pid, &status, WUNTRACED); // wait till child terminates
1056 fprintf (stderr, "copying failed; child with pid %d returned %d\n", i, status);
1060 // lstat and check copy
1061 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
1063 fprintf (stderr, "lstat(%s) failed\n", fileb);
1067 li = sa.st_blocks - sb.st_blocks;
1070 li_minblocks = mc_MIN (li_minblocks, li);
1071 li_maxblocks = mc_MAX (li_maxblocks, li);
1072 // if shrinked in sparse mode or expanded in desparse mode: unlink original and rename tmpfile to original name
1073 if ((b_S ? (sa.st_blocks > sb.st_blocks) : (sa.st_blocks < sb.st_blocks)))
1075 b_S ? Files_deleted++ : i_files_expanded++;
1076 b_S ? (Blocks_reclaimed += sa.st_blocks - sb.st_blocks) : (i_blocks_declaimed += sb.st_blocks - sa.st_blocks);
1079 (void) printf ("%s: %lld -> %lld blocks\n", filea, sa.st_blocks, sb.st_blocks);
1084 if (unlink (filea)) // delete original
1086 (void) fprintf (stderr, "unlink(%s) failed\n", filea);
1090 // rename to original name
1091 if (rename (fileb, filea))
1093 (void) fprintf (stderr, "rename(%s, %s) failed\n", fileb, filea);
1102 if (unlink (fileb)) // delete tmpfile
1104 (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
1112 // return true if the two files are not hard linkded
1114 different_files (const void *a, const void *b)
1117 const char *filea, *fileb;
1119 // Nonexistent or non-plain files are less than any other file
1122 filea = *(const char **) a;
1123 if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
1125 fprintf (stderr, "lstat(%s) failed\n", filea);
1131 fileb = *(const char **) b;
1132 if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
1134 fprintf (stderr, "lstat(%s) failed\n", fileb);
1138 if ((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) // same device and same inode?
1139 return (false); // yes: equal files
1141 } // different_files