--- /dev/null
+/* Dupmerge - Reclaim disk space by linking identical files together
+
+This is a utility that scans a UNIX directory tree looking for pairs of
+distinct files with identical content. When it finds such files, it
+deletes one file to reclaim its disk space and then recreates its path
+name as a link to the other copy.
+My first version of this program circa 1993 worked by computing MD5
+hashes of every file, sorting the hashes and then looking for duplicates.
+This worked, but it was unnecessarily slow. The comparison function I use
+now stops comparing two files as soon as it determines their lengths are
+different, which is a win when you have many large files with unique lengths.
+
+ * This program reads from standard input a list of files (such
+ * as that generated by "find . -print") and discovers which files are
+ * identical. Dupmerge unlinks one file of each identical pair and
+ * recreates its path name as a link to the other.
+ *
+ * Non-plain files in the input (directories, pipes, devices, etc)
+ * are ignored. Identical files must be on the same file system to be linked.
+ *
+ * Dupmerge prefers to keep the older of two identical files, as the older
+ * timestamp is more likely to be the correct one given that many
+ * copy utilities (e.g., 'cp') do not by default preserve modification
+ * times.
+ *
+ * Dupmerge works by quicksorting a list of path names, with the
+ * actual unlinking and relinking steps performed as side effects of
+ * the comparison function. The results of the sort are discarded.
+ *
+ * Command line arguments:
+ * -n Suppress the actual unlinking and relinking
+ * -q Operate in quiet mode (otherwise, relinks are displayed on stdout)
+ *
+ * 12 February 1998 Phil Karn, karn@ka9q.ampr.org
+ * Copyright Phil Karn. May be used under the terms of the GNU Public License.
+
+--------------------------------------------------------------------------------
+
+Added swap macro, invers mode, no linking of zero length files or files with no disk usage (sparse files),
+void casts for semantic checkers, version number, sorting of equal size files due to dev and ino and name,
+switched from newline terminated file names to zero terminated file names ...
+Switched to C99.
+Tested under SuSE 9.2 and Debian (both Kernel 2.6), tested with 4 and 7 Gigabyte files, coreutils sources,
+md5 collisions, kernel sources, my archive with CD and DVD collections in an encrypted partition (200 GB,
+1/2 Million files, 5 hours run time, about 30 GB reclaimed by 100,000 hard links).
+Changed to reading/writing/comparing 8 byte blocks (instead of 1 byte blocks): 2 times faster now.
+Changed inverse mode from O(n*ln(n)) to O(n).
+Added sparse and desparse mode and combo flag.
+Dr. Rolf Freitag a.k.a. Dr. No, 2004.
+
+For files bigger than 2 GiB, > 2^31 Byte, you should use the compiler option
+"-D_FILE_OFFSET_BITS=64".
+
+Example for compilation (and striping + prelinking) on i686 ("PC" with Pentium III or higher or Athlon)
+with shell function c (in ~/.bashrc):
+function c {
+ gcc -Wall -I. -O3 -D_GNU_SOURCE -D__SMP__ -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_REENTRANT \
+ -pthread -mcpu=i686 -fexpensive-optimizations -DCPU=686 -ffast-math -m486 -lm -lz -o $1 $1.c && strip $1 \
+ && prelink -fmRv ./$1 && chmod +t $1
+}
+
+Example usage of this shell function (in ~/.bashrc): c dupmerge
+
+But a simple "gcc -D_GNU_SOURCE -O2 -o dupmerge dupmerge.c" also does the job.
+
+The inverse mode expands the hard links to simple files if used without the option -n.
+With option -n hard links are only reported.
+The reverse mode can be used e. g. for expanding backups, which have been shrinked with
+with the default mode.
+
+Caution: If there is not enough space for expanding the hard links (in inverse mode) hard links
+will be lost! It's also a bad idea to kill dupmerge with signal 9 before it finishes because
+files can be lost or damaged or new temporary and maybe big files can remain.
+
+example for usage:
+find ./ -type f -print0 | dupmerge 2>&1 | tee /tmp/user0/dupmerge.out
+
+Todo:
+ - progress meter with approx. percent
+ - change from comparing signed to unsigned numbers for clearer ordering
+ - inverse mode optimization: expand with the same file attributes (copy most of struct stat)
+ - option m <number> for minimum block size; (not file size because of sparse files).
+ smaller files will not be hard linked
+ - ANSI-C (C99) comppatible error handling of the comparison functions
+ - for option n (nodo): normal mode: show the number of blocks which can be saved,
+ - refactorisation: separate help function and bit fiels +macros instead of legacy variables
+ - error message +advice when wrong parameters are used
+ - option f for using a soft link as second try when a hard link could not be made and the reverse for
+ inverse mode (inverse mode: expand hard and soft links)
+ - option z for not excluding zero length files from linking/expanding/erasing
+ - more user friendly output with e. g. freed block + saved space in kB instead of kiB or blocks (see IEC 60027-2),
+ extra explanation in output when -n is used,
+ - tests with ntfs partitions for MS-Win-users
+ - correkt block counting (sparse files are yet not counted correct)
+ - man page, info page
+ - autoconf/automake + rpm and deb package
+ - An option for scheduling in parallel with 2..N pthreads e. g. for SMP boxes with ramdisk.
+ - other features like a gziped cvs file with the file attributes from lstat of the files which were
+ linked
+ - an extra option for creating the tmpfiles on a specified partition for speedup with a ramdisk
+ - an option for calling dupmerge with find to shorten the command line the user has to do
+ - check if the user has enough permissions to replace files (in not nodo mode)
+ - self-calling, "user-friendly", mode with extra option which does calls dupmerge with find via the system call
+ - test with files which can be deleted only via the inode (http://www.cyberciti.biz/tips/delete-remove-files-with-inode-number.html)
+ - An additional option for doing deletion irreversible via shred or wipe or ... instead of unlink or rm.
+ - Checking for and reporting of filesystems where there are no links possible; e. g. no hard links with FAT.
+ - GUI
+
+ Version 1.73, 2008-03-01
+
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> // strndup,
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h> // Nessisary for unlink, gtopt, link, fcmp.
+#include <iso646.h> // for and, bitand, or, bitor ...
+#include <stdbool.h> // _Bool
+#include <signal.h> // signals
+#include <limits.h> // UCHAR_MAX
+#include <sys/mman.h> // mlockall
+#include <sys/types.h> // vfork
+#include <sys/wait.h> // vfork
+#include <time.h> // localtime
+
+
+#define mc_MIN(a, b) ((a) < (b) ? (a) : (b))
+#define mc_MAX(a, b) ((a) > (b) ? (a) : (b))
+
+
+// Swap two items a, b of size i_size, use (static) swap variable c for performance.
+#define mc_SWAP_ITEMS(a, b, c, i_size) { memcpy (&(c), &(a), (i_size));\
+ memcpy (&(a), &(b), (i_size));\
+ memcpy (&(b), &(c), (i_size)); }
+
+// easter egg
+#define mc_ADVICE if (argc > 1 && 0 == strncmp (argv[1], "-advice", 8) ) \
+{ \
+ (void)printf ("Don't Panic!\n"); \
+ exit (42); \
+} /* 42: The meaning of life, the universe, and everything ;-) */
+
+
+// For registering the signal handler void sig_handler (int sig). This macro saves 32 bytes ;-)
+#define mc_register_sig_handler \
+{\
+ int i;\
+ for(i=0; i<=UCHAR_MAX; i++)\
+ signal (i, sig_handler);\
+}
+
+
+// union for copy and compare
+union u_tag
+{
+ int64_t i64;
+ char c[8];
+};
+
+
+int cmp (const void *, const void *); // function for qsort
+int cmp0 (const void *, const void *); // function for qsort
+int fcmp (const void *, const void *); // function for qsort
+int fcmp0 (const void *, const void *); // function for qsort
+int fcmp1 (const void *);
+int fcmp2 (const void *, const _Bool);
+_Bool different_files (const void *, const void *);
+
+const double d_version = 1.73; // very simple version number of %1.2f format
+struct stat s_swap; // swap variable
+int Nodo = 0; // for nodo mode (only report)
+int Quiet = 0; // for quiet mode
+int i_soft = 0; // for using always soft links instead of the default hard links
+_Bool b_inv = false; // invers mode indicator
+_Bool b_sparse = false; // flag for sparseing
+_Bool b_comb = false; // combo flag for normal + sparse mode
+int i_minlinks = 1; // minimum of found hard links
+int i_maxlinks = 1; // maximum of found hard links
+long int li_minblocks = LONG_MAX; // minimum of saved blocks by sparse copying
+long int li_maxblocks = 0; // maximum of saved blocks by sparse copying
+int Files_deleted = 0;
+int Blocks_reclaimed = 0;
+int i_files_expanded = 0;
+int i_blocks_declaimed = 0;
+volatile int i_exit_flag = 0; // flag for exiting soon (not immediately) and controlled after a signal to terminate
+char **a_names = NULL; // array of file names
+char *a_flags = NULL; // Array with the flags of the files in a_names. 1 = marked for deletion because another file the the same content was found,
+int nfiles = 0; // number of files
+
+
+void
+sig_handler (const int sig) // signal handler
+{
+ if ((SIGINT == sig) or (SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig) or (SIGTERM == sig))
+ {
+ i_exit_flag = 1; // set the flag for a graceful exit
+// if ((SIGILL == sig) or (SIGKILL == sig) or (SIGSEGV == sig))
+// {
+// (void) printf ("\a\a\a\a\a\a\a Signal %d, program exiting... \r\n\a\a\a", sig);
+// exit (0);
+// }
+ }
+ return;
+}
+
+
+// initialisation for main
+inline void
+main_init1 (void)
+{
+ Files_deleted = 0;
+ Blocks_reclaimed = 0;
+ i_files_expanded = 0;
+ i_blocks_declaimed = 0;
+ i_minlinks = 1;
+ i_maxlinks = 1;
+ li_minblocks = LONG_MAX;
+ li_maxblocks = 0;
+}
+
+
+// delete the files (of name a_name[0 ... i_files] which are marked for deletion
+void
+delete_marked (const int i_files)
+{
+ struct stat sa;
+ int i;
+
+ for (i = 0; i < i_files; i++)
+ {
+ if (i_exit_flag)
+ exit (0);
+ if (a_flags[i] bitand 1) // delete
+ {
+ if (-1 == lstat (a_names[i], &sa)) // or ! S_ISREG (sa.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", a_names[i]);
+ perror ("lstat");
+ return;
+ }
+ if (0 == Nodo)
+ {
+ printf ("deleting %s, freeing %lld blocks\n", a_names[i], ((1 == sa.st_nlink) ? sa.st_blocks : 0));
+ unlink (a_names[i]);
+ }
+ Files_deleted++;
+ if (1 == sa.st_nlink) // file a is no (hard) link
+ {
+ Blocks_reclaimed += sa.st_blocks;
+ }
+ }
+ }
+ return;
+}
+
+
+int
+main (const int argc, const char *const argv[])
+{
+ char **names, buf[BUFSIZ]; //, *cp; // BUFSIZE is the default buffer size in stdio.h, usually 8192
+ int i, j, i1, i_mode = 0, i_ferror = 0, i_ret = 0; // integers, mode: 0: common, 1: deletion, i_ferror: for ferror
+ FILE *tmp; // file pointer for tmpfile
+ const char c_zero = '\0'; // for terminating strings
+ time_t now; // for actual time
+ struct tm *ptr_tm = (time (&now), localtime (&now)); // set actual time, convert to localtime
+
+ mc_ADVICE;
+ // read parameters for quiet mode and suppressing the actual unlinking and relinking. Rolf Freitag
+ while ((i = getopt (argc, (char *const *) argv, "cdhinqsSV")) != EOF)
+ {
+ switch (i)
+ {
+ case 'c':
+ b_comb = 1;
+ break;
+ case 'd':
+ i_mode = 1; // deletion mode
+ break;
+ case 'n':
+ Nodo = 1;
+ break;
+ case 'q':
+ Quiet = 1;
+ break;
+ case 'h':
+ fprintf (stdout, "This program can reclaim disk space by linking identical files together.\n");
+ fprintf (stdout, "It can also expand all hard links and reads the list of files from standard input.\n");
+ fprintf (stdout, "Example usage: find ./ -type f -print0 | dupmerge 2>&1 | tee ../dupmerge_log.txt\n");
+ fprintf (stdout, "Options:\n-h \t Show this help and exit.\n-V \t Show version number and exit.\n");
+ fprintf (stdout, "-d \t delete multiple files and hard links. Default: Preserve the alphabetically first file name.\n");
+ //fprintf (stdout, " \t Switch for ascessing/decessing order in deletion mode.\n");
+ fprintf (stdout, "-q \t Quiet mode.\n-n \t Nodo mode / read-only mode.\n");
+ fprintf (stdout, "-i \t Inverse switch: Expand all hard links in normal mode, replace files by their desparsed version if it is bigger.\n");
+ fprintf (stdout, "-s \t Flag for soft linking (default is hard linking). This option is beta because for linking of all ");
+ fprintf (stdout, "equal files more than one run of dupmerge is necessary and the inverse (expanding of soft links) is untested.\n");
+ fprintf (stdout, "-S \t Flag for Sparse mode: Replace files by their sparse version if it is smaller.\n");
+ fprintf (stdout, "-c \t Combo mode: Default mode +sparse mode. With -i it means inverse mode with unlinking and desparsing.\n");
+ fflush (stdout);
+ exit (0);
+ break;
+ case 's':
+ i_soft = 1;
+ break;
+ case 'V':
+ fprintf (stdout, "dupmerge version %1.2f\n", d_version);
+ fflush (stdout);
+ exit (0);
+ break;
+ case 'i':
+ b_inv = true;
+ break;
+ case 'S':
+ b_sparse = true;
+ break;
+ default:
+ break;
+ }
+ }
+ if (not Quiet)
+ {
+ (void) printf ("%s started at %d-%s%d-%s%d %s%d:%s%d:%s%d\n", argv[0],
+ ptr_tm->tm_year + 1900, ptr_tm->tm_mon + 1 > 9 ? "" : "0", ptr_tm->tm_mon + 1,
+ ptr_tm->tm_mday > 9 ? "" : "0", ptr_tm->tm_mday,
+ ptr_tm->tm_hour > 9 ? "" : "0", ptr_tm->tm_hour,
+ ptr_tm->tm_min > 9 ? "" : "0", ptr_tm->tm_min, ptr_tm->tm_sec > 9 ? "" : "0", ptr_tm->tm_sec);
+ fflush (stdout);
+ }
+#ifndef __CYGWIN__
+ mlockall (MCL_CURRENT bitor MCL_FUTURE); // be cached
+#endif
+ // Read list of file names into tmp file and count
+ tmp = tmpfile ();
+ if (NULL == tmp)
+ {
+ (void) fprintf (stderr, "could not open temporary file, exiting\n");
+ exit (-1);
+ }
+ nfiles = 0;
+ if (not Quiet)
+ {
+ (void) printf ("tmpfile (pointer %p) created, processing ...\n", tmp);
+ fflush (stdout);
+ }
+ while (!feof (stdin)) // while not end of input
+ {
+ buf[0] = '\0'; // set strnlen to 0 so that strnlen (buf, BUFSIZ) is 0 at EOF
+ for (i = 0; i < BUFSIZ; i++) // read till the sequence operator \0 is found
+ {
+ fread (&buf[i], 1, 1, stdin);
+ i_ferror = ferror (stdin);
+ if (i_ferror)
+ {
+ (void) fprintf (stderr, "stdin: error %d.\n", i_ferror);
+ i_ret = -1;
+ }
+ if ('\0' == buf[i]) // termination found
+ break;
+ }
+ if ('\0' not_eq buf[sizeof (buf) - 1]) // no termination at buffer end
+ buf[sizeof (buf) - 1] = '\0';
+ if (strnlen (buf, BUFSIZ))
+ nfiles++; // new file: increment file counter
+ if ((EOF == fputs (buf, tmp)) or (not fwrite (&c_zero, 1, 1, tmp))) // store new file name in tmp file
+ {
+ (void) fprintf (stderr, "could not write to temporary file, exiting\n");
+ exit (-1);
+ }
+ }
+ // Now that we know how many there are, printf, allocate space and re-read.
+ if (not Quiet)
+ {
+ (void) printf ("Input: %d files, processing ...\n", nfiles);
+ fflush (stdout);
+ }
+ if (0 == nfiles) // no input, nothing to do
+ exit (0);
+ rewind (tmp);
+ names = (char **) calloc (nfiles, sizeof (char *));
+ a_names = (char **) calloc (nfiles, sizeof (char *));
+ a_flags = (char *) calloc (nfiles, sizeof (char));
+ if ((NULL == names) or (NULL == a_flags) or (NULL == a_names))
+ {
+ (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
+ exit (-1);
+ }
+ for (i = 0; i < nfiles; i++) // get the file names from the tmp file
+ {
+ for (j = 0; j < BUFSIZ; j++) // read till the sequence operator \0 is found
+ {
+ fread (&buf[j], 1, 1, tmp);
+ i_ferror = ferror (stdin);
+ if (i_ferror)
+ {
+ (void) fprintf (stderr, "tmpfile: error %d.\n", i_ferror);
+ i_ret = -1;
+ }
+ if ('\0' == buf[j]) // termination found
+ break;
+ }
+ if ((NULL == (names[i] = strndup (buf, BUFSIZ))) or (NULL == (a_names[i] = strndup (buf, BUFSIZ))))
+ { // The command "gcc -o dupmerge dupmerge.c" prduces the Warning "dupmerge.c:191: warning: assignment
+ // makes pointer from integer without a cast" but this is a warning bug; in the if statement above there is no int!
+ // To get rid of that warning you can cast strndup to (char *) or equivalent (typeof(names[i])) or use -O3. [gcc version 3.3.4]
+ // For later versions of gcc you should use the command line option -D_GNU_SOURCE, but that's not necessary under cygwin.
+ (void) fprintf (stderr, "%s: Out of memory\n", argv[0]);
+ exit (1);
+ }
+ }
+ (void) fclose (tmp);
+ mc_register_sig_handler; // register signal handler before critical sections
+ qsort (a_names, nfiles, sizeof (char *), cmp0); // sort only the (pointers to the) file names in a_names
+ switch (i_mode)
+ {
+ case 0: // no deletion
+ switch (b_comb and b_inv) // For desparse mode after nomal inverted mode. Otherwise a second run would be nessisary.
+ {
+ case 0: // not combo mode and inverse switch
+ if (b_sparse) // sparse/desparse files first because sparsing/desparsing expands hard links (changes inode)
+ {
+ // no sorted list of files at this point
+ i1 = Quiet;
+ if (b_comb and Nodo) // combo mode and read-only (Nodo): do nodo sparsing/desparsing during sorting with output here
+ Quiet = 0;
+ else
+ Quiet = 1;
+ i = Nodo;
+ Nodo = 1; // for only sorting the names
+ qsort (names, nfiles, sizeof (char *), fcmp); // sort only the (pointers to the) file names
+ Nodo = i; // restore Nodo
+ Quiet = i1; // restore Quiet
+ foo:
+ main_init1 (); // start with new init
+ // sparse or desparse
+ for (i = 0; i < nfiles; i++)
+ {
+ // only different files, call fcmp2/3 only for the last file in a block of equal hard links
+ if ((nfiles - 1 == i) or different_files (names + i, names + i + 1))
+ (void) fcmp2 (names + i, not b_inv);
+ }
+ if (not Quiet)
+ {
+ (void)
+ printf
+ ("Files%s %ssparsed: %d of %d, Disk blocks%s %sclaimed: %d\n",
+ (Nodo ? " which can be" : ""), (b_inv ? "de" : ""),
+ (b_inv ? i_files_expanded : Files_deleted), nfiles,
+ (Nodo ? " can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
+ (void)
+ printf
+ ("Minimum of disk blocks which could be %s by %s copying: %ld, Maximum: %ld.\n",
+ (b_inv ? "declaimed" : "reclaimed"), (b_inv ? "desparse" : "sparse"), li_minblocks, li_maxblocks);
+ fflush (stdout);
+ }
+ if ((b_comb and Nodo) or (b_inv)) // all work done
+ exit (0);
+ } // if (b_sparse), no break
+ case 1:
+ main_init1 (); // start with new init
+ if ((b_sparse and b_comb) or (not b_sparse)) // sparse mode +combo flag or no sparse mode: normal mode, maybe with inversion
+ { // Without the combo flag and with sparse mode there is no normal (maybe inverted) mode!
+ if (not b_inv)
+ {
+ qsort (names, nfiles, sizeof (char *), fcmp);
+ if (not Quiet)
+ {
+ (void) printf ("Scanning for more dups ...\n");
+ fflush (stdout);
+ }
+ for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
+ (void) fcmp (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
+ }
+ else
+ {
+ if (not Quiet)
+ {
+ (void) printf ("Scanning for hard links ...\n");
+ fflush (stdout);
+ }
+ for (i = 0; i < nfiles; i++)
+ (void) fcmp1 (names + i);
+ }
+ if (not Quiet)
+ {
+ (void)
+ printf
+ ("Files%s %s: %d of %d, Disk blocks%s %sclaimed: %d\n",
+ (Nodo ? " which can be" : ""),
+ (b_inv ? "expanded" : "linked"),
+ (b_inv ? i_files_expanded : Files_deleted), nfiles,
+ (Nodo ? " which can be" : ""), (b_inv ? "de" : "re"), (b_inv ? i_blocks_declaimed : Blocks_reclaimed));
+ (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
+ fflush (stdout);
+ }
+ } // if ((b_sparse and b_comb) or (not b_sparse))
+ if (b_comb and b_inv) // do desparsing in inverse mode
+ goto foo;
+ break;
+ default:
+ (void) printf ("Unexpected switch error ...\n");
+ exit (-1);
+ break;
+ } // switch
+ break;
+ case 1: // deletion
+ // no sorted list of files at this point
+ qsort (names, nfiles, sizeof (char *), fcmp0); // sort only the file names
+ // second run
+ for (i = 0; i <= nfiles - 2; i++) // Second run: nessisary for qsort versions which do use cached values, where fcmp is not always called,
+ (void) fcmp0 (names + i, names + i + 1); // e. g. with gcc qsort at coreutil sources.
+ delete_marked (nfiles);
+ if (not Quiet)
+ {
+ (void)
+ printf
+ ("Duplicate files (%s deleted): %d of %d, Disk blocks%s reclaimed: %d\n",
+ (Nodo ? "which can be" : "which have been"), (b_inv ? i_files_expanded : Files_deleted), nfiles, (Nodo ? " which can be" : ""), Blocks_reclaimed);
+ (void) printf ("Minimum of found hard links: %d, Maximum: %d.\n", i_minlinks, i_maxlinks);
+ fflush (stdout);
+ }
+ break;
+ default:
+ (void) printf ("Unexpected switch error ...\n");
+ exit (-1);
+ break;
+ } // switch
+ exit (i_ret);
+} // main
+
+
+// For alphabetic ordering/searching.
+int
+cmp (const void *p0, const void *p1)
+{
+ int i = 0;
+ char **p2 = (char **) p0;
+ char **p3 = (char **) p1;
+ i = strncmp (*p2, *p3, BUFSIZ); // compare the two strings
+ return (i);
+}
+
+
+// For alphabetic ordering/searching. With global inversion flag for inverse mode.
+int
+cmp0 (const void *p0, const void *p1)
+{
+ int i = 0;
+ char **p2 = (char **) p0;
+ char **p3 = (char **) p1;
+ i = strncmp (*p2, *p3, BUFSIZ); // compare the two strings
+ if (b_inv)
+ return (-i);
+ return (i);
+}
+
+
+// This is the comparison function called by qsort, where the real work
+// is done as a side effect. Due to ANSI-C the current version is not
+// completely correct because if the same objects are passed more than once to the
+// comparison function the results must be consistent with another; see 7.20.5 in C99.
+// If errors like lstat failed do occur, this is not fullfilled yet. With fcmp1 it's the same.
+// sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
+int
+fcmp (const void *a, const void *b)
+{
+ struct stat sa, sb;
+ FILE *fa, *fb;
+ int i1, i2, i_nlinka, i_nlinkb, i_fa, i_fb;
+ int rval = 0;
+ const char *filea, *fileb;
+ union u_tag u_buffer1 = {
+ 0
+ }, u_buffer2 =
+ {
+ 0};
+ void *buffer1 = (void *) &u_buffer1;
+ void *buffer2 = (void *) &u_buffer2;
+ _Bool b_swap = false;
+
+ if (i_exit_flag)
+ exit (0);
+ // Nonexistent or non-plain files are less than any other file
+ if (NULL == a)
+ return (-1);
+ filea = *(const char **) a;
+ if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", filea);
+ perror ("lstat");
+ return (-1);
+ }
+ if (NULL == b)
+ return 1;
+ fileb = *(const char **) b;
+ if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", fileb);
+ perror ("lstat");
+ return (1);
+ }
+ i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
+ i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
+ // Smaller files are "less"
+ if (sa.st_size < sb.st_size)
+ return (-1);
+ if (sa.st_size > sb.st_size)
+ return (1);
+ // We now know both files exist, are plain files, are the same size
+ // if both files are hard linked or zero length: sort by name
+ if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) or (0 == sa.st_size))
+ return (strncmp (filea, fileb, BUFSIZ));
+ // We now know both files exist, are plain files, are the same size > 0,
+ // and are not already linked, so compare them alphabetical, if they are on the same device
+ if (sa.st_dev != sb.st_dev)
+ return ((sa.st_dev < sb.st_dev) ? -1 : 1);
+ if (NULL == (fa = fopen (filea, "r")))
+ return (-1); // Unreadable files are "less than"
+ if (NULL == (fb = fopen (fileb, "r")))
+ {
+ fclose (fa);
+ return (1);
+ }
+ // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
+ // On big endian machines it's alphabetic ordering, on little andian machines it's
+ // alphabetic ordering with 64 bit 'characters'.
+ // Du to C99, 7.19.8.1 read must be done with chars.
+ while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
+ {
+ // Mask the unsused 1..7 bytes, because the standard says nothing about them.
+ memset (&(u_buffer1.c[i1]), 0, 8 - i1); // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
+ memset (&(u_buffer2.c[i2]), 0, 8 - i2);
+ // check for file errors
+ i_fa = ferror (fa);
+ i_fb = ferror (fb);
+ if (i_fa or i_fb) // check for file errors
+ {
+ if (i_fa)
+ {
+ (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
+ rval = -1; // error file: smaller
+ break;
+ }
+ if (i_fb)
+ {
+ (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
+ rval = 1;
+ break;
+ }
+ }
+ if (u_buffer1.i64 != u_buffer2.i64) // compare
+ {
+ rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
+ break;
+ }
+ }
+ (void) fclose (fa);
+ (void) fclose (fb);
+ if (rval) // unequal files of same size
+ return rval;
+
+ if ((sa.st_blocks) or (sb.st_blocks)) // avoid linking sparse files with no blocks allocated
+ {
+ // The two files have same content, size > 0, blocks allocated > 0 and are on the same device, so link them.
+ // We prefer to keep the one with less blocks, or if they're the
+ // same the older one, or if they're the same, the one with more (hard) links.
+ if ((sb.st_blocks < sa.st_blocks) or (sb.st_mtime > sa.st_mtime) or (sb.st_nlink < sa.st_nlink))
+ {
+ mc_SWAP_ITEMS (sa, sb, s_swap, sizeof (struct stat)); // swap items to keep original sb
+ mc_SWAP_ITEMS (filea, fileb, s_swap, sizeof (char *)); // swap file name pointers
+ b_swap = true;
+ }
+ // before linking: store number of hard links of each file
+ i_nlinka = sa.st_nlink;
+ i_nlinkb = sb.st_nlink;
+ if (1 == sa.st_nlink) // file a is no (hard) link
+ {
+ Files_deleted++;
+ Blocks_reclaimed += sa.st_blocks;
+ }
+ if (!Nodo and (unlink (filea)))
+ {
+ (void) fprintf (stderr, "unlink(%s) failed\n", filea);
+ perror ("unlink");
+ return (-1);
+ }
+ if (!i_soft)
+ {
+ if (!Nodo and (-1 == link (fileb, filea)))
+ {
+ (void) fprintf (stderr, "link(%s,%s) failed\n", fileb, filea);
+ perror ("link");
+ return (-1);
+ }
+ }
+ else
+ {
+ if (!Nodo and (-1 == symlink (fileb, filea)))
+ {
+ (void) fprintf (stderr, "symlink(%s,%s) failed\n", filea, fileb);
+ perror ("symlink");
+ return (-1);
+ }
+ }
+ if (!Quiet)
+ {
+ (void) printf ("ln %s %s %s: %d, %d -> %d, freed +%llu blocks\n", i_soft ? "-s" : "", fileb, filea, i_nlinkb, i_nlinka, i_nlinka + i_nlinkb,
+ sb.st_blocks);
+ fflush (stdout);
+ }
+ }
+ return ((b_swap) ? strncmp (fileb, filea, BUFSIZ) : strncmp (filea, fileb, BUFSIZ)); //
+} // fcmp
+
+
+// return the index of file with name a_name
+int
+nindex (const char *a_name)
+{
+ void *p_v = bsearch (&a_name, a_names, nfiles, sizeof (char *), cmp0);
+ int i_ret = (int) (((((void *) p_v) - ((void *) a_names))) / (sizeof (char *)));
+ if (NULL == p_v)
+ {
+ printf ("Error: bsearch(%s, %p, %d, %d, %p) returned NULL.\n", a_name, a_names, nfiles, sizeof (char *), cmp);
+ return (-1); // error
+ }
+ return (i_ret);
+}
+
+
+// This is the comparison function called by qsort, for marking all duplicate files with that flag
+// Due to ANSI-C the current version is not
+// completely correct because if the same objects are passed more than one to the
+// comparison function the results must be consistent with another, 7.20.5 C99.
+// If errors like lstat failed do occur, this is not fullfilled yet.
+// sort levels: 0: size (and accessability), 1: device, 2: content, 3.: name
+int
+fcmp0 (const void *a, const void *b)
+{
+ struct stat sa, sb;
+ FILE *fa, *fb;
+ int i1, i2;
+ int rval = 0, ja = 0, jb = 0, i_fa, i_fb;
+ const char *filea, *fileb;
+ union u_tag u_buffer1 = {
+ 0
+ }, u_buffer2 =
+ {
+ 0};
+ void *buffer1 = (void *) &u_buffer1;
+ void *buffer2 = (void *) &u_buffer2;
+
+ if (i_exit_flag)
+ exit (0);
+ // Nonexistent or non-plain files are less than any other file
+ if (NULL == a)
+ return (-1);
+ filea = *(const char **) a;
+ if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", filea);
+ perror ("lstat");
+ return (-1);
+ }
+ if (NULL == b)
+ return 1;
+ fileb = *(const char **) b;
+ if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", fileb);
+ perror ("lstat");
+ return (1);
+ }
+ i_minlinks = mc_MIN (i_minlinks, mc_MIN (sa.st_nlink, sb.st_nlink));
+ i_maxlinks = mc_MAX (i_maxlinks, mc_MAX (sa.st_nlink, sb.st_nlink));
+ // Smaller files are "less"
+ if (sa.st_size < sb.st_size)
+ return (-1);
+ if (sa.st_size > sb.st_size)
+ return (1);
+ // We now know both files exist, are plain files, are the same size
+ // if both files are hard linked: sort by name
+ if (((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)))
+ {
+ out_equal:
+ // set ja, jb to the index of a, b
+ ja = nindex (filea);
+ jb = nindex (fileb);
+ if ((ja >= 0) and (a_flags[ja] bitand 1))
+ return (-1); // marked file first
+ if ((jb >= 0) and (a_flags[jb] bitand 1))
+ return (1); // marked file first
+ // now both files are not marked
+ if ((ja >= 0) and (jb >= 0))
+ {
+ if ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ) <= 0) // alphabetic ordering
+ {
+ a_flags[ja] = 1;
+ if (!Quiet)
+ {
+ (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", filea, fileb);
+ fflush (stdout);
+ }
+ return (-1); // marked file first
+ }
+ else
+ {
+ a_flags[jb] = 1;
+ if (!Quiet)
+ {
+ (void) printf ("Equal files: %s will get deleted, %s will get preserved.\n", fileb, filea);
+ fflush (stdout);
+ }
+ return (1); // marked file first
+ }
+ }
+ return ((b_inv ? -1 : 1) * strncmp (filea, fileb, BUFSIZ));
+ }
+ // We now know both files exist, are plain files, are the same size > 0,
+ // and are not already linked, so compare them alphabetical
+ if (NULL == (fa = fopen (filea, "r")))
+ return (-1); // Unreadable files are "less than"
+ if (NULL == (fb = fopen (fileb, "r")))
+ {
+ fclose (fa);
+ return 1;
+ }
+ // Loop for comparing the files in 64 bit (instead of 8 bit) blocks.
+ // On big endian machines it's alphabetic ordering, on little andian machines it's
+ // alphabetic ordering with 64 bit 'characters'.
+ while (((i1 = fread (buffer1, 1, 8, fa)) != 0) and ((i2 = fread (buffer2, 1, 8, fb)) != 0))
+ {
+ // Mask the unsused 1..7 bytes, because the standard says nothing about them.
+ memset (&(u_buffer1.c[i1]), 0, 8 - i1); // start at the first unused byte buffer1.c[i1], end at Byte7 buffer1.c[7]
+ memset (&(u_buffer2.c[i2]), 0, 8 - i2);
+ // check for file errors
+ i_fa = ferror (fa);
+ i_fb = ferror (fb);
+ if (i_fa or i_fb) // check for file errors
+ {
+ if (i_fa)
+ {
+ (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
+ rval = -1; // error file: smaller
+ break;
+ }
+ if (i_fb)
+ {
+ (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
+ rval = 1;
+ break;
+ }
+ }
+ if (u_buffer1.i64 != u_buffer2.i64) // compare
+ {
+ rval = (u_buffer1.i64 < u_buffer2.i64) ? -1 : 1;
+ break;
+ }
+ }
+ (void) fclose (fa);
+ (void) fclose (fb);
+ if (rval) // unequal files of same size
+ return rval;
+ // else
+ goto out_equal;
+} // fcmp0
+
+
+// This is the hard link expansion function for the inverse mode. Needs only one file name as argument.
+int
+fcmp1 (const void *a)
+{
+ struct stat sa;
+ FILE *fa, *fb;
+ int i1, i, i_b, i_fa, i_fb, i_ret = 0;
+ char a_cd[BUFSIZ] = { '\0' }; // for directory
+ char a_cf[BUFSIZ] = { '\0' }; // for tempfile
+ const char *filea, *fileb = a_cf; // file names, fileb after mkstemp
+ union u_tag u_buffer1 = {
+ 0
+ };
+ void *buffer1 = (void *) &u_buffer1;
+
+ if (i_exit_flag)
+ exit (0);
+ // Nonexistent or non-plain files are less than any other file
+ if (NULL == a)
+ return (-1);
+ filea = *(const char **) a;
+ if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", filea);
+ perror ("lstat");
+ return (-1);
+ }
+ i_minlinks = mc_MIN (i_minlinks, sa.st_nlink);
+ i_maxlinks = mc_MAX (i_maxlinks, sa.st_nlink);
+ // check for hard links
+ if (sa.st_nlink > 1) // hard link found
+ {
+ i_files_expanded++;
+ i_blocks_declaimed += sa.st_blocks;
+ if (!Nodo) // expand the hard link
+ {
+ strncpy (a_cd, filea, BUFSIZ);
+ // extract the directory from the file name: delete from end to last "/"
+ for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
+ {
+ if (a_cd[i] != '/')
+ a_cd[i] = '\0';
+ else
+ break;
+ }
+ // now the actual directory is in a_c, create temporary file there
+ strncpy (a_cf, a_cd, BUFSIZ);
+ strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2); // template for mkstemp
+ i_b = mkstemp (a_cf);
+ if (-1 == i_b)
+ {
+ (void) fprintf (stderr, "Could not create temporary file.\n");
+ perror ("file_create");
+ return (-1);
+ }
+ if ((NULL == (fa = fopen (filea, "r"))) or (NULL == (fb = fdopen (i_b, "w")))) // open file and tmpfile
+ {
+ (void) fprintf (stderr, "Expanding the hard link(%s) failed.\n", filea);
+ perror ("expand");
+ return (-1);
+ }
+ // expand to tmpfile
+ while ((i1 = fread (buffer1, 1, 8, fa)) != 0) // copy fa to fb in 8 byte blocks
+ {
+ (void) fwrite (buffer1, 1, i1, fb);
+ }
+ i_fa = ferror (fa);
+ i_fb = ferror (fb);
+ if (i_fa or i_fb) // check for file errors
+ {
+ if (i_fa)
+ (void) fprintf (stderr, "file %s: error %d.\n", filea, i_fa);
+ if (i_fb)
+ (void) fprintf (stderr, "file %s: error %d.\n", fileb, i_fb);
+ i_ret = -1;
+ }
+ (void) fclose (fa);
+ (void) fclose (fb);
+ // unlink
+ if (unlink (filea))
+ {
+ (void) fprintf (stderr, "unlink(%s) failed, tempfile %s remains\n", filea, fileb);
+ perror ("unlink");
+ return (-1);
+ }
+ // rename to original name
+ if (rename (fileb, filea))
+ {
+ (void) fprintf (stderr, "rename(%s, %s) failed, tempfile %s remains\n", fileb, filea, fileb);
+ perror ("unlink");
+ return (-1);
+ }
+ } // if (!Nodo)
+ if (!Quiet)
+ {
+ (void) printf ("expand %s: %d -> %d, unfreed %llu blocks\n", filea, sa.st_nlink, sa.st_nlink - 1, sa.st_blocks); // sa has the old cached value
+ fflush (stdout);
+ }
+ } // if (sa.st_ino > 1)
+ return (i_ret);
+} // fcmp1
+
+
+// This is the (de)sparsing function.
+// Copying sparse/desparse can reduce/increase st_blocks while st_size remains constant.
+int
+fcmp2 (const void *a, const _Bool b_S)
+{
+ FILE *fb;
+ struct stat sa, sb;
+ int i, i_b, status = 0, i_pid, i_ret = 0;
+ long int li;
+ char a_cd[BUFSIZ] = { '\0' }; // for directory, temporary string
+ char a_cf[BUFSIZ] = { '\0' }; // for tempfile
+ char a_ca[BUFSIZ] = { '\0' }; // for tempfile
+ const char *filea = a_ca, *fileb = a_cf; // file names
+
+ if (i_exit_flag)
+ exit (0);
+ // Nonexistent or non-plain files are less than any other file
+ if (NULL == a)
+ return (-1);
+ strncpy (a_ca, *(const char **) a, BUFSIZ - 1);
+ // filea = *(const char **) a;
+ if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", filea);
+ perror ("lstat");
+ return (-1);
+ }
+ // get tmpname: mkstemp, unlink
+ strncpy (a_cd, filea, BUFSIZ);
+ // extract the directory from the file name: delete from end to last "/"
+ for (i = strnlen (a_cd, BUFSIZ); i >= 0; i--)
+ {
+ if (a_cd[i] != '/')
+ a_cd[i] = '\0';
+ else
+ break;
+ }
+ // now the actual directory is in a_c, create temporary file there
+ strncpy (a_cf, a_cd, BUFSIZ);
+ strncat (a_cf, "dup__XXXXXXX", BUFSIZ / 2); // template for mkstemp
+ i_b = mkstemp (a_cf);
+ if (-1 == i_b)
+ {
+ (void) fprintf (stderr, "Could not create temporary file.\n");
+ perror ("mkstemp");
+ return (-1);
+ }
+ // dummy open and close to close the tmpfile
+ if (NULL == (fb = fdopen (i_b, "w"))) // open tmpfile
+ {
+ (void) fprintf (stderr, "fdopen (%d) failed.\n", i_b);
+ perror ("fdopen");
+ return (-1);
+ }
+ fclose (fb);
+ if (unlink (fileb)) // delete file because we do need only the name
+ {
+ (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
+ perror ("unlink");
+ return (-1);
+ }
+ switch (i_pid = vfork ())
+ {
+ case -1: // parent with fork error
+ perror ("vfork");
+ fprintf (stderr, "vfork failed.\n");
+ break;
+ case 0: // child
+ memset (a_cd, '\0', sizeof (a_cd));
+ strncpy (a_cd, (b_S ? "--sparse=always" : "--sparse=never"), BUFSIZ);
+ i = execl ("/bin/cp", "cp", a_cd, filea, fileb, NULL);
+ if (i)
+ {
+ fprintf (stderr, "execlp(cp %s %s %s) failed, return value %d\n", a_cd, filea, fileb, i);
+ perror ("fork");
+ }
+ _exit (i);
+ break;
+ default: // parent with no error: do proceed
+ break;
+ }
+ i = waitpid (i_pid, &status, WUNTRACED); // wait till child terminates
+ if (status)
+ {
+ fprintf (stderr, "copying failed; child with pid %d returned %d\n", i, status);
+ i_ret = -1;
+ goto end;
+ }
+ // lstat and check copy
+ if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", fileb);
+ perror ("lstat");
+ return (-1);
+ }
+ li = sa.st_blocks - sb.st_blocks;
+ if (not b_S)
+ li = -li;
+ li_minblocks = mc_MIN (li_minblocks, li);
+ li_maxblocks = mc_MAX (li_maxblocks, li);
+ // if shrinked in sparse mode or expanded in desparse mode: unlink original and rename tmpfile to original name
+ if ((b_S ? (sa.st_blocks > sb.st_blocks) : (sa.st_blocks < sb.st_blocks)))
+ {
+ b_S ? Files_deleted++ : i_files_expanded++;
+ b_S ? (Blocks_reclaimed += sa.st_blocks - sb.st_blocks) : (i_blocks_declaimed += sb.st_blocks - sa.st_blocks);
+ if (!Quiet)
+ {
+ (void) printf ("%s: %lld -> %lld blocks\n", filea, sa.st_blocks, sb.st_blocks);
+ fflush (stdout);
+ }
+ if (not Nodo)
+ {
+ if (unlink (filea)) // delete original
+ {
+ (void) fprintf (stderr, "unlink(%s) failed\n", filea);
+ perror ("unlink");
+ return (-1);
+ }
+ // rename to original name
+ if (rename (fileb, filea))
+ {
+ (void) fprintf (stderr, "rename(%s, %s) failed\n", fileb, filea);
+ perror ("unlink");
+ i_ret = -1;
+ goto end;
+ }
+ return (0);
+ }
+ }
+end:
+ if (unlink (fileb)) // delete tmpfile
+ {
+ (void) fprintf (stderr, "unlink(%s) failed\n", fileb);
+ perror ("unlink");
+ return (-1);
+ }
+ return (i_ret);
+} // fcmp2
+
+
+// return true if the two files are not hard linkded
+_Bool
+different_files (const void *a, const void *b)
+{
+ struct stat sa, sb;
+ const char *filea, *fileb;
+
+ // Nonexistent or non-plain files are less than any other file
+ if (NULL == a)
+ return (true);
+ filea = *(const char **) a;
+ if (-1 == lstat (filea, &sa)) // or ! S_ISREG (sa.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", filea);
+ perror ("lstat");
+ return (true);
+ }
+ if (NULL == b)
+ return (true);
+ fileb = *(const char **) b;
+ if (-1 == lstat (fileb, &sb)) // or ! S_ISREG (sb.st_mode))
+ {
+ fprintf (stderr, "lstat(%s) failed\n", fileb);
+ perror ("lstat");
+ return (true);
+ }
+ if ((sa.st_dev == sb.st_dev) and (sa.st_ino == sb.st_ino)) // same device and same inode?
+ return (false); // yes: equal files
+ return (true);
+} // different_files