diff --git a/.travis.yml b/.travis.yml index 99b462b5..ec62f41f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: c install: - sudo apt-get update - - sudo apt-get install python3-sphinx gettext python3-setuptools + - sudo apt-get install scons python3-sphinx gettext python3-setuptools - sudo apt-get install libblkid-dev libelf-dev libglib2.0-dev libjson-glib-dev - sudo apt-get install clang - sudo easy_install3 $(cat test-requirements.txt) diff --git a/.version b/.version index 4ebff58f..11a5bd1b 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -2.8.0 Maidenly Moose +2.9.0 Odd Olm diff --git a/CHANGELOG.md b/CHANGELOG.md index 519c8c68..ad709837 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,48 @@ All notable changes to this project will be documented in this file. The format follows [keepachangelog.com]. Please stick to it. +## [2.9.0 Odd Olm] -- 2019-08-20 + +### Added + +* An up-to-date COPR package for Fedora (thanks eclipseo). +* Add --xattr as --xattr-write, --xattr-read & --write-unfinished to enable easy + checksum caching for the next run. +* Unique option for json and csv formatters (-c json:unique) +* New -k option for rmlint.sh to keep directory timestamps like before deletion. + +### Changed + +* Warn when using -j without -D. +* The exit status is now EXIT_FAILURE when rmlint was interrupted. +* Slightly changed progressbar ETA algorithm to deliver more stable results. +* docs: added a few scripting examples to underline the possibility to use 3rd-party tools. +* --dedupe option now checks if the files are already reflinks to reduce disk thrashing. +* Made sure that -T dd is the same as specifying -D. + +### Deprecated + +Nothing was deprecated. + +### Removed + +Nothing was removed. + +### Fixed + +* Dry-run was not honoured when a user-defined command was used. 
+* Fix crash when compiled with -msse4.2 +* Relative symbolic links were compared wrong when using -f (See issue #333) +* gui: The size options were not correctly passed to rmlint (only Megabytes) +* xattr: timestamps were often compared wrongly. +* Fiemap (and thus reflink detection) code was improved. +* --replay did not replay duplicate directories correctly. +* --replay did not honour --hidden and --size correctly. +* Various build issues. +* gui: fix "render from selected" feature. +* gui: generated script now correctly removes itself. +* gui: fix display of paths with ampersands in them. + ## [2.8.0 Maidenly Moose] -- 2018-10-30 Mostly a bugfix release with smaller functional changes. diff --git a/README.rst b/README.rst index 34b49b8c..2982a5a8 100644 --- a/README.rst +++ b/README.rst @@ -38,11 +38,14 @@ Finds… **Differences to other duplicate finders:** -- Extremely fast (no exaggeration, we promise!). +- Extremely fast (no exaggeration, we promise!) - Paranoia mode for those who do not trust hashsums. - Many output formats. - No interactivity. -- Search for files only newer than a certain ``mtime``. +- Search for files only newer than a certain ``mtime``. +- Many ways to handle duplicates. +- Caching and replaying. +- ``btrfs`` support. - ... It runs and compiles under most Unices, including Linux, FreeBSD and Darwin. 
diff --git a/SConstruct b/SConstruct index b6f17490..f78dd938 100755 --- a/SConstruct +++ b/SConstruct @@ -287,7 +287,7 @@ def check_faccessat(context): def check_xattr(context): rc = 1 - for func in ['getxattr', 'setxattr', 'removexattr']: + for func in ['getxattr', 'setxattr', 'removexattr', 'listxattr']: if tests.CheckFunc( context, func, header= @@ -304,6 +304,27 @@ def check_xattr(context): return rc + +def check_lxattr(context): + rc = 1 + + for func in ['lgetxattr', 'lsetxattr', 'lremovexattr', 'llistxattr']: + if tests.CheckFunc( + context, func, + header= + '#include ' + '#include ' + ): + rc = 0 + break + + conf.env['HAVE_LXATTR'] = rc + + context.did_show_result = True + context.Result(rc) + return rc + + def check_sha512(context): rc = 1 if tests.CheckDeclaration(context, 'G_CHECKSUM_SHA512', includes='#include \n'): @@ -562,6 +583,7 @@ conf = Configure(env, custom_tests={ 'check_libelf': check_libelf, 'check_fiemap': check_fiemap, 'check_xattr': check_xattr, + 'check_lxattr': check_lxattr, 'check_sha512': check_sha512, 'check_blkid': check_blkid, 'check_posix_fadvise': check_posix_fadvise, @@ -695,6 +717,7 @@ conf.check_sys_block() conf.check_libelf() conf.check_fiemap() conf.check_xattr() +conf.check_lxattr() conf.check_bigfiles() conf.check_sha512() conf.check_gettext() diff --git a/docs/_static/benchmarks/found_items.html b/docs/_static/benchmarks/found_items.html index 8c3a33dc..d3196d98 100644 --- a/docs/_static/benchmarks/found_items.html +++ b/docs/_static/benchmarks/found_items.html @@ -28,4 +28,13 @@ #table-7e93a246-3029-4e5b-83fc-e25d48bf01fd tr:nth-child(2n-1) td { background-color: #f2f2f2; } -
rdfindfdupesrmlintrmlint-paranoidrmlint-replayrmlint-v2.2.2rmlint-v2.2.2-paranoidrmlint-xxhashrmlint-olddupdbaseline.py
Duplicates027.203k27.203k27.203k27.203k27.203k27.203k27.203k39.656k43.217k67.931k
Originals016.115k16.115k16.115k16.115k16.115k16.115k16.115k15.133k16.109k22.848k
\ No newline at end of file + + + + + + + + + +
fdupesrmlintrmlint-paranoidrmlint-replayrmlint-v2.2.2rmlint-v2.2.2-paranoidrmlint-xxhashrmlint-olddupdbaseline.py
Duplicates27.203k27.203k27.203k27.203k27.203k27.203k27.203k39.656k43.217k67.931k
Originals16.115k16.115k16.115k16.115k16.115k16.115k16.115k15.133k16.109k22.848k
\ No newline at end of file diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst index 59c5e76c..eda66486 100644 --- a/docs/benchmarks.rst +++ b/docs/benchmarks.rst @@ -13,6 +13,12 @@ on ``rmlint``. You're of course free to interpret something different or re-run_ the benchmarks on your own machine. The exact version of each program is given in the plots. +.. warning:: + + This page is a little out of date. Help in updating it would be appreciated. + The performance characteristics of rmlint have improved overall, but + so might have other tools. + It should be noted that it is very hard to compare these tools, since *each* tool investigated a slightly different amount of data and produces different results on the dataset below. This is partly due to the fact that some tools @@ -95,7 +101,6 @@ case. .. raw:: html :file: _static/benchmarks/found_items.html -| Surprisingly each tool found a different set of files. As stated above, direct comparison may not be possible here. For most tools except ``rdfind`` and diff --git a/docs/conf.py b/docs/conf.py index 7a4d23e3..f6f74497 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -66,7 +66,7 @@ # General information about the project. 
project = 'rmlint documentation' -copyright = '2014-2015, Christopher Pahl & Daniel Thomas' +copyright = '2014-2019, Christopher Pahl & Daniel Thomas' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/docs/index.rst b/docs/index.rst index 386a8dcb..0cbdc068 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -129,8 +129,8 @@ Authors ``rmlint`` was and is written by: =================================== ============================= =========================================== -*Christopher Pahl* https://github.com/sahib 2010-2018 -*Daniel Thomas* https://github.com/SeeSpotRun 2014-2018 +*Christopher Pahl* https://github.com/sahib 2010-2019 +*Daniel Thomas* https://github.com/SeeSpotRun 2014-2019 =================================== ============================= =========================================== Additional thanks to: diff --git a/docs/install.rst b/docs/install.rst index 500700a2..b22e22a9 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -2,9 +2,9 @@ Installation ============ Many major Linux distribution might already package ``rmlint`` -- but watch out for -the version. This manual describes the rewrite of ``rmlint`` (i.e. version :math:`\geq 2`). -Old versions before this might contain bugs, have design flaws or might eat your -hamster. We recommend using the newest version. +the version. If possible, we recommend using the `newest version`_ available. + +.. _`newest version`: https://github.com/sahib/rmlint/releases If there is no package yet or you want to try a development version, you gonna need to compile ``rmlint`` from source. @@ -38,7 +38,7 @@ Here's a list of readily prepared commands for known operating systems: .. 
code-block:: bash - $ yum -y install git scons python3-sphinx gettext json-glib-devel + $ yum -y install pkgconf git scons python3-sphinx gettext json-glib-devel $ yum -y install glib2-devel libblkid-devel elfutils-libelf-devel # Optional dependencies for the GUI: $ yum -y install pygobject3 gtk3 librsvg2 @@ -50,6 +50,10 @@ Here's a list of readily prepared commands for known operating systems: $ dnf copr enable eclipseo/rmlint $ dnf install rmlint + Since **Fedora 29** we also have an `official package`_. + +.. _`official package`: https://bugzilla.redhat.com/show_bug.cgi?id=1655338 + .. _`Fedora Copr`: https://copr.fedorainfracloud.org/coprs/eclipseo/rmlint/ * **ArchLinux:** @@ -64,7 +68,7 @@ Here's a list of readily prepared commands for known operating systems: .. code-block:: bash - $ pacman -S git scons python-sphinx + $ pacman -S pkgconf git scons python-sphinx $ pacman -S glib2 libutil-linux elfutils json-glib # Optional dependencies for the GUI: $ pacman -S gtk3 python-gobject librsvg @@ -82,7 +86,10 @@ Here's a list of readily prepared commands for known operating systems: .. _`PKGBUILD`: https://aur.archlinux.org/packages/rm/rmlint-git/PKGBUILD .. _`ArchLinux AUR`: https://aur.archlinux.org/packages/rmlint-git -* **Ubuntu** :math:`\geq 12.04`: +* **Debian** / **Ubuntu** :math:`\geq 12.04`: + + Note: Debian also `ships an official package`_. + Use the below instructions if you need a more recent version. This most likely applies to most distributions that are derived from Ubuntu. Note that the ``GUI`` depends on ``GTK+ >= 3.12``! @@ -90,12 +97,14 @@ Here's a list of readily prepared commands for known operating systems: .. 
code-block:: bash - $ apt-get install git scons python3-sphinx python3-nose gettext build-essential + $ apt-get install pkg-config git scons python3-sphinx python3-nose gettext build-essential # Optional dependencies for more features: $ apt-get install libelf-dev libglib2.0-dev libblkid-dev libjson-glib-1.0 libjson-glib-dev # Optional dependencies for the GUI: $ apt-get install python3-gi gir1.2-rsvg gir1.2-gtk-3.0 python-cairo gir1.2-polkit-1.0 gir1.2-gtksource-3.0 + +.. _`ships an official package`: https://packages.debian.org/de/sid/rmlint .. _`still ships`: https://github.com/sahib/rmlint/issues/171#issuecomment-199070974 * **macOS** @@ -130,8 +139,7 @@ Here's a list of readily prepared commands for known operating systems: ----- Send us a note if you want to see your distribution here or the instructions -need an update. -The commands above install the full dependencies, therefore +need an update. The commands above install the full dependencies, therefore some packages might be stripped if you do not need the feature they enable. Only hard requirement for the commandline is ``glib``. diff --git a/docs/rmlint.1.rst b/docs/rmlint.1.rst index d9c44933..79cef8b7 100644 --- a/docs/rmlint.1.rst +++ b/docs/rmlint.1.rst @@ -22,7 +22,7 @@ It's main focus lies on finding duplicate files and directories. It is able to find the following types of lint: -* Duplicate files and directories (and as a result unique files). +* Duplicate files and directories (and as a by-product unique files). * Nonstripped Binaries (Binaries with debug symbols; needs to be explicitly enabled). * Broken symbolic links. * Empty files and directories (also nested empty directories). @@ -94,7 +94,7 @@ General Options * ``emptyfiles``, ``ef``: Find empty files. * ``nonstripped``, ``ns``: Find nonstripped binaries. * ``duplicates``, ``df``: Find duplicate files. - * ``duplicatedirs``, ``dd``: Find duplicate directories. 
+ * ``duplicatedirs``, ``dd``: Find duplicate directories (This is the same as ``-D``!) **WARNING:** It is good practice to enclose the description in single or double quotes. In obscure cases argument parsing might fail in weird ways, @@ -514,23 +514,43 @@ Caching *NOTE:* In ``--replay`` mode, a new ``.json`` file will be written to ``rmlint.replay.json`` in order to avoid overwriting ``rmlint.json``. +:``-C --xattr``: + + Shortcut for ``--xattr-write``, ``--xattr-read``, ``--write-unfinished``. + This will write a checksum and a timestamp to the extended attributes of each + file that rmlint hashed. This speeds up subsequent runs on the same data set. + Please note that not all filesystems may support extended attributes and you + need write support to use this feature. + + See the individual options below for more details and some examples. + :``--xattr-read`` / ``--xattr-write`` / ``--xattr-clear``: Read or write cached checksums from the extended file attributes. This feature can be used to speed up consecutive runs. - **CAUTION:** This could potentially lead to false positives if file contents are - somehow modified without changing the file mtime. + **CAUTION:** This could potentially lead to false positives if file + contents are somehow modified without changing the file modification time. + rmlint uses the mtime to determine whether a cached checksum + is outdated. This is not a problem if you use the clone or reflink + operation on a filesystem like btrfs. There an outdated checksum entry + would simply lead to some duplicate work done in the kernel but would do no + harm otherwise. **NOTE:** Many tools do not support extended file attributes properly, resulting in a loss of the information when copying the file or editing it. - Also, this is a linux specific feature that works not on all filesystems and - only if you have write permissions to the file. + + **NOTE:** You can specify ``--xattr-write`` and ``--xattr-read`` at the same time. 
+ This will read from existing checksums at the start of the run and update all hashed + files at the end. Usage example:: - $ rmlint large_file_cluster/ -U --xattr-write # first run. - $ rmlint large_file_cluster/ --xattr-read # second run. + $ rmlint large_file_cluster/ -U --xattr-write # first run should be slow. + $ rmlint large_file_cluster/ --xattr-read # second run should be faster. + + # Or do the same in just one run: + $ rmlint large_file_cluster/ --xattr :``-U --write-unfinished``: @@ -542,6 +562,8 @@ Caching re-running rmlint on a large dataset this can greatly speed up a re-run in some cases. Please refer to ``--xattr-read`` for an example. + If you want to output unique files, please look into the ``uniques`` output formatter. + Rarely used, miscellaneous options ---------------------------------- @@ -614,6 +636,7 @@ FORMATTERS Available options: * *no_header*: Do not write a first line describing the column headers. + * *unique*: Include unique files in the output. * ``sh``: Output all found lint as shell script This formatter is activated as default. @@ -667,12 +690,19 @@ FORMATTERS Available options: + - *unique*: Include unique files in the output. - *no_header=[true|false]:* Print the header with metadata (default: true) - *no_footer=[true|false]:* Print the footer with statistics (default: true) - *oneline=[true|false]:* Print one json document per line (default: false) This is useful if you plan to parse the output line-by-line, e.g. while ``rmlint`` is sill running. + This formatter is extremely useful if you're in need of scripting more complex behaviour, + that is not directly possible with rmlint's built-in options. A very handy tool here is ``jq``. + Here is an example to output all original files directly from a ``rmlint`` run: + + ``$ rmlint -o json | jq -r '.[1:-1][] | select(.is_original) | .path'`` + * ``py``: Outputs a python script and a JSON document, just like the **json** formatter. 
The JSON document is written to ``.rmlint.json``, executing the script will make it read from there. This formatter is mostly intended for complex use-cases @@ -680,6 +710,9 @@ FORMATTERS Therefore the python script can be modified to do things standard ``rmlint`` is not able to do easily. +* ``uniques``: Outputs all unique paths found during the run, one path per line. + This is often useful for scripting purposes. + * ``stamp``: Outputs a timestamp of the time ``rmlint`` was run. @@ -777,18 +810,19 @@ OTHER STAND-ALONE COMMANDS :``rmlint --is-reflink [-v|-V] ``: Tests whether ``file1`` and ``file2`` are reflinks (reference same data). - Return codes: - 0: files are reflinks - 1: files are not reflinks - 3: not a regular file - 4: file sizes differ - 5: fiemaps can't be read - 6: file1 and file2 are the same path - 7: file1 and file2 are the same file under different mountpoints - 8: files are hardlinks - 9: files are symlinks (TODO) - 10: files are not on same device - 11: other error encountered + This command makes ``rmlint`` exit with one of the following exit codes: + + * 0: files are reflinks + * 1: files are not reflinks + * 3: not a regular file + * 4: file sizes differ + * 5: fiemaps can't be read + * 6: file1 and file2 are the same path + * 7: file1 and file2 are the same file under different mountpoints + * 8: files are hardlinks + * 9: files are symlinks + * 10: files are not on same device + * 11: other error encountered EXAMPLES @@ -868,8 +902,21 @@ This is a collection of common use cases and other tricks: ``$ rmlint --equal a b c && echo "Files are equal" || echo "Files are not equal"`` * Test if two files are reflinks - ``rmlint --is-reflink a b && echo "Files are reflinks" || echo "Files are not reflinks"``. + ``$ rmlint --is-reflink a b && echo "Files are reflinks" || echo "Files are not reflinks"``. + +* Cache calculated checksums for next run. 
The checksums will be written to the extended file attributes: + + ``$ rmlint --xattr`` + +* Produce a list of unique files in a folder: + + ``$ rmlint -o uniques`` + +* Produce a list of files that are unique, including original files ("one of each"): + + ``$ rmlint t -o json -o uniques:unique_files | jq -r '.[1:-1][] | select(.is_original) | .path' | sort > original_files`` + ``$ cat unique_files original_files`` PROBLEMS ======== @@ -923,7 +970,7 @@ You can build a debug build of ``rmlint`` like this: * ``git clone git@github.com:sahib/rmlint.git`` * ``cd rmlint`` -* ``scons DEBUG=1`` +* ``scons GDB=1 DEBUG=1`` * ``sudo scons install # Optional`` LICENSE diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1d882c7f..67e1e5e4 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -455,10 +455,11 @@ paranoia level can be adjusted using the ``-p`` (``--paranoid``) switch. Here's what they do in detail: -* ``-p`` is equivalent to ``--algorithm=paranoid`` -* ``-P`` is equivalent to ``--algorithm=highway256`` -* ``-PP`` is equivalent to ``--algorithm=metro256`` -* ``-PP`` is equivalent to ``--algorithm=metro`` +- ``-p`` is equivalent to ``--algorithm=paranoid`` +- ``-P`` is equivalent to ``--algorithm=highway256`` +- ``-PP`` is equivalent to ``--algorithm=metro256`` +- ``-PPP`` is equivalent to ``--algorithm=metro`` + As you see, it just enables a certain duplicate detection algorithm to either use a stronger hash function or to do a byte-by-byte comparison. While this might sound @@ -666,8 +667,9 @@ here, that the original is taken from a directory that was preserved. So exactly one copy of the ``xxx``-content file stays on the filesystem in the end. ``rmlint`` finds duplicate directories by counting all files in the directory -tree and looking up if there's an equal amount of duplicate and empty files. -If so, it tries the same with the parent directory. +tree and looking up if there's an equal amount of duplicate and empty files. 
If +yes, it checks if the hashes are equal using a Merkle Tree. If it matches, it +continues on the parent directory. Some file like hidden files will not be recognized as duplicates, but still added to the count. This will of course lead to unmerged directories. That's why @@ -755,17 +757,19 @@ Miscellaneous options --------------------- If you read so far, you know ``rmlint`` pretty well by now. -Here's just a list of options that are nice to know, but not essential: +Here's just a list of options that are nice to know, but are not essential: -- Consecutive runs of ``rmlint`` can be speed up by using ``--xattr-read``. +- Consecutive runs of ``rmlint`` can be speed up by using ``--xattr``. .. code-block:: python - $ rmlint large_dataset/ --xattr-write --write-unfinished - $ rmlint large_dataset/ --xattr-read + $ rmlint large_dataset/ --xattr + $ rmlint large_dataset/ --xattr - Here, the second run should (or *might*) run a lot faster. - But be sure to read the caveats stated in the `manpage`_! + Here, the second run should (or *might*) run a lot faster, since the first + run saved all calculated checksums to the extended attributes of each + processed file. But be sure to read the caveats stated in the `manpage`_! + Especially keep in mind that you need to have write access to the files for this to work. - ``-r`` (``--hidden``): Include hidden files and directories. The default is to ignore these, to save you from destroying git repositories (or similar @@ -817,3 +821,47 @@ Here's just a list of options that are nice to know, but not essential: $ rmlint -q 100 -Q .9 .. _manpage: http://rmlint.readthedocs.org/en/latest/rmlint.1.html + +Scripting +--------- + +Sometimes ``rmlint`` can't help you with everything. While we get a lot of +feature request to have some more comfort-options built-in, we have to decline +most of them to make sure that ``rmlint`` does not become too bloated. But fret not, +we have lots of possibilities to do some scripting. 
Let's look at some common problem +that can be easily solved with some other tools. + +Copying unique files +~~~~~~~~~~~~~~~~~~~~ + +Imagine you have a folder from which you want to extract one copy of each file +by content. This does not only mean unique files, but also original files - but +not their duplicates. This sounds like a job for the ``uniques`` formatter +(which outputs all unique paths, one by line) and ``jq``. ``jq`` is a great +little `tool`_ which makes it really easy to extract data from a ``json`` file: + +.. _tool: https://stedolan.github.io/jq + +.. code-block:: bash + + # This command produces two files: + # - unique_files: Files that have a unique checksum in the directory. + # - original_files: Files that have the "is_original" field set to true in the json output. + # The '.[1:-1]' part is for filtering the header and footer part of the json response. + $ rmlint t -o json -o uniques:unique_files | jq -r '.[1:-1][] | select(.is_original) | .path' | sort > original_files + # Now we only need to combine both files: + $ cat unique_files original_files + +Filter by regular expressions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A very often requested feature is to provide something like ``--include`` or +``--exclude`` where you could pass a regular expression to include or exclude +some files from traversing. But ``rmlint`` is not a clone of ``find``. Instead +you can use ``find`` just fine to pipe output into ``rmlint``: + + +.. code-block:: bash + + # Deduplicate all png pictures that were not modified in the last 24 hours: + $ find ~ -iname '*.png' ! 
-mtime 0 | rmlint - diff --git a/gui/Makefile b/gui/Makefile index 8ce64ff9..1a479211 100644 --- a/gui/Makefile +++ b/gui/Makefile @@ -7,6 +7,6 @@ schema: @glib-compile-schemas ~/.glib-schemas run: resources schema - @GSETTINGS_SCHEMA_DIR=~/.glib-schemas/ python3 shredder/__main__.py + @GSETTINGS_SCHEMA_DIR=~/.glib-schemas/ python3 -c "import shredder; shredder.run_gui()" all: run diff --git a/gui/shredder/chart.py b/gui/shredder/chart.py index 977f31c3..c9a2bf6e 100644 --- a/gui/shredder/chart.py +++ b/gui/shredder/chart.py @@ -266,10 +266,12 @@ def __init__(self, node, layer, degree, size, tooltip=None): self.is_selected = False # Cut off too long tooltips: - if len(tooltip) > 60: - self.tooltip = tooltip[:60] + '...' - else: - self.tooltip = tooltip + if tooltip is not None: + tooltip = GLib.markup_escape_text(tooltip, -1) + if len(tooltip) > 60: + self.tooltip = tooltip[:60] + '...' + else: + self.tooltip = tooltip def draw(self, ctx, alloc, max_layers, bg_col): """Trigger the actual drawing of the segment.""" diff --git a/gui/shredder/runner.py b/gui/shredder/runner.py index a5b1db2f..c1394419 100644 --- a/gui/shredder/runner.py +++ b/gui/shredder/runner.py @@ -175,9 +175,9 @@ def _create_rmlint_process( min_size, max_size = cfg.get_value('traverse-size-limits') extra_options += [ - '--size', '{a}M-{b}M'.format( - a=min_size // (1024 ** 2), - b=max_size // (1024 ** 2) + '--size', '{a}-{b}'.format( + a=min_size, + b=max_size ) ] @@ -390,6 +390,7 @@ def replay(self, allowed_paths=None): with open(replay_path, 'w') as handle: handle.write(json.dumps(results)) + # Regenerate output formatters by calling rmlint: process = _create_rmlint_process( self.settings, self._tmpdir.name, @@ -427,11 +428,26 @@ def save(self, dest_path, file_type='sh'): return try: - shutil.copy(source_path_func(), dest_path) + source_path = source_path_func() + shutil.copy(source_path, dest_path) + if file_type == "sh": + _fix_shell_auto_remove_path(dest_path, source_path) except OSError: 
LOGGER.exception('Could not save') +def _fix_shell_auto_remove_path(sh_path, temp_path): + """ + Shell scripts contain the path that they were created under. + This is used to remove the script after a successful run. + """ + with open(sh_path, "r") as fd: + text = fd.read() + + with open(sh_path, "w") as fd: + fd.write(text.replace(temp_path, sh_path)) + + def _strip_ascii_colors(text): """Strip ascii colors from `text`""" return ASCII_COLOR_REGEX.sub('', text) diff --git a/gui/shredder/tree.py b/gui/shredder/tree.py index 3e495f97..8854e9f5 100644 --- a/gui/shredder/tree.py +++ b/gui/shredder/tree.py @@ -125,7 +125,7 @@ def __getitem__(self, idx): if idx is Column.PATH: return self.name elif idx is Column.TOOLTIP: - return self.build_path() + return GLib.markup_escape_text(self.build_path(), -1) else: return self.row[idx] @@ -871,7 +871,7 @@ def get_selected_nodes(self): yield node def get_selected_node(self): - """Return thefirst selected node or None.""" + """Return the first selected node or None.""" try: return next(self.get_selected_nodes()) except StopIteration: diff --git a/gui/shredder/views/runner.py b/gui/shredder/views/runner.py index 0f2de8cc..b2d27f9f 100644 --- a/gui/shredder/views/runner.py +++ b/gui/shredder/views/runner.py @@ -397,37 +397,42 @@ def on_selection_changed(self, _): self.group_revealer.set_reveal_child(False) self.chart_stack.render(node) - def _generate_script(self, trie, node): + def _generate_script(self, trie, nodes): """Do the actual generation work, starting at `node` in `trie`.""" self._script_generated = True - gen = trie.iterate(node=node) - self.runner.replay({ - ch.build_path(): NodeState.should_keep(ch[Column.TAG]) for ch in gen if ch.is_leaf - }) + path_to_is_original = {} + for node in nodes: + for ch in trie.iterate(node=node): + if not ch.is_leaf: + continue + path_to_is_original[ch.build_path()] = \ + NodeState.should_keep(ch[Column.TAG]) + + self.runner.replay(path_to_is_original) 
self.app_window.views.go_right.set_sensitive(True) self.app_window.views.switch('editor') def on_generate_script(self, _): """Generate the full script.""" - self._generate_script(self.model.trie, self.model.trie.root) + self._generate_script(self.model.trie, [self.model.trie.root]) def on_generate_filtered_script(self, _): """Generate the script with only the visible content.""" model = self.treeview.get_model() - self._generate_script(model.trie, model.trie.root) + self._generate_script(model.trie, [model.trie.root]) def on_generate_selection_script(self, _): """Generate the script only from the current selected dir or files.""" model = self.treeview.get_model() - selected_node = self.treeview.get_selected_node() + selected_nodes = self.treeview.get_selected_nodes() - if selected_node is None: + if not selected_nodes: LOGGER.info('Nothing selected to make script from.') return - self._generate_script(model.trie, selected_node) + self._generate_script(model.trie, selected_nodes) def on_default_action(self): """Called on Ctrl-Enter""" diff --git a/gui/shredder/window.py b/gui/shredder/window.py index 65ed64a4..15763043 100644 --- a/gui/shredder/window.py +++ b/gui/shredder/window.py @@ -252,11 +252,24 @@ def __init__(self, application): 'clicked', lambda btn: self.views.set_search_mode(btn.get_active()) ) + close_button = Gtk.Button() + close_button.add( + Gtk.Image.new_from_gicon( + Gio.ThemedIcon(name="window-close-symbolic"), + Gtk.IconSize.BUTTON + ) + ) + close_button.get_style_context().add_class("titlebutton") + close_button.connect( + 'clicked', lambda _: self.close(), + ) + if shredder.APP_USE_TRADITIONAL_MENU: menu_button.set_use_popover(False) self.main_grid = Gtk.Grid() self.main_grid.attach(self.view_stack, 0, 3, 1, 1) + self.headerbar.pack_end(close_button) self.headerbar.pack_end(menu_button) self.headerbar.pack_end(search_button) self.add(self.main_grid) diff --git a/lib/SConscript b/lib/SConscript index beba7ef5..7d9b4b61 100644 --- a/lib/SConscript 
+++ b/lib/SConscript @@ -24,6 +24,7 @@ def build_config_template(target, source, env): HAVE_GIO_UNIX=env['HAVE_GIO_UNIX'], HAVE_FIEMAP=env['HAVE_FIEMAP'], HAVE_XATTR=env['HAVE_XATTR'], + HAVE_LXATTR=env['HAVE_LXATTR'], HAVE_SHA512=env['HAVE_SHA512'], HAVE_BIGFILES=env['HAVE_BIGFILES'], HAVE_STAT64=env['HAVE_BIG_STAT'], diff --git a/lib/cfg.h b/lib/cfg.h index 5b1e7e4a..8029e775 100644 --- a/lib/cfg.h +++ b/lib/cfg.h @@ -176,6 +176,7 @@ typedef struct RmCfg { gboolean run_equal_mode; /* --dedupe options */ bool dedupe; + bool dedupe_check_xattr; bool dedupe_readonly; /* for --is-reflink option */ diff --git a/lib/cmdline.c b/lib/cmdline.c index 2998da55..3940a093 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -352,6 +352,17 @@ static gboolean rm_cmd_parse_limit_sizes(_UNUSED const char *option_name, } } +static gboolean rm_cmd_parse_xattr(_UNUSED const char *option_name, + _UNUSED const gchar *_, + RmSession *session, + _UNUSED GError **error) { + session->cfg->write_cksum_to_xattr = true; + session->cfg->read_cksum_from_xattr= true; + session->cfg->clear_xattr_fields = false; + session->cfg->write_unfinished = true; + return true; +} + static GLogLevelFlags VERBOSITY_TO_LOG_LEVEL[] = {[0] = G_LOG_LEVEL_CRITICAL, [1] = G_LOG_LEVEL_ERROR, [2] = G_LOG_LEVEL_WARNING, @@ -399,11 +410,11 @@ static bool rm_cmd_parse_output_pair(RmSession *session, const char *pair, char *extension = strchr(pair, '.'); if(extension == NULL) { full_path = "stdout"; - strncpy(format_name, pair, strlen(pair)); + strncpy(format_name, pair, sizeof(format_name)-1); } else { extension += 1; full_path = (char *)pair; - strncpy(format_name, extension, strlen(extension)); + strncpy(format_name, extension, sizeof(format_name)-1); } } else { full_path = separator + 1; @@ -991,7 +1002,7 @@ static gboolean rm_cmd_parse_merge_directories(_UNUSED const char *option_name, RmCfg *cfg = session->cfg; cfg->merge_directories = true; - /* Pull in some options for convinience, + /* Pull in some options for 
convenience, * duplicate dir detection works better with them. * * They may be disabled explicitly though. @@ -1006,6 +1017,24 @@ static gboolean rm_cmd_parse_merge_directories(_UNUSED const char *option_name, return true; } +static gboolean rm_cmd_parse_hidden(_UNUSED const char *option_name, + _UNUSED const gchar *_, + RmSession *session, + _UNUSED GError **error) { + session->cfg->ignore_hidden = false; + session->cfg->partial_hidden = false; + return true; +} + +static gboolean rm_cmd_parse_no_hidden(_UNUSED const char *option_name, + _UNUSED const gchar *_, + RmSession *session, + _UNUSED GError **error) { + session->cfg->ignore_hidden = true; + session->cfg->partial_hidden = false; + return true; +} + static gboolean rm_cmd_parse_honour_dir_layout(_UNUSED const char *option_name, _UNUSED const gchar *_, RmSession *session, @@ -1066,7 +1095,7 @@ static gboolean rm_cmd_parse_sortby(_UNUSED const char *option_name, } /* Remember the criteria string */ - strncpy(cfg->rank_criteria, criteria, sizeof(cfg->rank_criteria)); + strncpy(cfg->rank_criteria, criteria, sizeof(cfg->rank_criteria)-1); /* ranking the files depends on caching them to the end of the program */ cfg->cache_file_structs = true; @@ -1135,7 +1164,6 @@ static gboolean rm_cmd_parse_btrfs_clone(_UNUSED const char *option_name, static gboolean rm_cmd_parse_btrfs_readonly(_UNUSED const char *option_name, _UNUSED const gchar *x, RmSession *session, _UNUSED GError **error) { - rm_log_warning_line("option --btrfs-readonly is deprecated, use --dedupe-readonly"); session->cfg->dedupe_readonly = true; return true; } @@ -1264,9 +1292,8 @@ bool rm_cmd_parse_args(int argc, char **argv, RmSession *session) { const int DISABLE = G_OPTION_FLAG_REVERSE, EMPTY = G_OPTION_FLAG_NO_ARG, HIDDEN = G_OPTION_FLAG_HIDDEN, OPTIONAL = G_OPTION_FLAG_OPTIONAL_ARG; - /* Free/Used Options: - Used: abBcCdDeEfFgGHhiI kKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ| - Free: jJ | + /* Free short options: AJ + * (beside special characters and numbers) 
*/ /* clang-format off */ @@ -1283,6 +1310,7 @@ bool rm_cmd_parse_args(int argc, char **argv, RmSession *session) { {"newer-than-stamp" , 'n' , 0 , G_OPTION_ARG_CALLBACK , FUNC(timestamp_file) , _("Newer than stamp file") , "PATH"} , {"newer-than" , 'N' , 0 , G_OPTION_ARG_CALLBACK , FUNC(timestamp) , _("Newer than timestamp") , "STAMP"} , {"config" , 'c' , 0 , G_OPTION_ARG_CALLBACK , FUNC(config) , _("Configure a formatter") , "FMT:K[=V]"} , + {"xattr" , 'C' , EMPTY , G_OPTION_ARG_CALLBACK , FUNC(xattr) , _("Enable xattr based caching") , ""} , /* Non-trivial switches */ {"progress" , 'g' , EMPTY , G_OPTION_ARG_CALLBACK , FUNC(progress) , _("Enable progressbar") , NULL} , @@ -1293,11 +1321,11 @@ bool rm_cmd_parse_args(int argc, char **argv, RmSession *session) { /* Trivial boolean options */ {"no-with-color" , 'W' , DISABLE , G_OPTION_ARG_NONE , &cfg->with_color , _("Be not that colorful") , NULL} , - {"hidden" , 'r' , DISABLE , G_OPTION_ARG_NONE , &cfg->ignore_hidden , _("Find hidden files") , NULL} , + {"hidden" , 'r' , EMPTY , G_OPTION_ARG_CALLBACK , FUNC(hidden) , _("Find hidden files") , NULL} , {"followlinks" , 'f' , EMPTY , G_OPTION_ARG_CALLBACK , FUNC(follow_symlinks) , _("Follow symlinks") , NULL} , {"no-followlinks" , 'F' , DISABLE , G_OPTION_ARG_NONE , &cfg->follow_symlinks , _("Ignore symlinks") , NULL} , {"paranoid" , 'p' , EMPTY , G_OPTION_ARG_CALLBACK , FUNC(paranoid) , _("Use more paranoid hashing") , NULL} , - {"no-crossdev" , 'x' , DISABLE , G_OPTION_ARG_NONE , &cfg->crossdev , _("Do not cross mounpoints") , NULL} , + {"no-crossdev" , 'x' , DISABLE , G_OPTION_ARG_NONE , &cfg->crossdev , _("Do not cross mountpoints") , NULL} , {"keep-all-tagged" , 'k' , 0 , G_OPTION_ARG_NONE , &cfg->keep_all_tagged , _("Keep all tagged files") , NULL} , {"keep-all-untagged" , 'K' , 0 , G_OPTION_ARG_NONE , &cfg->keep_all_untagged , _("Keep all untagged files") , NULL} , {"must-match-tagged" , 'm' , 0 , G_OPTION_ARG_NONE , &cfg->must_match_tagged , _("Must have twin 
in tagged dir") , NULL} , @@ -1317,7 +1345,8 @@ bool rm_cmd_parse_args(int argc, char **argv, RmSession *session) { /* COW filesystem deduplication support */ {"dedupe" , 0 , 0 , G_OPTION_ARG_NONE , &cfg->dedupe , _("Dedupe matching extents from source to dest (if filesystem supports)") , NULL} , - {"dedupe-readonly" , 'r' , 0 , G_OPTION_ARG_NONE , &cfg->dedupe_readonly , _("(--dedupe option) even dedupe read-only snapshots (needs root)") , NULL} , + {"dedupe-xattr" , 0 , 0 , G_OPTION_ARG_NONE , &cfg->dedupe_check_xattr , _("Check extended attributes to see if the file is already deduplicated") , NULL} , + {"dedupe-readonly" , 0 , 0 , G_OPTION_ARG_NONE , &cfg->dedupe_readonly , _("(--dedupe option) even dedupe read-only snapshots (needs root)") , NULL} , {"is-reflink" , 0 , 0 , G_OPTION_ARG_NONE , &cfg->is_reflink , _("Test if two files are reflinks (share same data extents)") , NULL} , /* Callback */ @@ -1333,7 +1362,7 @@ bool rm_cmd_parse_args(int argc, char **argv, RmSession *session) { }; const GOptionEntry inversed_option_entries[] = { - {"no-hidden" , 'R' , HIDDEN , G_OPTION_ARG_NONE , &cfg->ignore_hidden , "Ignore hidden files" , NULL} , + {"no-hidden" , 'R' , EMPTY | HIDDEN , G_OPTION_ARG_CALLBACK , FUNC(no_hidden) , "Ignore hidden files" , NULL} , {"with-color" , 'w' , HIDDEN , G_OPTION_ARG_NONE , &cfg->with_color , "Be colorful like a unicorn" , NULL} , {"hardlinked" , 'l' , HIDDEN , G_OPTION_ARG_NONE , &cfg->find_hardlinked_dupes , _("Report hardlinks as duplicates") , NULL} , {"crossdev" , 'X' , HIDDEN , G_OPTION_ARG_NONE , &cfg->crossdev , "Cross mountpoints" , NULL} , @@ -1486,13 +1515,16 @@ bool rm_cmd_parse_args(int argc, char **argv, RmSession *session) { if(cfg->partial_hidden && !cfg->merge_directories) { /* --partial-hidden only makes sense with --merge-directories. - * If the latter is not specfified, ignore it all together */ + * If the latter is not specfified, ignore it all together + * and act as if --no-hidden was specified. 
*/ cfg->ignore_hidden = true; cfg->partial_hidden = false; } if(cfg->honour_dir_layout && !cfg->merge_directories) { rm_log_warning_line(_("--honour-dir-layout (-j) makes no sense without --merge-directories (-D)")); + rm_log_warning_line(_("Note that not having duplicate directories enabled as lint type (e.g via -T df)")); + rm_log_warning_line(_("will also disable --merge-directories and trigger this warning.")); } if(cfg->progress_enabled) { @@ -1718,5 +1750,9 @@ int rm_cmd_main(RmSession *session) { return session->equal_exit_code; } + if(exit_state == EXIT_SUCCESS && rm_session_was_aborted()) { + exit_state = EXIT_FAILURE; + } + return exit_state; } diff --git a/lib/config.h.in b/lib/config.h.in index c9f3bfda..44d7e5d9 100644 --- a/lib/config.h.in +++ b/lib/config.h.in @@ -10,6 +10,7 @@ #define HAVE_GIO_UNIX ({HAVE_GIO_UNIX}) #define HAVE_FIEMAP ({HAVE_FIEMAP}) #define HAVE_XATTR ({HAVE_XATTR}) +#define HAVE_LXATTR ({HAVE_LXATTR}) #define HAVE_SHA512 ({HAVE_SHA512}) #define HAVE_BIGFILES ({HAVE_BIGFILES}) #define HAVE_STAT64 ({HAVE_STAT64}) @@ -120,51 +121,62 @@ # define N_(String) gettext_noop (String) #endif +GMutex RM_LOG_MTX; + +#define RM_LOG_INIT g_mutex_init(&RM_LOG_MTX); typedef guint64 RmOff; /* Stupid macros to make printing error lines easier */ -#define rm_log_error_prefix() \ - rm_log_error(RED); \ - rm_log_error(_("ERROR")); \ - rm_log_error(": "RESET); \ +#define rm_log_error_prefix() \ + rm_log_error(RED); \ + rm_log_error(_("ERROR")); \ + rm_log_error(": "RESET); \ #define rm_log_warning_prefix() \ rm_log_warning(YELLOW); \ rm_log_warning(_("WARNING")); \ rm_log_warning(": "RESET); \ -#define rm_log_info_prefix() \ - rm_log_warning(GREEN); \ - rm_log_warning(_("INFO")); \ - rm_log_warning(": "RESET); \ +#define rm_log_info_prefix() \ + rm_log_warning(GREEN); \ + rm_log_warning(_("INFO")); \ + rm_log_warning(": "RESET); \ -#define rm_log_debug_prefix() \ - rm_log_debug(BLUE); \ - rm_log_debug(_("DEBUG")); \ - rm_log_debug(": "RESET); \ 
+#define rm_log_debug_prefix() \ + rm_log_debug(BLUE); \ + rm_log_debug(_("DEBUG")); \ + rm_log_debug(": "RESET); \ /////////////// -#define rm_log_error_line(...) \ - rm_log_error_prefix() \ - rm_log_error(__VA_ARGS__); \ - rm_log_error("\n"); \ +#define rm_log_error_line(...) \ + g_mutex_lock(&RM_LOG_MTX); \ + rm_log_error_prefix() \ + rm_log_error(__VA_ARGS__); \ + rm_log_error("\n"); \ + g_mutex_unlock(&RM_LOG_MTX); \ #define rm_log_warning_line(...) \ + g_mutex_lock(&RM_LOG_MTX); \ rm_log_warning_prefix() \ rm_log_warning(__VA_ARGS__); \ rm_log_warning("\n"); \ + g_mutex_unlock(&RM_LOG_MTX); \ #define rm_log_info_line(...) \ + g_mutex_lock(&RM_LOG_MTX); \ rm_log_info_prefix() \ rm_log_warning(__VA_ARGS__); \ rm_log_warning("\n"); \ - -#define rm_log_debug_line(...) \ - rm_log_debug_prefix() \ - rm_log_debug(__VA_ARGS__); \ - rm_log_debug("\n"); \ + g_mutex_unlock(&RM_LOG_MTX); \ + +#define rm_log_debug_line(...) \ + g_mutex_lock(&RM_LOG_MTX); \ + rm_log_debug_prefix() \ + rm_log_debug(__VA_ARGS__); \ + rm_log_debug("\n"); \ + g_mutex_unlock(&RM_LOG_MTX); \ /* Domain for reporting errors. 
Needed by GOptions */ #define RM_ERROR_QUARK (g_quark_from_static_string("rmlint")) diff --git a/lib/file.c b/lib/file.c index c3798deb..b0757c28 100644 --- a/lib/file.c +++ b/lib/file.c @@ -137,8 +137,7 @@ static const char *LINT_TYPES[] = {[RM_LINT_TYPE_UNKNOWN] = "", [RM_LINT_TYPE_EMPTY_FILE] = "emptyfile", [RM_LINT_TYPE_DUPE_CANDIDATE] = "duplicate_file", [RM_LINT_TYPE_DUPE_DIR_CANDIDATE] = "duplicate_dir", - [RM_LINT_TYPE_UNIQUE_FILE] = "unfinished_cksum"}; -/* TODO: rename 'unfinished_cksum; to 'unique_file' and update nosetests accordingly */ + [RM_LINT_TYPE_UNIQUE_FILE] = "unique_file"}; const char *rm_file_lint_type_to_string(RmLintType type) { return LINT_TYPES[MIN(type, sizeof(LINT_TYPES) / sizeof(const char *))]; diff --git a/lib/formats/csv.c b/lib/formats/csv.c index 0e4b3062..187ea3e3 100644 --- a/lib/formats/csv.c +++ b/lib/formats/csv.c @@ -46,23 +46,33 @@ static void rm_fmt_head(_UNUSED RmSession *session, _UNUSED RmFmtHandler *parent return; } - fprintf(out, "%s%s%s%s%s%s%s\n", "type", CSV_SEP, "path", CSV_SEP, "size", CSV_SEP, - "checksum"); + fprintf( + out, + "%s%s%s%s%s%s%s\n", + "type", CSV_SEP, + "path", CSV_SEP, + "size", CSV_SEP, + "checksum" + ); } static void rm_fmt_elem(_UNUSED RmSession *session, _UNUSED RmFmtHandler *parent, FILE *out, RmFile *file) { - if(file->lint_type == RM_LINT_TYPE_UNIQUE_FILE && - (!file->digest || !session->cfg->write_unfinished)) { - /* unique file with no partial checksum */ - return; + if(file->lint_type == RM_LINT_TYPE_UNIQUE_FILE) { + if(!rm_fmt_get_config_value(session->formats, "csv", "unique")) { + if(!file->digest || !session->cfg->write_unfinished) { + return; + } + } } - char checksum_str[rm_digest_get_bytes(file->digest) * 2 + 1]; - memset(checksum_str, '0', sizeof(checksum_str)); - checksum_str[sizeof(checksum_str) - 1] = 0; + char *checksum_str = NULL; + size_t checksum_size = 0; - if(file->digest) { + if(file->digest != NULL) { + checksum_size = rm_digest_get_bytes(file->digest) * 2 + 1; + 
checksum_str = g_slice_alloc0(checksum_size); + checksum_str[checksum_size - 1] = 0; rm_digest_hexstring(file->digest, checksum_str); } @@ -70,10 +80,20 @@ static void rm_fmt_elem(_UNUSED RmSession *session, _UNUSED RmFmtHandler *parent RM_DEFINE_PATH(file); char *clean_path = rm_util_strsub(file_path, CSV_QUOTE, CSV_QUOTE "" CSV_QUOTE); - fprintf(out, CSV_FORMAT, rm_file_lint_type_to_string(file->lint_type), clean_path, - file->actual_file_size, checksum_str); + fprintf( + out, + CSV_FORMAT, + rm_file_lint_type_to_string(file->lint_type), + clean_path, + file->actual_file_size, + checksum_str + ); g_free(clean_path); + + if(checksum_str != NULL) { + g_slice_free1(checksum_size, checksum_str); + } } static RmFmtHandlerProgress CSV_HANDLER_IMPL = { @@ -85,7 +105,7 @@ static RmFmtHandlerProgress CSV_HANDLER_IMPL = { .elem = rm_fmt_elem, .prog = NULL, .foot = NULL, - .valid_keys = {"no_header", NULL}, + .valid_keys = {"no_header", "unique", NULL}, }}; RmFmtHandler *CSV_HANDLER = (RmFmtHandler *)&CSV_HANDLER_IMPL; diff --git a/lib/formats/json.c b/lib/formats/json.c index e0746448..4a492a74 100644 --- a/lib/formats/json.c +++ b/lib/formats/json.c @@ -55,7 +55,9 @@ static guint32 rm_fmt_json_generate_id(RmFmtHandlerJSON *self, RmFile *file, for(int i = 0; i < 8192; ++i) { hash ^= MurmurHash3_x86_32(file_path, strlen(file_path), i); - hash ^= MurmurHash3_x86_32(cksum, strlen(cksum), i); + if(cksum != NULL) { + hash ^= MurmurHash3_x86_32(cksum, strlen(cksum), i); + } if(!g_hash_table_contains(self->id_set, GUINT_TO_POINTER(hash))) { break; @@ -250,19 +252,28 @@ static void rm_fmt_json_cksum(RmFile *file, char *checksum_str, size_t size) { rm_digest_hexstring(file->digest, checksum_str); } -static void rm_fmt_elem(RmSession *session, _UNUSED RmFmtHandler *parent, FILE *out, - RmFile *file) { +static void rm_fmt_elem(RmSession *session, _UNUSED RmFmtHandler *parent, FILE *out, RmFile *file) { if(rm_fmt_get_config_value(session->formats, "json", "no_body")) { return; } - 
if(file->lint_type == RM_LINT_TYPE_UNIQUE_FILE && - (!file->digest || !session->cfg->write_unfinished)) { - /* unique file with no partial checksum */ - return; + + if(file->lint_type == RM_LINT_TYPE_UNIQUE_FILE) { + if(!rm_fmt_get_config_value(session->formats, "json", "unique")) { + if(!file->digest || !session->cfg->write_unfinished) { + return; + } + } } - char checksum_str[rm_digest_get_bytes(file->digest) * 2 + 1]; - rm_fmt_json_cksum(file, checksum_str, sizeof(checksum_str)); + char *checksum_str = NULL; + size_t checksum_size = 0; + + if(file->digest != NULL) { + checksum_size = rm_digest_get_bytes(file->digest) * 2 + 1; + checksum_str = g_slice_alloc0(checksum_size); + rm_fmt_json_cksum(file, checksum_str, checksum_size); + checksum_str[checksum_size - 1] = 0; + } RmFmtHandlerJSON *self = (RmFmtHandlerJSON *)parent; @@ -279,10 +290,14 @@ static void rm_fmt_elem(RmSession *session, _UNUSED RmFmtHandler *parent, FILE * gdouble progress = 0; if(session->shred_bytes_after_preprocess) { - progress = CLAMP(100 - - 100 * ((gdouble)session->shred_bytes_remaining / - (gdouble)session->shred_bytes_after_preprocess), - 0, 100); + progress = CLAMP( + 100 - 100 * ( + (gdouble)session->shred_bytes_remaining / + (gdouble)session->shred_bytes_after_preprocess + ), + 0, + 100 + ); } rm_fmt_json_key_int(out, "progress", progress); rm_fmt_json_sep(self, out); @@ -313,7 +328,7 @@ static void rm_fmt_elem(RmSession *session, _UNUSED RmFmtHandler *parent, FILE * if(session->cfg->find_hardlinked_dupes) { RmFile *hardlink_head = RM_FILE_HARDLINK_HEAD(file); - if(hardlink_head && hardlink_head != file) { + if(hardlink_head && hardlink_head != file && file->digest) { char orig_checksum_str[rm_digest_get_bytes(file->digest) * 2 + 1]; rm_fmt_json_cksum(hardlink_head, orig_checksum_str, sizeof(orig_checksum_str)); @@ -331,6 +346,10 @@ static void rm_fmt_elem(RmSession *session, _UNUSED RmFmtHandler *parent, FILE * rm_fmt_json_key_float(out, "mtime", file->mtime); } 
rm_fmt_json_close(self, out); + + if(checksum_str != NULL) { + g_slice_free1(checksum_size, checksum_str); + } } static RmFmtHandlerJSON JSON_HANDLER_IMPL = { @@ -343,7 +362,7 @@ static RmFmtHandlerJSON JSON_HANDLER_IMPL = { .elem = rm_fmt_elem, .prog = NULL, .foot = rm_fmt_foot, - .valid_keys = {"no_header", "no_footer", "no_body", "oneline", NULL}, + .valid_keys = {"no_header", "no_footer", "no_body", "oneline", "unique", NULL}, }, .pretty = true}; diff --git a/lib/formats/progressbar.c b/lib/formats/progressbar.c index 57e9a1ae..b2ab9606 100644 --- a/lib/formats/progressbar.c +++ b/lib/formats/progressbar.c @@ -58,9 +58,10 @@ typedef struct RmFmtHandlerProgress { GTimer *timer; /* Estimated Time of Arrival calculation */ - RmRunningMean read_diff_mean; + RmRunningMean speed_mean; RmRunningMean eta_mean; - RmOff last_shred_bytes_remaining; + RmOff last_shred_bytes_read; + gint64 last_check_time; /* Last calculation of ETA for caching purpose */ char last_eta[1024]; @@ -83,50 +84,56 @@ static void rm_fmt_progress_format_preprocess(RmSession *session, char *buf, } } -static gdouble rm_fmt_progress_calculated_eta( +static gdouble rm_fmt_progress_calculate_eta( RmSession *session, RmFmtHandlerProgress *self, - gdouble elapsed_sec, RmFmtProgressState state) { - if(self->last_shred_bytes_remaining == 0) { - self->last_shred_bytes_remaining = session->shred_bytes_remaining; - return -1.0; - } - if(state != RM_PROGRESS_STATE_SHREDDER) { return -1.0; } - RmOff last_diff = self->last_shred_bytes_remaining - session->shred_bytes_remaining; - self->last_shred_bytes_remaining = session->shred_bytes_remaining; + // Diff is the number of bytes that we read in the last time interval. + // This does not involve bytes that were just skipped due to bad checksums. + RmOff diff = session->shred_bytes_read - self->last_shred_bytes_read; + + // took is the length of the time interval it took in seconds. 
+ gint64 now = g_get_monotonic_time(); + gdouble took = (now - self->last_check_time) / (1000.0 * 1000.0); - rm_running_mean_add(&self->read_diff_mean, last_diff); + self->last_check_time = now; + self->last_shred_bytes_read = session->shred_bytes_read; - gdouble avg_bytes_read = rm_running_mean_get(&self->read_diff_mean); - gdouble throughput = avg_bytes_read / elapsed_sec; + if(diff > 0) { + // speed is the average time a single file took to process or filter in files/sec. + gdouble speed = diff / took; - gdouble eta_sec = 0; - if(throughput != 0.0) { - eta_sec = session->shred_bytes_remaining / throughput; + // Average the mean, to make sure that we do not directly give in to "speed bumps" + rm_running_mean_add(&self->speed_mean, speed); } + gdouble mean_speed = rm_running_mean_get(&self->speed_mean); + if(FLOAT_SIGN_DIFF(mean_speed, 0.0, MTIME_TOL) <= 0) { + return -1; + } + + gdouble eta_sec = session->shred_bytes_remaining / mean_speed; rm_running_mean_add(&self->eta_mean, eta_sec); - return rm_running_mean_get(&self->eta_mean); + + gdouble mean_eta = rm_running_mean_get(&self->eta_mean); + return mean_eta; } static char *rm_fmt_progress_get_cached_eta( RmSession *session, - RmFmtHandlerProgress *self, - gfloat elapsed_sec + RmFmtHandlerProgress *self ) { /* If there was already an ETA calculation in the last 500ms, * use that one to avoid flickering (see issue #292) */ - gdouble eta_sec = rm_fmt_progress_calculated_eta( + gdouble eta_sec = rm_fmt_progress_calculate_eta( session, self, - elapsed_sec, self->last_state ); @@ -153,7 +160,6 @@ static void rm_fmt_progress_format_text( RmSession *session, RmFmtHandlerProgress *self, int max_len, - gdouble elapsed_sec, FILE *out ) { /* This is very ugly, but more or less required since we need to translate @@ -188,7 +194,7 @@ static void rm_fmt_progress_format_text( self->percent = 1.0 - ((gdouble)session->shred_bytes_remaining / (gdouble)session->shred_bytes_after_preprocess); - char *eta_info = 
rm_fmt_progress_get_cached_eta(session, self, elapsed_sec); + char *eta_info = rm_fmt_progress_get_cached_eta(session, self); rm_util_size_to_human_readable( session->shred_bytes_remaining, num_buf, sizeof(num_buf) @@ -395,9 +401,10 @@ static void rm_fmt_prog(RmSession *session, const char *update_interval_str = rm_fmt_get_config_value(session->formats, "progressbar", "update_interval"); - rm_running_mean_init(&self->read_diff_mean, 10); - rm_running_mean_init(&self->eta_mean, 50); - self->last_shred_bytes_remaining = 0; + rm_running_mean_init(&self->speed_mean, 10); + rm_running_mean_init(&self->eta_mean, 25); + self->last_shred_bytes_read = 0; + self->last_check_time = g_get_monotonic_time(); self->plain = true; if(rm_fmt_get_config_value(session->formats, "progressbar", "fancy") != NULL) { @@ -474,7 +481,7 @@ static void rm_fmt_prog(RmSession *session, gdouble elapsed_sec = g_timer_elapsed(self->timer, NULL); if(force_draw || elapsed_sec * 1000.0 >= self->update_interval) { - rm_fmt_progress_format_text(session, self, text_width, elapsed_sec, out); + rm_fmt_progress_format_text(session, self, text_width, out); if(state == RM_PROGRESS_STATE_PRE_SHUTDOWN) { /* do not overwrite last messages */ self->percent = 1.05; @@ -500,7 +507,7 @@ static void rm_fmt_prog(RmSession *session, if(state == RM_PROGRESS_STATE_PRE_SHUTDOWN) { fprintf(out, "\n\n"); g_timer_destroy(self->timer); - rm_running_mean_unref(&self->read_diff_mean); + rm_running_mean_unref(&self->speed_mean); rm_running_mean_unref(&self->eta_mean); } } diff --git a/lib/formats/py.py b/lib/formats/py.py index ba9787dc..6bb4ee7e 100644 --- a/lib/formats/py.py +++ b/lib/formats/py.py @@ -91,7 +91,7 @@ def handle_duplicate_file(path, original, args, **kwargs): os.remove(path) -def handle_unfinished_cksum(path, **kwargs): +def handle_unique_file(path, **kwargs): pass # doesn't need any handling. 
@@ -133,7 +133,7 @@ def handle_badugid(path, **kwargs): OPERATIONS = { "duplicate_dir": handle_duplicate_dir, "duplicate_file": handle_duplicate_file, - "unfinished_cksum": handle_unfinished_cksum, + "unique_file": handle_unique_file, "emptydir": handle_empty_dir, "emptyfile": handle_empty_file, "nonstripped": handle_nonstripped, @@ -157,7 +157,7 @@ def exec_operation(item, original=None, args=None): MESSAGES = { 'duplicate_dir': '{c[yellow]}Deleting duplicate directory:', 'duplicate_file': '{c[yellow]}Deleting duplicate:', - 'unfinished_cksum': 'checking', + 'unique_file': 'checking', 'emptydir': '{c[green]}Deleting empty directory:', 'emptyfile': '{c[green]}Deleting empty file:', 'nonstripped': '{c[green]}Stripping debug symbols:', diff --git a/lib/formats/sh.c.in b/lib/formats/sh.c.in index 9baaf4e7..6823ee96 100644 --- a/lib/formats/sh.c.in +++ b/lib/formats/sh.c.in @@ -289,6 +289,16 @@ static char *rm_fmt_sh_get_extra_equal_args(RmSession *session) { return g_string_free(buf, false); } +static char *rm_fmt_sh_get_extra_dedupe_args(RmSession *session) { + RmCfg *cfg = session->cfg; + + if(cfg->write_cksum_to_xattr) { + return " --dedupe-xattr"; + } + + return ""; +} + static void rm_fmt_head(RmSession *session, RmFmtHandler *parent, FILE *out) { RmFmtHandlerShScript *self = (RmFmtHandlerShScript *)parent; @@ -306,6 +316,9 @@ static void rm_fmt_head(RmSession *session, RmFmtHandler *parent, FILE *out) { } else if(rm_fmt_get_config_value(session->formats, "sh", "link") != NULL) { /* Preset: try reflinks, then then hardlinks then symlinks */ rm_sh_parse_handlers(self, "reflink,hardlink,symlink"); + } else if(rm_fmt_get_config_value(session->formats, "sh", "reflink") != NULL) { + /* Preset: same as above, just an alias */ + rm_sh_parse_handlers(self, "reflink,hardlink,symlink"); } else if(rm_fmt_get_config_value(session->formats, "sh", "hardlink") != NULL) { /* Preset: try hardlinks before using symlinks */ rm_sh_parse_handlers(self, "hardlink,symlink"); @@ -322,6 
+335,7 @@ static void rm_fmt_head(RmSession *session, RmFmtHandler *parent, FILE *out) { } char *equal_extra_args = rm_fmt_sh_get_extra_equal_args(session); + char *dedupe_extra_args = rm_fmt_sh_get_extra_dedupe_args(session); /* Fill in all placeholders in the script template */ /* This is a brittle hack of which the author should be ashamed. */ @@ -334,6 +348,8 @@ static void rm_fmt_head(RmSession *session, RmFmtHandler *parent, FILE *out) { rm_util_get_groupname(), "'%s[%3d%%]%s '", /* The progress format */ equal_extra_args, + dedupe_extra_args, + dedupe_extra_args, (self->user_cmd) ? self->user_cmd : "echo 'no user command defined.'", (session->cfg->joined_argv) ? (session->cfg->joined_argv) : "unknown_commandline" ); @@ -484,7 +500,7 @@ static RmFmtHandlerShScript SH_SCRIPT_HANDLER_IMPL = { .elem = rm_fmt_elem, .prog = NULL, .foot = rm_fmt_foot, - .valid_keys = {"handler", "cmd", "clone", "link", "hardlink", "symlink", NULL}, + .valid_keys = {"handler", "cmd", "clone", "link", "hardlink", "symlink", "reflink", NULL}, }, .last_original = NULL, .line_count = 0 diff --git a/lib/formats/sh.sh b/lib/formats/sh.sh index 2904fb5b..966f6596 100644 --- a/lib/formats/sh.sh +++ b/lib/formats/sh.sh @@ -20,6 +20,8 @@ fi USER='%s' GROUP='%s' +STAMPFILE=$(mktemp 'rmlint.XXXXXXXX.stamp') + # Set to true on -n DO_DRY_RUN= @@ -35,6 +37,9 @@ DO_SHOW_PROGRESS=true # Set to true on -c DO_DELETE_EMPTY_DIRS= +# Set to true on -k +DO_KEEP_DIR_TIMESTAMPS= + ################################## # GENERAL LINT HANDLER FUNCTIONS # ################################## @@ -218,9 +223,9 @@ clone() { echo "${COL_YELLOW}Cloning to: ${COL_RESET}$1" if [ -z "$DO_DRY_RUN" ]; then if [ -n "$DO_CLONE_READONLY" ]; then - $SUDO_COMMAND $RMLINT_BINARY --dedupe -r "$2" "$1" + $SUDO_COMMAND $RMLINT_BINARY --dedupe %s --dedupe-readonly "$2" "$1" else - $RMLINT_BINARY --dedupe "$2" "$1" + $RMLINT_BINARY --dedupe %s "$2" "$1" fi fi } @@ -232,7 +237,7 @@ skip_hardlink() { skip_reflink() { 
print_progress_prefix - echo "{$COL_BLUE}Leaving as-is (already reflinked to original): ${COL_RESET}$1" + echo "${COL_BLUE}Leaving as-is (already reflinked to original): ${COL_RESET}$1" } user_command() { @@ -250,8 +255,18 @@ remove_cmd() { echo "${COL_YELLOW}Deleting: ${COL_RESET}$1" if original_check "$1" "$2"; then if [ -z "$DO_DRY_RUN" ]; then + if [ ! -z "$DO_KEEP_DIR_TIMESTAMPS" ]; then + touch -r "$(dirname $1)" "$STAMPFILE" + fi + rm -rf "$1" + if [ ! -z "$DO_KEEP_DIR_TIMESTAMPS" ]; then + # Swap back old directory timestamp: + touch -r "$STAMPFILE" "$(dirname $1)" + rm "$STAMPFILE" + fi + if [ ! -z "$DO_DELETE_EMPTY_DIRS" ]; then DIR=$(dirname "$1") while [ ! "$(ls -A "$DIR")" ]; do @@ -310,13 +325,14 @@ OPTIONS: -n Do not perform any modifications, just print what would be done. (implies -d and -x) -c Clean up empty directories while deleting duplicates. -q Do not show progress. + -k Keep the timestamp of directories when removing duplicates. EOF } DO_REMOVE= DO_ASK= -while getopts "dhxnrpqc" OPTION +while getopts "dhxnrpqck" OPTION do case $OPTION in h) @@ -346,6 +362,9 @@ do q) DO_SHOW_PROGRESS= ;; + k) + DO_KEEP_DIR_TIMESTAMPS=true + ;; *) usage exit 1 diff --git a/lib/formats/stats.c b/lib/formats/stats.c index a3e58768..06f3b29b 100644 --- a/lib/formats/stats.c +++ b/lib/formats/stats.c @@ -109,9 +109,12 @@ static void rm_fmt_prog(RmSession *session, char eff_total[64] = "NaN"; char eff_dupes[64] = "NaN"; if(session->shred_bytes_read != 0) { - gfloat efficiency = 100 * (0 + session->duplicate_bytes + - session->original_bytes + session->unique_bytes) / - session->shred_bytes_read; + gfloat efficiency = 100 * ( + session->duplicate_bytes + + session->original_bytes + + session->unique_bytes + ) / session->shred_bytes_read; + snprintf(eff_total, sizeof(eff_total), "%.0f%%", efficiency); efficiency = 100 * (0 + session->duplicate_bytes + session->original_bytes) / diff --git a/lib/hasher.c b/lib/hasher.c index 7c00e918..be7c5263 100644 --- 
a/lib/hasher.c +++ b/lib/hasher.c @@ -216,7 +216,7 @@ static gboolean rm_hasher_buffered_read(RmHasher *hasher, GThreadPool *hashpipe, rm_log_error_line("Unexpected EOF in rm_hasher_buffered_read"); break; } else if(bytes_read == 0) { - rm_log_error_line(_("Something went wrong reading %s; expected %li bytes, " + rm_log_warning_line(_("Something went wrong reading %s; expected %li bytes, " "got %li; ignoring"), path, (long int)bytes_to_read, (long int)*bytes_actually_read); @@ -491,6 +491,7 @@ gboolean rm_hasher_task_hash(RmHasherTask *task, char *path, guint64 start_offse if(bytes_read_out != NULL) { *bytes_read_out = bytes_read; + } return success; diff --git a/lib/preprocess.c b/lib/preprocess.c index 1167e79e..a0f4b84f 100644 --- a/lib/preprocess.c +++ b/lib/preprocess.c @@ -148,13 +148,22 @@ static guint rm_path_double_hash(const RmPathDoubleKey *key) { static ino_t rm_path_parent_inode(RmFile *file) { char parent_path[PATH_MAX]; - rm_trie_build_path((RmTrie *)&file->session->cfg->file_trie, file->folder->parent, - parent_path, PATH_MAX); + rm_trie_build_path( + (RmTrie *)&file->session->cfg->file_trie, + file->folder->parent, + parent_path, + PATH_MAX + ); + RmStat stat_buf; int retval = rm_sys_stat(parent_path, &stat_buf); if(retval == -1) { - rm_log_error_line("Failed to get parent path: stat failed: %s", - g_strerror(errno)); + RM_DEFINE_PATH(file); + rm_log_error_line( + "Failed to get parent path of %s: stat failed: %s", + file_path, + g_strerror(errno) + ); return 0; } diff --git a/lib/replay.c b/lib/replay.c index fad16294..2cf604ee 100644 --- a/lib/replay.c +++ b/lib/replay.c @@ -190,6 +190,12 @@ static RmFile *rm_parrot_try_next(RmParrot *polly) { file->digest = rm_digest_new(RM_DIGEST_EXT, 0); file->free_digest = true; + // stat() reports directories as size zero. + // Fix this by actually using the size field in this example. 
+ if (type == RM_LINT_TYPE_DUPE_DIR_CANDIDATE && stat_info->st_mode & S_IFDIR) { + file->actual_file_size = json_object_get_int_member(object, "size"); + } + if(file->is_original) { polly->last_original = file; } @@ -246,6 +252,13 @@ static bool rm_parrot_check_size(RmCfg *cfg, RmFile *file) { return true; } + if(file->lint_type != RM_LINT_TYPE_DUPE_CANDIDATE && + file->lint_type != RM_LINT_TYPE_DUPE_DIR_CANDIDATE) { + // Non-lint files always count as good size. + // See: https://github.com/sahib/rmlint/issues/368 + return true; + } + return ((cfg->minsize == (RmOff)-1 || cfg->minsize <= file->actual_file_size) && (cfg->maxsize == (RmOff)-1 || file->actual_file_size <= cfg->maxsize)); } @@ -403,7 +416,8 @@ static void rm_parrot_update_stats(RmParrotCage *cage, RmFile *file) { RmSession *session = cage->session; session->total_files += 1; - if(file->lint_type == RM_LINT_TYPE_DUPE_CANDIDATE) { + if(file->lint_type == RM_LINT_TYPE_DUPE_CANDIDATE || + file->lint_type == RM_LINT_TYPE_DUPE_DIR_CANDIDATE) { session->dup_group_counter += file->is_original; if(!file->is_original) { session->dup_counter += 1; @@ -513,6 +527,7 @@ bool rm_parrot_cage_load(RmParrotCage *cage, const char *json_path, bool is_pref rm_parrot_check_types(cfg, file) && rm_parrot_check_crossdev(polly, file) && rm_parrot_check_path(polly, file, file_path))) { rm_file_destroy(file); + rm_log_debug("[nope]\n"); continue; } diff --git a/lib/session.c b/lib/session.c index 3ccaa33e..df7f2880 100644 --- a/lib/session.c +++ b/lib/session.c @@ -32,6 +32,7 @@ #include "preprocess.h" #include "session.h" #include "traverse.h" +#include "xattr.h" #if HAVE_BTRFS_H #include @@ -166,33 +167,6 @@ void rm_session_acknowledge_abort(const gint abort_count) { g_mutex_unlock(&m); } -/** - * *********** dedupe session main ************ - **/ -int rm_session_dedupe_main(RmCfg *cfg) { -#if HAVE_FIDEDUPERANGE || HAVE_BTRFS_H - g_assert(cfg->path_count == g_slist_length(cfg->paths)); - if(cfg->path_count != 2) { - 
rm_log_error(_("Usage: rmlint --dedupe [-r] [-v|V] source dest\n")); - return EXIT_FAILURE; - } - - g_assert(cfg->paths); - RmPath *dest = cfg->paths->data; - g_assert(cfg->paths->next); - RmPath *source = cfg->paths->next->data; - rm_log_debug_line("Cloning %s -> %s", source->path, dest->path); - - int source_fd = rm_sys_open(source->path, O_RDONLY); - if(source_fd < 0) { - rm_log_error_line(_("dedupe: failed to open source file")); - return EXIT_FAILURE; - } - - struct stat source_stat; - fstat(source_fd, &source_stat); - gint64 bytes_deduped = 0; - /* FIDEDUPERANGE supercedes the btrfs-only BTRFS_IOC_FILE_EXTENT_SAME as of Linux 4.5 and * should work for ocfs2 and xfs as well as btrfs. We should still support the older * btrfs ioctl so that this still works on Linux 4.2 to 4.4. The two ioctl's are @@ -203,7 +177,6 @@ int rm_session_dedupe_main(RmCfg *cfg) { * because the BTRFS_IOC_FILE_EXTENT_SAME is decided at compile time... */ -/* clang-format off */ #if HAVE_FIDEDUPERANGE # define _DEDUPE_IOCTL_NAME "FIDEDUPERANGE" # define _DEDUPE_IOCTL FIDEDUPERANGE @@ -227,7 +200,51 @@ int rm_session_dedupe_main(RmCfg *cfg) { # define _FILE_DEDUPE_RANGE_INFO btrfs_ioctl_same_extent_info # define _MIN_LINUX_SUBVERSION 2 #endif -/* clang-format on */ + +/** + * *********** dedupe session main ************ + **/ +int rm_session_dedupe_main(RmCfg *cfg) { +#if HAVE_FIDEDUPERANGE || HAVE_BTRFS_H + g_assert(cfg->path_count == g_slist_length(cfg->paths)); + if(cfg->path_count != 2) { + rm_log_error(_("Usage: rmlint --dedupe [-r] [-v|V] source dest\n")); + return EXIT_FAILURE; + } + + g_assert(cfg->paths); + RmPath *dest = cfg->paths->data; + g_assert(cfg->paths->next); + RmPath *source = cfg->paths->next->data; + rm_log_debug_line("Cloning %s -> %s", source->path, dest->path); + + if(cfg->dedupe_check_xattr) { + // Check if we actually need to deduplicate. 
+ // This utility will write a value to the extended attributes + // of the file so we know that we do not need to do it again + // next time. This is supposed to avoid disk thrashing. + // (See also: https://github.com/sahib/rmlint/issues/349) + if(rm_xattr_is_deduplicated(dest->path, cfg->follow_symlinks)) { + rm_log_debug_line("Already deduplicated according to xattr!"); + return EXIT_SUCCESS; + } + } + + // Also use --is-reflink on both files before doing extra work: + if(rm_util_link_type(source->path, dest->path) == RM_LINK_REFLINK) { + rm_log_debug_line("Already an exact reflink!"); + return EXIT_SUCCESS; + } + + int source_fd = rm_sys_open(source->path, O_RDONLY); + if(source_fd < 0) { + rm_log_error_line(_("dedupe: failed to open source file")); + return EXIT_FAILURE; + } + + struct stat source_stat; + fstat(source_fd, &source_stat); + gint64 bytes_deduped = 0; /* a poorly-documented limit for dedupe ioctl's */ static const gint64 max_dedupe_chunk = 16 * 1024 * 1024; @@ -299,8 +316,8 @@ int rm_session_dedupe_main(RmCfg *cfg) { break; } } else if(dedupe.info.status != 0) { - ret=-dedupe.info.status; - errno=ret; + ret = -dedupe.info.status; + errno = ret; break; } else if(dedupe.info.bytes_deduped == 0) { break; @@ -324,23 +341,13 @@ int rm_session_dedupe_main(RmCfg *cfg) { rm_sys_close(dedupe.info._DEST_FD); if(bytes_deduped == source_stat.st_size) { + if(cfg->dedupe_check_xattr && !cfg->dedupe_readonly) { + rm_xattr_mark_deduplicated(dest->path, cfg->follow_symlinks); + } + return EXIT_SUCCESS; } -#undef _DEDUPE_IOCTL_NAME -#undef _DEDUPE_IOCTL -#undef _DEST_FD -#undef _SRC_OFFSET -#undef _DEST_OFFSET -#undef _SRC_LENGTH -#undef _DATA_DIFFERS -#undef _DEST_FD -#undef _FILE_DEDUPE_RANGE -#undef _FILE_DEDUPE_RANGE_INFO -#undef _MIN_LINUX_SUBVERSION -#undef MAX_DEDUPE_CHUNK -#undef MIN_DEDUPE_CHUNK - #else (void)cfg; rm_log_error_line(_("rmlint was not compiled with file cloning support.")) @@ -366,13 +373,15 @@ int rm_session_is_reflink_main(RmCfg *cfg) { 
g_assert(cfg->path_count == g_slist_length(cfg->paths)); if(cfg->path_count != 2) { - rm_log_error(_("Usage: rmlint --is-clone [-v|V] file1 file2\n")); + rm_log_error(_("Usage: rmlint --is-reflink [-v|V] file1 file2\n")); return EXIT_FAILURE; } g_assert(cfg->paths); + RmPath *a = cfg->paths->data; g_assert(cfg->paths->next); + RmPath *b = cfg->paths->next->data; rm_log_debug_line("Testing if %s is clone of %s", a->path, b->path); @@ -390,5 +399,6 @@ int rm_session_is_reflink_main(RmCfg *cfg) { default: break; } + return result; } diff --git a/lib/traverse.c b/lib/traverse.c index 73f4b9f2..e8c6aef7 100644 --- a/lib/traverse.c +++ b/lib/traverse.c @@ -103,6 +103,32 @@ static void rm_trav_buffer_free(RmTravBuffer *self) { // ACTUAL WORK HERE // ////////////////////// +// Symbolic links may contain relative paths, that are relative to the symbolic +// link location. When using readlink(), those relative paths are returned but +// may not make sense depending on where rmlint was run. This function takes +// the original link_path and the (potentially relative) path it is pointing to +// and constructs an absolute path out of it. +// +// See also: https://github.com/sahib/rmlint/issues/333 +static char *rm_traverse_rel_realpath(char *link_path, char *pointing_to) { + if(pointing_to == NULL) { + return NULL; + } + + // Most links will already be absolute. 
+ if(g_path_is_absolute(pointing_to)) { + return realpath(pointing_to, NULL); + } + + char *link_dir_path = g_path_get_dirname(link_path); + char *full_path = g_build_path(G_DIR_SEPARATOR_S, link_dir_path, pointing_to, NULL); + char *clean_path = realpath(full_path, NULL); + + g_free(link_dir_path); + g_free(full_path); + return clean_path; +} + static void rm_traverse_file(RmTravSession *trav_session, RmStat *statp, char *path, bool is_prefd, unsigned long path_index, RmLintType file_type, bool is_symlink, bool is_hidden, @@ -153,11 +179,15 @@ static void rm_traverse_file(RmTravSession *trav_session, RmStat *statp, char *p if(is_symlink && cfg->follow_symlinks) { char *new_path_buf = g_malloc0(PATH_MAX + 1); if(readlink(path, new_path_buf, PATH_MAX) == -1) { - rm_log_perror("failed to follow symbolic link"); + rm_log_warning_line("failed to follow symbolic link of %s: %s", path, g_strerror(errno)); + g_free(new_path_buf); return; } - path = new_path_buf; + char *resolved_path = rm_traverse_rel_realpath(path, new_path_buf); + g_free(new_path_buf); + + path = resolved_path; is_symlink = false; path_needs_free = true; } diff --git a/lib/treemerge.c b/lib/treemerge.c index d7818a82..7960c1dc 100644 --- a/lib/treemerge.c +++ b/lib/treemerge.c @@ -736,9 +736,32 @@ static int rm_tm_cmp_directory_groups(GQueue *a, GQueue *b) { } static int rm_tm_hidden_file(RmFile *file, _UNUSED gpointer user_data) { + RM_DEFINE_PATH(file); return file->is_hidden; } +static void rm_tm_filter_hidden_directories(GQueue *directories) { + GQueue *kill_list = g_queue_new(); + + for(GList *iter = directories->head; iter; iter = iter->next) { + RmDirectory *directory = iter->data; + if(!g_str_has_prefix(rm_util_basename(directory->dirname), ".")) { + continue; + } + + g_queue_push_tail(kill_list, directory); + } + + // Actually remove the directories: + // (If the directory list is only one directory long now, + // it is being filtered later in the process) + for(GList *iter = kill_list->head; 
iter; iter = iter->next) { + g_queue_remove(directories, iter->data); + } + + g_queue_free(kill_list); +} + static void rm_tm_extract(RmTreeMerger *self) { /* Iterate over all directories per hash (which are same therefore) */ RmCfg *cfg = self->session->cfg; @@ -774,6 +797,12 @@ static void rm_tm_extract(RmTreeMerger *self) { /* Sort the RmDirectory list by their path depth, lowest depth first */ g_queue_sort(dir_list, (GCompareDataFunc)rm_tm_sort_paths, self); + /* If no --hidden is given, do not display top-level directories + * that are hidden. If needed, filter them beforehand. */ + if(cfg->partial_hidden) { + rm_tm_filter_hidden_directories(dir_list); + } + /* Output the directories and mark their children to prevent * duplicate directory reports in lower levels. */ @@ -857,7 +886,7 @@ static void rm_tm_extract(RmTreeMerger *self) { GQueue *file_list = NULL; while(g_hash_table_iter_next(&iter, NULL, (void **)&file_list)) { if(self->session->cfg->partial_hidden) { - /* with --partial-hidden we do not want to output */ + /* with --partial-hidden we do not want to output left overs that are hidden */ rm_util_queue_foreach_remove(file_list, (RmRFunc)rm_tm_hidden_file, NULL); } diff --git a/lib/utilities.c b/lib/utilities.c index d93ece23..817bd3c9 100644 --- a/lib/utilities.c +++ b/lib/utilities.c @@ -271,7 +271,8 @@ gpointer rm_util_slist_pop(GSList **list, GMutex *lock) { } /* checks uid and gid; returns 0 if both ok, else RM_LINT_TYPE_ corresponding * - * to RmFile->filter types */ + * to RmFile->filter types + * */ int rm_util_uid_gid_check(RmStat *statp, RmUserList *userlist) { bool has_gid = 1, has_uid = 1; if(!rm_userlist_contains(userlist, statp->st_uid, statp->st_gid, &has_uid, @@ -732,7 +733,7 @@ static bool rm_mounts_create_tables(RmMountTable *self, bool force_fiemap) { * with tmpfs and with nfs mounts. Try to handle a few such cases. 
* */ if(rm_mounts_is_ramdisk(entry->fsname)) { - strncpy(diskname, entry->fsname, sizeof(diskname)); + strncpy(diskname, entry->fsname, sizeof(diskname)-1); is_rotational = false; whole_disk = stat_buf_folder.st_dev; } else if((nfs_marker = strstr(entry->fsname, ":/")) != NULL) { @@ -761,7 +762,7 @@ static bool rm_mounts_create_tables(RmMountTable *self, bool force_fiemap) { rm_log_debug_line(RED "devno_to_wholedisk failed for %s" RESET, entry->fsname); whole_disk = stat_buf_dev.st_dev; - strncpy(diskname, entry->fsname, sizeof(diskname)); + strncpy(diskname, entry->fsname, sizeof(diskname)-1); is_rotational = false; } else { is_rotational = rm_mounts_is_rotational_blockdev(diskname); @@ -1012,7 +1013,7 @@ static struct fiemap *rm_offset_get_fiemap(int fd, const int n_extents, * the next non-contiguous extent (fragment) is encountered and writes the corresponding * file offset to &file_offset_next. * */ -RmOff rm_offset_get_from_fd(int fd, RmOff file_offset, RmOff *file_offset_next) { +RmOff rm_offset_get_from_fd(int fd, RmOff file_offset, RmOff *file_offset_next, bool *is_last) { RmOff result = 0; bool done = FALSE; bool first = TRUE; @@ -1063,6 +1064,10 @@ RmOff rm_offset_get_from_fd(int fd, RmOff file_offset, RmOff *file_offset_next) if(fm_ext.fe_flags & FIEMAP_EXTENT_LAST) { done = TRUE; + + if(is_last != NULL) { + *is_last = TRUE; + } } if(fm_ext.fe_length <= 0) { @@ -1093,7 +1098,7 @@ RmOff rm_offset_get_from_path(const char *path, RmOff file_offset, rm_log_info("Error opening %s in rm_offset_get_from_path\n", path); return 0; } - RmOff result = rm_offset_get_from_fd(fd, file_offset, file_offset_next); + RmOff result = rm_offset_get_from_fd(fd, file_offset, file_offset_next, NULL); rm_sys_close(fd); return result; } @@ -1101,7 +1106,7 @@ RmOff rm_offset_get_from_path(const char *path, RmOff file_offset, #else /* Probably FreeBSD */ RmOff rm_offset_get_from_fd(_UNUSED int fd, _UNUSED RmOff file_offset, - _UNUSED RmOff *file_offset_next) { + _UNUSED RmOff 
*file_offset_next, _UNUSED bool *is_last) { return 0; } @@ -1164,7 +1169,7 @@ RmLinkType rm_util_link_type(char *path1, char *path2) { } RmStat stat1; - int stat_state = rm_sys_stat(path1, &stat1); + int stat_state = rm_sys_lstat(path1, &stat1); if(stat_state == -1) { rm_log_perrorf("rm_util_link_type: Unable to stat file %s", path1); RM_RETURN(RM_LINK_ERROR); @@ -1189,7 +1194,7 @@ RmLinkType rm_util_link_type(char *path1, char *path2) { } RmStat stat2; - stat_state = rm_sys_stat(path2, &stat2); + stat_state = rm_sys_lstat(path2, &stat2); if(stat_state == -1) { rm_log_perrorf("rm_util_link_type: Unable to stat file %s", path2); RM_RETURN(RM_LINK_ERROR); @@ -1227,15 +1232,33 @@ RmLinkType rm_util_link_type(char *path1, char *path2) { } } + /* If both are symbolic links we do not follow them */ + if(S_ISLNK(stat1.st_mode) || S_ISLNK(stat2.st_mode)) { + RM_RETURN(RM_LINK_SYMLINK); + } + #if HAVE_FIEMAP RmOff logical_current = 0; + bool is_last_1 = false; + bool is_last_2 = false; + bool at_least_one_checked = false; + while(!rm_session_was_aborted()) { RmOff logical_next_1 = 0; RmOff logical_next_2 = 0; - RmOff physical_1 = rm_offset_get_from_fd(fd1, logical_current, &logical_next_1); - RmOff physical_2 = rm_offset_get_from_fd(fd2, logical_current, &logical_next_2); + + RmOff physical_1 = rm_offset_get_from_fd(fd1, logical_current, &logical_next_1, &is_last_1); + RmOff physical_2 = rm_offset_get_from_fd(fd2, logical_current, &logical_next_2, &is_last_2); + + if(is_last_1 != is_last_2) { + RM_RETURN(RM_LINK_NONE); + } + + if(is_last_1 && is_last_2 && at_least_one_checked) { + RM_RETURN(RM_LINK_REFLINK); + } if(physical_1 != physical_2) { #if _RM_OFFSET_DEBUG @@ -1282,6 +1305,7 @@ RmLinkType rm_util_link_type(char *path1, char *path2) { } logical_current = logical_next_1; + at_least_one_checked = true; } RM_RETURN(RM_LINK_ERROR); diff --git a/lib/utilities.h b/lib/utilities.h index b2572044..43457367 100644 --- a/lib/utilities.h +++ b/lib/utilities.h @@ -48,8 +48,8 @@ 
typedef enum RmLinkType { RM_LINK_SAME_FILE = 6, RM_LINK_PATH_DOUBLE = 7, RM_LINK_HARDLINK = 8, - RM_LINK_XDEV = 9, - RM_LINK_SYMLINK = 10, + RM_LINK_SYMLINK = 9, + RM_LINK_XDEV = 10, RM_LINK_ERROR = 11, } RmLinkType; @@ -416,7 +416,7 @@ bool rm_mounts_can_reflink(RmMountTable *self, dev_t source, dev_t dest); * * @return the physical offset starting from the disk. */ -RmOff rm_offset_get_from_fd(int fd, RmOff file_offset, RmOff *file_offset_next); +RmOff rm_offset_get_from_fd(int fd, RmOff file_offset, RmOff *file_offset_next, bool *is_last); /** * @brief Lookup the physical offset of a file path at any given offset. diff --git a/lib/xattr.c b/lib/xattr.c index 35dc9c32..8fc9ac4c 100644 --- a/lib/xattr.c +++ b/lib/xattr.c @@ -49,34 +49,85 @@ #if RM_IS_APPLE -ssize_t rm_sys_getxattr(const char *path, const char *name, void *value, size_t size) { - return getxattr(path, name, value, size, 0, 0); +ssize_t rm_sys_getxattr(const char *path, const char *name, void *value, size_t size, bool follow_link) { + int flags = 0; + if(!follow_link) { + flags |= XATTR_NOFOLLOW; + } + + return getxattr(path, name, value, size, 0, flags); } ssize_t rm_sys_setxattr( - const char *path, const char *name, const void *value, size_t size, int flags) { + const char *path, const char *name, const void *value, size_t size, int flags, bool follow_link) { + if(!follow_link) { + flags |= XATTR_NOFOLLOW; + } + return setxattr(path, name, value, size, 0, flags); } -int rm_sys_removexattr(const char *path, const char *name) { - return removexattr(path, name, 0); +int rm_sys_removexattr(const char *path, const char *name, bool follow_link) { + int flags = 0; + if(!follow_link) { + flags |= XATTR_NOFOLLOW; + } + + return removexattr(path, name, flags); +} + +int rm_sys_listxattr(const char *path, char *out, size_t out_size, bool follow_link) { + int flags = 0; + if(!follow_link) { + flags |= XATTR_NOFOLLOW; + } + + return listxattr(path, out, out_size, flags); } #else -ssize_t 
rm_sys_getxattr(const char *path, const char *name, void *value, size_t size) { +ssize_t rm_sys_getxattr(const char *path, const char *name, void *value, size_t size, bool follow_link) { + if(!follow_link) { + #if HAVE_LXATTR + return lgetxattr(path, name, value, size); + #endif + } + return getxattr(path, name, value, size); } ssize_t rm_sys_setxattr( - const char *path, const char *name, const void *value, size_t size, int flags) { + const char *path, const char *name, const void *value, size_t size, int flags, bool follow_link) { + if(!follow_link) { + #if HAVE_LXATTR + return lsetxattr(path, name, value, size, flags); + #endif + } + return setxattr(path, name, value, size, flags); } -int rm_sys_removexattr(const char *path, const char *name) { +int rm_sys_removexattr(const char *path, const char *name, bool follow_link) { + if(!follow_link) { + #if HAVE_LXATTR + return lremovexattr(path, name); + #endif + } + return removexattr(path, name); } +int rm_sys_listxattr(const char *path, char *out, size_t out_size, bool follow_link) { + if(!follow_link) { + #if HAVE_LXATTR + return llistxattr(path, out, out_size); + #endif + } + + return listxattr(path, out, out_size); +} + #endif static int rm_xattr_build_key(RmSession *session, @@ -109,13 +160,13 @@ static int rm_xattr_build_cksum(RmFile *file, char *buf, size_t buf_size) { return rm_digest_hexstring(file->digest, buf); } -static int rm_xattr_is_fail(const char *name, int rc) { +static int rm_xattr_is_fail(const char *name, char *path, int rc) { if(rc != -1) { return 0; } if(errno != ENOTSUP && errno != ENODATA) { - rm_log_perror(name); + rm_log_warning_line("failed to %s for %s: %s", name, path, g_strerror(errno)); return errno; } @@ -125,25 +176,29 @@ static int rm_xattr_is_fail(const char *name, int rc) { static int rm_xattr_set(RmFile *file, const char *key, const char *value, - size_t value_size) { + size_t value_size, + bool follow_link) { RM_DEFINE_PATH(file); - return rm_xattr_is_fail("setxattr", - 
rm_sys_setxattr(file_path, key, value, value_size, 0)); + return rm_xattr_is_fail("setxattr", file_path, + rm_sys_setxattr(file_path, key, value, value_size, 0, follow_link)); } static int rm_xattr_get(RmFile *file, const char *key, char *out_value, - size_t value_size) { + size_t value_size, + bool follow_link) { RM_DEFINE_PATH(file); - return rm_xattr_is_fail("getxattr", - rm_sys_getxattr(file_path, key, out_value, value_size)); + return rm_xattr_is_fail("getxattr", file_path, + rm_sys_getxattr(file_path, key, out_value, value_size, follow_link)); } -static int rm_xattr_del(RmFile *file, const char *key) { +static int rm_xattr_del(RmFile *file, const char *key, bool follow_link) { RM_DEFINE_PATH(file); - return rm_xattr_is_fail("removexattr", rm_sys_removexattr(file_path, key)); + return rm_xattr_is_fail( + "removexattr", file_path, + rm_sys_removexattr(file_path, key, follow_link)); } #endif @@ -167,13 +222,14 @@ int rm_xattr_write_hash(RmFile *file, RmSession *session) { int timestamp_bytes = 0; + bool follow = session->cfg->follow_symlinks; if(rm_xattr_build_key(session, "cksum", cksum_key, sizeof(cksum_key)) || rm_xattr_build_key(session, "mtime", mtime_key, sizeof(mtime_key)) || rm_xattr_build_cksum(file, cksum_hex_str, sizeof(cksum_hex_str)) <= 0 || - rm_xattr_set(file, cksum_key, cksum_hex_str, sizeof(cksum_hex_str)) || + rm_xattr_set(file, cksum_key, cksum_hex_str, sizeof(cksum_hex_str), follow) || (timestamp_bytes = snprintf( timestamp, sizeof(timestamp), "%.9f", file->mtime)) == -1 || - rm_xattr_set(file, mtime_key, timestamp, timestamp_bytes)) { + rm_xattr_set(file, mtime_key, timestamp, timestamp_bytes, follow)) { return errno; } #endif @@ -189,27 +245,31 @@ gboolean rm_xattr_read_hash(RmFile *file, RmSession *session) { return FALSE; } - char cksum_key[64] = {0}, mtime_key[64] = {0}, mtime_buf[64] = {0}, + char cksum_key[64] = {0}, + mtime_key[64] = {0}, + mtime_buf[64] = {0}, cksum_hex_str[512] = {0}; - memset(cksum_hex_str, '0', 
sizeof(cksum_hex_str)); + memset(cksum_hex_str, 0, sizeof(cksum_hex_str)); cksum_hex_str[sizeof(cksum_hex_str) - 1] = 0; - if(0 || rm_xattr_build_key(session, "cksum", cksum_key, sizeof(cksum_key)) || - rm_xattr_get(file, cksum_key, cksum_hex_str, sizeof(cksum_hex_str) - 1) || + bool follow = session->cfg->follow_symlinks; + if(rm_xattr_build_key(session, "cksum", cksum_key, sizeof(cksum_key)) || + rm_xattr_get(file, cksum_key, cksum_hex_str, sizeof(cksum_hex_str) - 1, follow) || rm_xattr_build_key(session, "mtime", mtime_key, sizeof(mtime_key)) || - rm_xattr_get(file, mtime_key, mtime_buf, sizeof(mtime_buf) - 1)) { + rm_xattr_get(file, mtime_key, mtime_buf, sizeof(mtime_buf) - 1, follow)) { return FALSE; } - if(cksum_hex_str[0] == 0 || strcmp(cksum_hex_str, "")==0) { + if(cksum_hex_str[0] == 0 || mtime_buf[0] == 0) { return FALSE; } - if(FLOAT_SIGN_DIFF(g_ascii_strtod(mtime_buf, NULL), file->mtime, MTIME_TOL) < 0) { + if(FLOAT_SIGN_DIFF(g_ascii_strtod(mtime_buf, NULL), file->mtime, MTIME_TOL) == 0) { /* Data is too old and not useful, autoclean it */ + RM_DEFINE_PATH(file); rm_log_debug_line("Checksum too old for %s, %" G_GINT64_FORMAT " < %" G_GINT64_FORMAT, - file->folder->basename, + file_path, g_ascii_strtoll(mtime_buf, NULL, 10), (gint64)file->mtime); rm_xattr_clear_hash(file, session); @@ -239,7 +299,7 @@ int rm_xattr_clear_hash(RmFile *file, RmSession *session) { continue; } - if(rm_xattr_del(file, key)) { + if(rm_xattr_del(file, key, session->cfg->follow_symlinks)) { error = errno; } } @@ -249,3 +309,172 @@ int rm_xattr_clear_hash(RmFile *file, RmSession *session) { return EXIT_FAILURE; #endif } + +GHashTable *rm_xattr_list(const char *path, bool follow_symlinks) { + const size_t buf_size = 4096; + const size_t val_size = 1024; + const char prefix[13] = "user.rmlint."; + + char buf[buf_size]; + memset(buf, 0, buf_size); + + int rc = rm_sys_listxattr(path, buf, buf_size-1, follow_symlinks); + if(rc < 0) { + rm_xattr_is_fail("listxattr", (char *)path, rc); + 
return NULL; + } + + GHashTable *map = g_hash_table_new_full( + g_str_hash, g_str_equal, + g_free, g_free + ); + + bool failed = false; + char *curr = buf; + while(true) { + size_t n = buf_size - (curr - buf); + if(n <= 0) { + break; + } + + char *next = memchr(curr, 0, n); + if(next == NULL) { + break; + } + + size_t key_len = next-curr; + if(key_len <= 0) { + break; + } + + if(strncmp(curr, prefix, MIN(key_len, sizeof(prefix) -1)) != 0) { + // Skip this key and save some memory. Not one of ours. + curr = next + 1; + continue; + } + + char *key = g_strndup(curr, key_len); + char *val = g_malloc0(val_size); + + rc = rm_sys_getxattr(path, key, val, val_size, follow_symlinks); + if(rc < 0) { + rm_xattr_is_fail("getxattr", (char *)path, rc); + g_free(key); + g_free(val); + failed = true; + break; + } + + g_hash_table_insert(map, key, val); + curr = next + 1; + } + + if(failed) { + g_hash_table_destroy(map); + map = NULL; + } + + return map; +} + +static void rm_xattr_change_subkey(char *key, char *sub_key) { + size_t key_len = strlen(key); + size_t sub_key_len = strlen(sub_key); + + // This method is only thought to be used for our xattr keys. + // They happen to have all 5 byte long sub keys by chance. + // So take the chance to save some allocations. 
+ if(sub_key_len != 5) { + return; + } + + strcpy(&key[key_len - sub_key_len], sub_key); +} + +bool rm_xattr_is_deduplicated(const char *path, bool follow_symlinks) { + g_assert(path); + + RmStat stat_buf; + if(rm_sys_stat(path, &stat_buf) < 0) { + rm_log_warning_line("failed to check dedupe state of %s: %s", path, g_strerror(errno)); + return EXIT_FAILURE; + } + + bool result = false; + char *key = NULL, *value = NULL; + GHashTable *map = rm_xattr_list(path, follow_symlinks); + GHashTableIter iter; + + g_hash_table_iter_init(&iter, map); + while(g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) { + if(!g_str_has_suffix(key, ".mtime")) { + continue; + } + + gdouble mtime = g_ascii_strtod(value, NULL); + if(FLOAT_SIGN_DIFF(mtime, stat_buf.st_mtime, MTIME_TOL) != 0) { + continue; + } + + rm_xattr_change_subkey(key, "cksum"); + char *cksum = g_hash_table_lookup(map, key); + if(cksum == NULL) { + continue; + } + + rm_xattr_change_subkey(key, "dedup"); + char *dedup = g_hash_table_lookup(map, key); + if(dedup == NULL) { + continue; + } + + if(g_strcmp0(cksum, dedup) != 0) { + continue; + } + + result = true; + break; + } + + g_hash_table_destroy(map); + return result; +} + +int rm_xattr_mark_deduplicated(const char *path, bool follow_symlinks) { + g_assert(path); + + RmStat stat_buf; + if(rm_sys_stat(path, &stat_buf) < 0) { + rm_log_warning_line("failed to mark dedupe state of %s: %s", path, g_strerror(errno)); + return EXIT_FAILURE; + } + + int result = EXIT_FAILURE; + char *key = NULL, *value = NULL; + GHashTable *map = rm_xattr_list(path, follow_symlinks); + GHashTableIter iter; + + g_hash_table_iter_init(&iter, map); + while(g_hash_table_iter_next(&iter, (gpointer)&key, (gpointer)&value)) { + if(!g_str_has_suffix(key, ".mtime")) { + continue; + } + + gdouble mtime = g_ascii_strtod(value, NULL); + if(FLOAT_SIGN_DIFF(mtime, stat_buf.st_mtime, MTIME_TOL) != 0) { + continue; + } + + rm_xattr_change_subkey(key, "cksum"); + char *cksum = 
g_hash_table_lookup(map, key); + if(cksum == NULL) { + continue; + } + + rm_xattr_change_subkey(key, "dedup"); + result = rm_sys_setxattr(path, key, cksum, strlen(cksum), 0, follow_symlinks); + } + + g_hash_table_destroy(map); + return result; +} diff --git a/lib/xattr.h b/lib/xattr.h index 50ba98d0..8286c96c 100644 --- a/lib/xattr.h +++ b/lib/xattr.h @@ -62,4 +62,24 @@ gboolean rm_xattr_read_hash(RmFile *file, RmSession *session); */ int rm_xattr_clear_hash(RmFile *file, RmSession *session); +/** + * @brief Check if `path` was already deduplicated. + * + * @param path Path to check. + * @param follow_symlinks Whether to check if it is a symbolic link; + * @param res Where to store the result. + * + * @return true if deduplicated + */ +bool rm_xattr_is_deduplicated(const char *path, bool follow_symlinks); + +/** + * @brief Mark the file as deduplicated. + * + * @param res The result from rm_xattr_is_deduplicated; + * + * @return 0 on success, some errno on failure. + */ +int rm_xattr_mark_deduplicated(const char *path, bool follow_symlinks); + #endif diff --git a/po/de.po b/po/de.po index 60d40491..3612c4f4 100644 --- a/po/de.po +++ b/po/de.po @@ -713,7 +713,7 @@ msgid "Use more paranoid hashing" msgstr "Benutze immer paranoideren Hashalgorithmus" #: lib/cmdline.c -msgid "Do not cross mounpoints" +msgid "Do not cross mountpoints" msgstr "Überquere keine Mountpoints" #: lib/cmdline.c diff --git a/po/es.po b/po/es.po index 7b6dd5c3..97e91acf 100644 --- a/po/es.po +++ b/po/es.po @@ -696,7 +696,7 @@ msgid "Use more paranoid hashing" msgstr "Usa un ratreo más paranóico" #: lib/cmdline.c -msgid "Do not cross mounpoints" +msgid "Do not cross mountpoints" msgstr "No cruza los puntos de montaje" #: lib/cmdline.c diff --git a/po/fr.po b/po/fr.po index a8907f87..2c7175a7 100644 --- a/po/fr.po +++ b/po/fr.po @@ -108,7 +108,7 @@ msgstr "Fichier %s%s%s écrit: %s%s%s\n" #: lib/formats/stats.c #, c-format msgid "No shred stats.\n" -msgstr 
"Pas de statistiques pour shred\n" #: lib/formats/stats.c #, c-format @@ -135,7 +135,7 @@ msgstr "%s%15s%s bytes de non-doublon\n" #: lib/formats/stats.c #, c-format msgid "%s%15s%s bytes of files data actually read\n" -msgstr "%s%15s%s bytes de données de fichier réellement lus" +msgstr "%s%15s%s bytes de données de fichier réellement lus\n" #: lib/formats/stats.c #, c-format @@ -150,7 +150,7 @@ msgstr "%s%15ld%s fichiers en doublons\n" #: lib/formats/stats.c #, c-format msgid "%s%15ld%s Groups in total\n" -msgstr "%s%15ld%s groupes au total" +msgstr "%s%15ld%s groupes au total\n" #: lib/formats/stats.c #, c-format @@ -692,7 +692,7 @@ msgid "Use more paranoid hashing" msgstr "Utiliser une fonction de hachage paranoïaque" #: lib/cmdline.c -msgid "Do not cross mounpoints" +msgid "Do not cross mountpoints" msgstr "Ne pas traverser les points de montages" #: lib/cmdline.c diff --git a/po/rmlint.pot b/po/rmlint.pot index 6523c2d2..a5eb022e 100644 --- a/po/rmlint.pot +++ b/po/rmlint.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: rmlint 2.8.0\n" "Report-Msgid-Bugs-To: \n" -"POT-Creation-Date: 2018-10-29 23:47+0100\n" +"POT-Creation-Date: 2019-02-14 01:45+0100\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -674,7 +674,7 @@ msgid "Use more paranoid hashing" msgstr "" #: lib/cmdline.c -msgid "Do not cross mounpoints" +msgid "Do not cross mountpoints" msgstr "" #: lib/cmdline.c @@ -949,3 +949,21 @@ msgstr "" #, c-format msgid "No stamp file at `%s`, will create one after this run." msgstr "" + +#: lib/cmdline.c +msgid "Enable xattr based caching" +msgstr "" + +#: lib/cmdline.c +msgid "Check extended attributes to see if the file is already deduplicated" +msgstr "" + +#: lib/cmdline.c +msgid "" +"Note that not having duplicate directories enabled as lint type (e.g via -T " +"df)" +msgstr "" + +#: lib/cmdline.c +msgid "will also disable --merge-directories and trigger this warning." 
+msgstr "" diff --git a/src/rmlint.c b/src/rmlint.c index 7815cdb9..14164142 100644 --- a/src/rmlint.c +++ b/src/rmlint.c @@ -103,6 +103,7 @@ static void i18n_init(void) { int main(int argc, const char **argv) { int exit_state = EXIT_FAILURE; + RM_LOG_INIT; RmCfg cfg; rm_cfg_set_default(&cfg); diff --git a/tests/test_formatters/test_sh.py b/tests/test_formatters/test_sh.py index 65d4d5de..370d86df 100644 --- a/tests/test_formatters/test_sh.py +++ b/tests/test_formatters/test_sh.py @@ -259,3 +259,26 @@ def test_cleanup_emptydirs(shell): for dirname in names: assert (not os.path.exists(os.path.join(TESTDIR_NAME, dirname))) + + + +@with_setup(usual_setup_func, usual_teardown_func) +@parameterized([("sh", ), ("bash", ), ("dash", )]) +def test_keep_parent_timestamps(shell): + create_file('xxx', 'dir/a') + create_file('xxx', 'dir/b') + + dir_path = os.path.join(TESTDIR_NAME, 'dir') + stat_before = os.stat(dir_path) + + head, *data, footer = run_rmlint('-S a -T df -o sh:{t}/rmlint.sh'.format(t=TESTDIR_NAME)) + assert footer['duplicate_sets'] == 1 + assert footer['total_lint_size'] == 3 + assert footer['total_files'] == 2 + assert footer['duplicates'] == 1 + + sh_path = os.path.join(TESTDIR_NAME, 'rmlint.sh') + run_shell_script(shell, sh_path, "-dck") + stat_after = os.stat(dir_path) + + assert stat_before.st_mtime == stat_after.st_mtime diff --git a/tests/test_options/test_cache.py b/tests/test_options/test_cache.py index c7d93efc..bf909eb6 100644 --- a/tests/test_options/test_cache.py +++ b/tests/test_options/test_cache.py @@ -29,7 +29,7 @@ def create_files(): def check(data, write_cache): - unfinished = [p['path'] for p in data if p['type'] == 'unfinished_cksum'] + unfinished = [p['path'] for p in data if p['type'] == 'unique_file'] dupe_files = [p['path'] for p in data if p['type'] == 'duplicate_file'] dupe_trees = [p['path'] for p in data if p['type'] == 'duplicate_dir'] diff --git a/tests/test_options/test_replay.py b/tests/test_options/test_replay.py index 
3989a915..53f8d1d5 100644 --- a/tests/test_options/test_replay.py +++ b/tests/test_options/test_replay.py @@ -317,3 +317,17 @@ def test_replay_tagged_order(): assert data[1]['path'].endswith('b/2') assert data[2]['path'].endswith('a/1') assert data[3]['path'].endswith('a/2') + + +@with_setup(usual_setup_func, usual_teardown_func) +def test_replay_duplicate_directory_size(): + create_file('xxx', 'a/xxx') + create_file('xxx', 'b/xxx') + + replay_path = '/tmp/replay.json' + head, *data, footer = run_rmlint('-o json:{p} -S a'.format(p=replay_path)) + assert len(data) == 2 + + head, *data, footer = run_rmlint('--replay {p}'.format(p=replay_path)) + assert len(data) == 2 + assert footer['total_lint_size'] == 3 diff --git a/tests/test_options/test_size.py b/tests/test_options/test_size.py index 70e15027..9cbe00ac 100644 --- a/tests/test_options/test_size.py +++ b/tests/test_options/test_size.py @@ -60,3 +60,30 @@ def trigger(*args): # double min trigger('--size 10--10') + + +@with_setup(usual_setup_func, usual_teardown_func) +def test_replay_size(): + create_file('', 'empty1') + create_file('', 'empty2') + create_file('xxx', 'a/xxx') + create_file('xxx', 'b/xxx') + create_file('yyy', 'a/yyy') + create_file('yyy', 'b/yyy') + create_testdir('empty_dir') + + replay_path = '/tmp/replay.json' + head, *data, footer = run_rmlint('-o json:{p}'.format( + p=replay_path + )) + + assert len(data) == 7 + assert [e["type"] for e in data] == \ + ["emptydir"] + (["emptyfile"] * 2) + (["duplicate_file"] * 4) + + head, *data, footer = run_rmlint('--replay {p} --size 1-10B'.format( + p=replay_path + )) + + assert [e["type"] for e in data] == \ + ["emptydir"] + (["emptyfile"] * 2) + (["duplicate_file"] * 4) diff --git a/tests/utils.py b/tests/utils.py index bd14a8c2..dc23cb73 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -58,9 +58,12 @@ def keep_testdir(): return get_env_flag('RM_TS_KEEP_TESTDIR') -def create_testdir(): +def create_testdir(*extra_path): try: - os.makedirs(TESTDIR_NAME) 
+ path = TESTDIR_NAME + if extra_path: + path = os.path.join(TESTDIR_NAME, *extra_path) + os.makedirs(path) except OSError: pass @@ -186,7 +189,7 @@ def compare_json_doc(doc_a, doc_b, compare_checksum=False): for key in keys: # It's okay for unfinished checksums to have some missing fields. - if doc_a['type'] == doc_b['type'] == 'unfinished_cksum': + if doc_a['type'] == doc_b['type'] == 'unique_file': continue if doc_a[key] != doc_b[key]: