From a4c4d2b9a252f65e001cdbc7e735dd67144c89b0 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 25 Oct 2023 16:41:45 +0200 Subject: [PATCH] Do not detect clone entry as duplicated content. If two entries point to the same tuple (cluster_id, blob_id), we must not report them as duplicated content. --- src/zimcheck/checks.cpp | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index 38e51931..ccafc663 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -14,6 +14,7 @@ #include #include #include +#include <optional> #include #include @@ -113,6 +114,11 @@ SortedMsgParams sortedMsgParams(const MsgParams& msgParams) return SortedMsgParams(msgParams.begin(), msgParams.end()); } +bool areAliases(const zim::Item& i1, const zim::Item& i2) +{ + return i1.getClusterIndex() == i2.getClusterIndex() && i1.getBlobIndex() == i2.getBlobIndex(); +} + } // unnamed namespace namespace JSON @@ -487,15 +493,22 @@ void ArticleChecker::detect_redundant_articles() progress.report(); auto l = it.second; while ( !l.empty() ) { - const auto e1 = archive.getEntryByPath(l.front()); + // The way we have constructed `l`, e1 MUST BE an item + const auto e1 = archive.getEntryByPath(l.front()).getItem(); l.pop_front(); if ( !l.empty() ) { - // The way we have constructed `l`, e1 MUST BEĀ an item - const std::string s1 = e1.getItem().getData(); + std::optional<std::string> s1; decltype(l) articlesDifferentFromE1; for(auto other : l) { - auto e2 = archive.getEntryByPath(other); - std::string s2 = e2.getItem().getData(); + // The way we have constructed `l`, e2 MUST BE an item + const auto e2 = archive.getEntryByPath(other).getItem(); + if (areAliases(e1, e2)) { + continue; + } + if (!s1) { + s1 = e1.getData(); + } + std::string s2 = e2.getData(); if (s1 != s2 ) { articlesDifferentFromE1.push_back(other); continue;