From 92fc65ea6a9b6f840634eec61d0bb8d3242eb373 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 25 Oct 2023 16:41:45 +0200 Subject: [PATCH] Do not detect clone entry as duplicated content. If two entries point to the same tuple (cluster_id, blob_id), we must not report them as duplicated content. --- src/zimcheck/checks.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/zimcheck/checks.cpp b/src/zimcheck/checks.cpp index 38e51931..6c7aa691 100644 --- a/src/zimcheck/checks.cpp +++ b/src/zimcheck/checks.cpp @@ -350,7 +350,7 @@ class ArticleChecker // All article with the same hash will be recorded in the same bucket of // this hash table. - std::map> hash_main; + std::map>> hash_main; zim::ConcurrentCache linkStatusCache; }; @@ -387,7 +387,7 @@ void ArticleChecker::check_item(const zim::Item& item) data = item.getData(); if(checks.isEnabled(TestType::REDUNDANT)) - hash_main[adler32(data)].push_back( item.getIndex() ); + hash_main[adler32(data)].push_back( {item.getIndex(), item.getClusterIndex(), item.getBlobIndex()} ); if (item.getMimetype() != "text/html") return; @@ -487,14 +487,19 @@ void ArticleChecker::detect_redundant_articles() progress.report(); auto l = it.second; while ( !l.empty() ) { - const auto e1 = archive.getEntryByPath(l.front()); + const auto [e1_idx, e1_cluster_idx, e1_blob_idx] = l.front(); l.pop_front(); + const auto e1 = archive.getEntryByPath(e1_idx); if ( !l.empty() ) { // The way we have constructed `l`, e1 MUST BE an item const std::string s1 = e1.getItem().getData(); decltype(l) articlesDifferentFromE1; for(auto other : l) { - auto e2 = archive.getEntryByPath(other); + const auto [e2_idx, e2_cluster_idx, e2_blob_idx] = other; + if (e1_cluster_idx == e2_cluster_idx && e1_blob_idx == e2_blob_idx) { + continue; + } + auto e2 = archive.getEntryByPath(e2_idx); std::string s2 = e2.getItem().getData(); if (s1 != s2 ) { articlesDifferentFromE1.push_back(other);