Skip to content

Commit

Permalink
Do not detect clone entries as duplicated content.
Browse files Browse the repository at this point in the history
If two entries point to the same tuple (cluster_id, blob_id), we must
not report them as duplicated content.
  • Loading branch information
mgautierfr committed Oct 25, 2023
1 parent c67ebe0 commit 92fc65e
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ class ArticleChecker

// All articles with the same hash will be recorded in the same bucket of
// this hash table.
std::map<unsigned int, std::list<zim::entry_index_type>> hash_main;
std::map<unsigned int, std::list<std::tuple<zim::entry_index_type, zim::cluster_index_type, zim::blob_index_type>>> hash_main;

zim::ConcurrentCache<std::string, bool> linkStatusCache;
};
Expand Down Expand Up @@ -387,7 +387,7 @@ void ArticleChecker::check_item(const zim::Item& item)
data = item.getData();

if(checks.isEnabled(TestType::REDUNDANT))
hash_main[adler32(data)].push_back( item.getIndex() );
hash_main[adler32(data)].push_back( {item.getIndex(), item.getClusterIndex(), item.getBlobIndex()} );

if (item.getMimetype() != "text/html")
return;
Expand Down Expand Up @@ -487,14 +487,19 @@ void ArticleChecker::detect_redundant_articles()
progress.report();
auto l = it.second;
while ( !l.empty() ) {
const auto e1 = archive.getEntryByPath(l.front());
const auto [e1_idx, e1_cluster_idx, e1_blob_idx] = l.front();
l.pop_front();
const auto e1 = archive.getEntryByPath(e1_idx);
if ( !l.empty() ) {
// The way we have constructed `l`, e1 MUST BE an item
const std::string s1 = e1.getItem().getData();
decltype(l) articlesDifferentFromE1;
for(auto other : l) {
auto e2 = archive.getEntryByPath(other);
const auto [e2_idx, e2_cluster_idx, e2_blob_idx] = other;
if (e1_cluster_idx == e2_cluster_idx && e1_blob_idx == e2_blob_idx) {
continue;
}
auto e2 = archive.getEntryByPath(e2_idx);
std::string s2 = e2.getItem().getData();
if (s1 != s2 ) {
articlesDifferentFromE1.push_back(other);
Expand Down

0 comments on commit 92fc65e

Please sign in to comment.