Skip to content

Commit

Permalink
Do not detect clone entry as duplicated content.
Browse files Browse the repository at this point in the history
If two entries point to the same tuple (cluster_id, blob_id), we must
not report them as duplicated content.
  • Loading branch information
mgautierfr authored and kelson42 committed Dec 4, 2023
1 parent 6875d84 commit 3289a6b
Showing 1 changed file with 9 additions and 4 deletions.
13 changes: 9 additions & 4 deletions src/zimcheck/checks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ class ArticleChecker

// All articles with the same hash will be recorded in the same bucket of
// this hash table.
std::map<unsigned int, std::list<zim::entry_index_type>> hash_main;
std::map<unsigned int, std::list<std::tuple<zim::entry_index_type, zim::cluster_index_type, zim::blob_index_type>>> hash_main;

zim::ConcurrentCache<std::string, bool> linkStatusCache;
};
Expand Down Expand Up @@ -387,7 +387,7 @@ void ArticleChecker::check_item(const zim::Item& item)
data = item.getData();

if(checks.isEnabled(TestType::REDUNDANT))
hash_main[adler32(data)].push_back( item.getIndex() );
hash_main[adler32(data)].push_back( {item.getIndex(), item.getClusterIndex(), item.getBlobIndex()} );

if (item.getMimetype() != "text/html")
return;
Expand Down Expand Up @@ -487,14 +487,19 @@ void ArticleChecker::detect_redundant_articles()
progress.report();
auto l = it.second;
while ( !l.empty() ) {
const auto e1 = archive.getEntryByPath(l.front());
const auto [e1_idx, e1_cluster_idx, e1_blob_idx] = l.front();
l.pop_front();
const auto e1 = archive.getEntryByPath(e1_idx);
if ( !l.empty() ) {
// The way we have constructed `l`, e1 MUST BE an item
const std::string s1 = e1.getItem().getData();
decltype(l) articlesDifferentFromE1;
for(auto other : l) {
auto e2 = archive.getEntryByPath(other);
const auto [e2_idx, e2_cluster_idx, e2_blob_idx] = other;
if (e1_cluster_idx == e2_cluster_idx && e1_blob_idx == e2_blob_idx) {
continue;

Check warning on line 500 in src/zimcheck/checks.cpp

View check run for this annotation

Codecov / codecov/patch

src/zimcheck/checks.cpp#L500

Added line #L500 was not covered by tests
}
auto e2 = archive.getEntryByPath(e2_idx);
std::string s2 = e2.getItem().getData();
if (s1 != s2 ) {
articlesDifferentFromE1.push_back(other);
Expand Down

0 comments on commit 3289a6b

Please sign in to comment.