From f4b4aeacc346f0fa3c8b5e7e320e7363c5a9a9c8 Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Wed, 22 May 2024 15:33:27 -0400
Subject: [PATCH 1/2] DEV-1125: Clean up duplicate holdings

Iterates through clusters and cleans them up one by one
---
 bin/cleanup_duplicate_holdings.rb       |  68 ++++++++++++
 spec/cleanup_duplicate_holdings_spec.rb | 140 ++++++++++++++++++++++++
 2 files changed, 208 insertions(+)
 create mode 100644 bin/cleanup_duplicate_holdings.rb
 create mode 100644 spec/cleanup_duplicate_holdings_spec.rb

diff --git a/bin/cleanup_duplicate_holdings.rb b/bin/cleanup_duplicate_holdings.rb
new file mode 100644
index 00000000..3ffc0a90
--- /dev/null
+++ b/bin/cleanup_duplicate_holdings.rb
@@ -0,0 +1,68 @@
+# frozen_string_literal: true
+
+
+Services.mongo!
+
+class CleanupDuplicateHoldings
+  LOG_INTERVAL = 60
+
+  def initialize
+    @clusters_processed = 0
+    @old_holdings_processed = 0
+    @new_holdings_processed = 0
+    @last_log_time = Time.now
+    Services.logger.info("Starting cluster deduplication")
+  end
+
+
+  def run
+    Cluster.each do |cluster|
+      Services.logger.debug("Cleaning cluster #{cluster._id}: #{cluster.ocns}")
+      old_count = cluster.holdings.count
+      new_count = remove_duplicate_holdings(cluster)
+      update_progress(old_count, new_count)
+    end
+
+    Services.logger.info("Finished cleaning clusters")
+    log_progress
+  end
+
+  private
+
+  def update_progress(old_count, new_count)
+    @clusters_processed += 1
+    @old_holdings_processed += old_count
+    @new_holdings_processed += new_count
+
+    log_progress if hasnt_logged_recently?
+  end
+
+  def log_progress
+    Services.logger.info("Processed #{@clusters_processed} clusters")
+    Services.logger.info("Processed #{@old_holdings_processed} old holdings")
+    Services.logger.info("Kept #{@new_holdings_processed} holdings")
+    @last_log_time = Time.now
+  end
+
+  def hasnt_logged_recently?
+    !@last_log_time || (Time.now - @last_log_time > LOG_INTERVAL)
+  end
+
+  # Returns the count of deduped holdings
+  def remove_duplicate_holdings(cluster)
+    cluster.holdings = dedupe_holdings(cluster)
+    cluster.save
+    cluster.holdings.count
+  end
+
+  def dedupe_holdings(cluster)
+    cluster.holdings.group_by(&:update_key).flat_map do |_update_key, holdings_group|
+      latest_date = holdings_group.map(&:date_received).max
+      holdings_group.reject { |h| h.date_received != latest_date }
+    end
+  end
+end
+
+if __FILE__ == $PROGRAM_NAME
+  CleanupDuplicateHoldings.new(ARGV).run
+end
diff --git a/spec/cleanup_duplicate_holdings_spec.rb b/spec/cleanup_duplicate_holdings_spec.rb
new file mode 100644
index 00000000..9c816962
--- /dev/null
+++ b/spec/cleanup_duplicate_holdings_spec.rb
@@ -0,0 +1,140 @@
+# frozen_string_literal: true
+
+require "spec_helper"
+
+require_relative "../bin/cleanup_duplicate_holdings"
+
+RSpec.describe CleanupDuplicateHoldings do
+  def set_blank_fields(holding, value)
+    [:n_enum=, :n_chron=, :condition=, :issn=].each do |setter|
+      holding.public_send(setter, value)
+    end
+  end
+
+  def blank_fields_holding(**kwargs)
+    build(:holding, :all_fields, **kwargs).tap { |h| set_blank_fields(h, "") }
+  end
+
+  def nil_fields_dupe_holding(h)
+    h.clone.tap do |h2|
+      set_blank_fields(h2, nil)
+      h2._id = nil
+      h2.uuid = SecureRandom.uuid
+      h2.date_received = Date.yesterday
+    end
+  end
+
+  before(:each) { Cluster.each(&:delete) }
+
+  describe "run" do
+    it "cleans up duplicate holdings" do
+      holding = blank_fields_holding
+      create(:cluster, holdings: [holding, nil_fields_dupe_holding(holding)])
+
+      described_class.new.run
+
+      expect(Cluster.first.holdings.count).to eq(1)
+    end
+
+    it "leaves non-duplicate holdings alone" do
+      holding = blank_fields_holding
+      another_holding = blank_fields_holding
+      create(:cluster, holdings: [
+        holding,
+        nil_fields_dupe_holding(holding),
+        another_holding
+      ])
+
+      described_class.new.run
+
+      cluster_holdings = Cluster.first.holdings
+      expect(cluster_holdings.length).to eq(2)
+      expect(cluster_holdings[0]).not_to eq(cluster_holdings[1])
+    end
+
+    it "cleans up duplicate holdings from multiple organizations in a cluster" do
+      umich_holding = blank_fields_holding(organization: "umich")
+      upenn_holding = blank_fields_holding(organization: "upenn")
+      create(:cluster, holdings: [
+        umich_holding,
+        upenn_holding,
+        nil_fields_dupe_holding(umich_holding),
+        nil_fields_dupe_holding(upenn_holding)
+      ])
+
+      described_class.new.run
+
+      expect(Cluster.first.holdings.count).to eq(2)
+      expect(Cluster.first.holdings.map(&:organization).uniq).to contain_exactly("umich", "upenn")
+    end
+
+    it "cleans up more than two duplicate holdings in a cluster" do
+      holding = blank_fields_holding
+      create(:cluster, holdings: [
+        holding,
+        nil_fields_dupe_holding(holding),
+        nil_fields_dupe_holding(holding)
+      ])
+
+      described_class.new.run
+
+      expect(Cluster.first.holdings.count).to eq(1)
+    end
+
+    it "cleans up multiple clusters with duplicate holdings" do
+      holding = blank_fields_holding
+      create(:cluster, holdings: [
+        holding,
+        nil_fields_dupe_holding(holding)
+      ])
+
+      holding2 = blank_fields_holding
+      create(:cluster, holdings: [
+        holding2,
+        nil_fields_dupe_holding(holding2)
+      ])
+
+      described_class.new.run
+
+      expect(Cluster.count).to eq(2)
+      Cluster.each do |c|
+        expect(c.holdings.count).to eq(1)
+      end
+    end
+
+    it "keeps the holding with the most recent date received" do
+      # By default, the factory creates the holding with today's date;
+      # the duplicate holding has yesterday's date
+      holding = blank_fields_holding
+      create(:cluster, holdings: [holding, nil_fields_dupe_holding(holding)])
+
+      described_class.new.run
+
+      expect(Cluster.first.holdings[0].date_received).to eq(Date.today)
+    end
+
+    it "logs what it's working on at DEBUG level" do
+      Services.register(:logger) { Logger.new($stdout, level: Logger::DEBUG) }
+
+      create(:cluster)
+
+      expect { described_class.new.run }.to output(/#{Cluster.first.ocns.first}/).to_stdout
+    end
+
+    it "logs how many clusters it's worked on" do
+      Services.register(:logger) { Logger.new($stdout, level: Logger::INFO) }
+      create(:cluster)
+
+      expect { described_class.new.run }.to output(/Processed 1 cluster/).to_stdout
+    end
+
+    it "logs how many holdings it's worked on" do
+      Services.register(:logger) { Logger.new($stdout, level: Logger::INFO) }
+
+      holding = blank_fields_holding
+      create(:cluster, holdings: [holding, nil_fields_dupe_holding(holding)])
+
+      expect { described_class.new.run }.to output(/Processed 2 old holdings.*Kept 1 holding/m).to_stdout
+    end
+  end
+end

From 96097f3e59ca6adbf148364129064c9fdb4c42cc Mon Sep 17 00:00:00 2001
From: Aaron Elkiss
Date: Thu, 23 May 2024 12:50:48 -0400
Subject: [PATCH 2/2] DEV-1125: fix script for duplicate holdings cleanup

* add brief documentation
* ensure it runs
---
 bin/cleanup_duplicate_holdings.rb | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/bin/cleanup_duplicate_holdings.rb b/bin/cleanup_duplicate_holdings.rb
index 3ffc0a90..03e4436d 100644
--- a/bin/cleanup_duplicate_holdings.rb
+++ b/bin/cleanup_duplicate_holdings.rb
@@ -1,8 +1,14 @@
 # frozen_string_literal: true
 
+require "services"
+require "cluster"
 
 Services.mongo!
 
+# Iterates through clusters, removing any duplicate holdings and logging its progress.
+#
+# Usage: bundle exec ruby bin/cleanup_duplicate_holdings.rb
+
 class CleanupDuplicateHoldings
   LOG_INTERVAL = 60
 
@@ -14,7 +20,6 @@ def initialize
     Services.logger.info("Starting cluster deduplication")
   end
 
-
   def run
     Cluster.each do |cluster|
       Services.logger.debug("Cleaning cluster #{cluster._id}: #{cluster.ocns}")
@@ -64,5 +69,5 @@ def dedupe_holdings(cluster)
 end
 
 if __FILE__ == $PROGRAM_NAME
-  CleanupDuplicateHoldings.new(ARGV).run
+  CleanupDuplicateHoldings.new.run
 end