Skip to content

Commit

Permalink
Merge pull request #291 from hathitrust/cluster-validator
Browse files Browse the repository at this point in the history
A script to identify invalid clusters in Print Holdings MongoDB
  • Loading branch information
mwarin authored Oct 13, 2023
2 parents 65963cc + 16eebf7 commit 674cfe3
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 0 deletions.
35 changes: 35 additions & 0 deletions bin/cluster_validator.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
require "fileutils"

require "cluster"
require "services"
Services.mongo!

# Goes through all clusters, checks if they are valid,
# and prints the first ocn of any invalid cluster to a file.

# Scans every Cluster and writes a dated report listing the first OCN of
# each cluster that fails validation.
#
# The report is written to
# "<Settings.local_report_path>/cluster_validator_<YYYY-MM-DD>.txt", so a
# second run on the same day overwrites the earlier report.
class ClusterValidator
  # @return [String] path of the report file written by #run
  attr_reader :output_path

  # Works out today's report path and makes sure its directory exists.
  def initialize
    report_dir = Settings.local_report_path
    FileUtils.mkdir_p(report_dir)
    datestamp = Time.now.strftime("%Y-%m-%d")
    # File.join avoids a doubled separator when the configured path ends in "/".
    @output_path = File.join(report_dir, "cluster_validator_#{datestamp}.txt")
  end

  # Writes the report: a "#" header line, one line per invalid cluster
  # (holding that cluster's first OCN), and a "#" footer line.
  # Also announces the output path on stdout.
  def run
    puts "Writing to #{output_path}"
    File.open(output_path, "w") do |outf|
      outf.puts "# These are ocns of invalid clusters:"
      Cluster.each do |cluster|
        next if cluster.valid?

        outf.puts(cluster.ocns.first)
      end
      outf.puts "# Done"
    end
  end
end

# Run the validator only when this file is executed directly,
# not when it is required (e.g. by the spec).
ClusterValidator.new.run if __FILE__ == $PROGRAM_NAME
53 changes: 53 additions & 0 deletions spec/cluster_validator_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# frozen_string_literal: true

require "spec_helper"
require "loader/cluster_loader"
require_relative "../bin/cluster_validator"

RSpec.describe ClusterValidator do
  let(:cluster_validator) { described_class.new }
  # The output file always has 2 lines, header and footer, even if no body.
  # So if the output file has 2 lines it is "empty" for the purposes of these tests.
  let(:empty_file_line_count) { 2 }
  let(:one_invalid_cluster_line_count) { 3 }
  # The fixtures differ only in commitments.phase (0 in valid, 999 in invalid).
  let(:valid_cluster_fixt) { fixture("single_cluster_valid.json") }
  let(:invalid_cluster_fixt) { fixture("single_cluster_invalid.json") }

  before(:each) do
    Cluster.collection.find.delete_many
    # The output path only varies by date, so a report left behind by another
    # example (or an earlier run on the same day) must be removed; otherwise
    # the "file does not exist yet" expectation below depends on test order.
    stale_report = cluster_validator.output_path
    File.delete(stale_report) if File.exist?(stale_report)
  end

  # Runs the validator and returns the lines of the report it wrote.
  # Uses the same instance for running and for locating the report, so the
  # path that is read is always the path that was written.
  def get_output_lines
    cluster_validator.run
    File.read(cluster_validator.output_path).split("\n")
  end

  it "makes an outfile when it runs" do
    expect(File.exist?(cluster_validator.output_path)).to be false
    cluster_validator.run
    expect(File.exist?(cluster_validator.output_path)).to be true
  end

  it "makes an empty-ish outfile if there are no clusters" do
    # empty-ish meaning it'll only have the header and footer, which begin with "#".
    lines = get_output_lines
    expect(lines.count).to eq empty_file_line_count
    expect(lines[0]).to start_with("#")
    expect(lines[1]).to start_with("#")
  end

  it "does NOT count valid clusters" do
    # Load one valid cluster, verify it is there and valid,
    # and verify it does not count towards the report.
    Loader::ClusterLoader.new.load(valid_cluster_fixt)
    expect(Cluster.count).to eq 1
    expect(Cluster.first.valid?).to be true
    expect(get_output_lines.count).to eq empty_file_line_count
  end

  it "DOES count invalid clusters" do
    # Load one invalid cluster, verify it is there and invalid,
    # and verify it does count towards the report.
    Loader::ClusterLoader.new.load(invalid_cluster_fixt)
    expect(Cluster.count).to eq 1
    expect(Cluster.first.valid?).to be false
    expect(get_output_lines.count).to eq one_invalid_cluster_line_count
  end
end
61 changes: 61 additions & 0 deletions spec/fixtures/single_cluster_invalid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
[
{
"ocns": [
5
],
"last_modified": "2023-10-13 13:15:31 UTC",
"holdings": [
{
"enum_chron": "",
"n_enum": "",
"n_chron": "",
"n_enum_chron": "",
"ocn": 5,
"organization": "umich",
"country_code": "us",
"weight": 1.0,
"local_id": "loc_1",
"mono_multi_serial": "mix",
"date_received": "2023-10-13 00:00:00 UTC",
"condition": "",
"issn": null,
"status": null,
"uuid": "3bbd8c32-7d53-42f2-9f92-52690b047881",
"gov_doc_flag": false
}
],
"ht_items": [
{
"ocns": [
5
],
"enum_chron": "",
"n_enum": "",
"n_chron": "",
"n_enum_chron": "",
"item_id": "test.140236",
"ht_bib_key": 486522,
"rights": "pd",
"access": "allow",
"bib_fmt": "BK",
"collection_code": "MIU",
"billing_entity": "umich"
}
],
"commitments": [
{
"policies": [

],
"phase": 999,
"facsimile": true,
"uuid": "b5f06bf7-60f6-41b9-8101-59c7501a5ce9",
"committed_date": "2021-08-18 00:00:00 UTC",
"organization": "umich",
"ocn": 5,
"local_id": "loc_1",
"oclc_sym": "uiu"
}
]
}
]
61 changes: 61 additions & 0 deletions spec/fixtures/single_cluster_valid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
[
{
"ocns": [
5
],
"last_modified": "2023-10-13 13:15:31 UTC",
"holdings": [
{
"enum_chron": "",
"n_enum": "",
"n_chron": "",
"n_enum_chron": "",
"ocn": 5,
"organization": "umich",
"country_code": "us",
"weight": 1.0,
"local_id": "loc_1",
"mono_multi_serial": "mix",
"date_received": "2023-10-13 00:00:00 UTC",
"condition": "",
"issn": null,
"status": null,
"uuid": "3bbd8c32-7d53-42f2-9f92-52690b047881",
"gov_doc_flag": false
}
],
"ht_items": [
{
"ocns": [
5
],
"enum_chron": "",
"n_enum": "",
"n_chron": "",
"n_enum_chron": "",
"item_id": "test.140236",
"ht_bib_key": 486522,
"rights": "pd",
"access": "allow",
"bib_fmt": "BK",
"collection_code": "MIU",
"billing_entity": "umich"
}
],
"commitments": [
{
"policies": [

],
"phase": 0,
"facsimile": true,
"uuid": "b5f06bf7-60f6-41b9-8101-59c7501a5ce9",
"committed_date": "2021-08-18 00:00:00 UTC",
"organization": "umich",
"ocn": 5,
"local_id": "loc_1",
"oclc_sym": "uiu"
}
]
}
]

0 comments on commit 674cfe3

Please sign in to comment.