-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #291 from hathitrust/cluster-validator
A script to identify invalid clusters in Print Holdings MongoDB
- Loading branch information
Showing
4 changed files
with
210 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
require "fileutils"
require "cluster"
require "services"
Services.mongo!
|
||
# Goes through all clusters, checks whether each one is valid, and writes
# the first OCN of every invalid cluster to a dated report file.
class ClusterValidator
  # @return [String] path of the report file that #run writes
  attr_reader :output_path

  # Ensures the report directory (Settings.local_report_path) exists and
  # derives the dated report path inside it.
  def initialize
    dir = Settings.local_report_path
    FileUtils.mkdir_p(dir)
    ymd = Time.now.strftime("%Y-%m-%d")
    # File.join avoids a doubled separator if the configured dir ends in "/".
    @output_path = File.join(dir, "cluster_validator_#{ymd}.txt")
  end

  # Writes the report: a "#"-prefixed header line, one line per invalid
  # cluster (its first OCN), and a "#"-prefixed footer line. The header and
  # footer are always written, so a report with no invalid clusters still
  # has two lines. An existing file at output_path is overwritten.
  def run
    puts "Writing to #{output_path}"
    File.open(output_path, "w") do |outf|
      outf.puts "# These are ocns of invalid clusters:"
      Cluster.each do |cluster|
        outf.puts(cluster.ocns.first) unless cluster.valid?
      end
      outf.puts "# Done"
    end
  end
end
|
||
# Run the validator only when this file is executed directly as a script,
# not when it is loaded by another file via require.
ClusterValidator.new.run if __FILE__ == $PROGRAM_NAME
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# frozen_string_literal: true | ||
|
||
require "spec_helper" | ||
require "loader/cluster_loader" | ||
require_relative "../bin/cluster_validator" | ||
|
||
RSpec.describe ClusterValidator do
  let(:cluster_validator) { described_class.new }
  # The report always has a header and a footer line, so a report with
  # exactly 2 lines is "empty" for the purposes of these tests.
  let(:empty_file_line_count) { 2 }
  let(:one_invalid_cluster_line_count) { 3 }
  # The fixtures differ only in the commitment's phase (0 in valid, 999 in invalid).
  let(:valid_cluster_fixt) { fixture("single_cluster_valid.json") }
  let(:invalid_cluster_fixt) { fixture("single_cluster_invalid.json") }

  before(:each) do
    Cluster.collection.find.delete_many
    # Remove any report left behind by another example (or an earlier run on
    # the same day) so that no example depends on execution order.
    File.delete(cluster_validator.output_path) if File.exist?(cluster_validator.output_path)
  end

  # Runs the validator and returns the lines of the report it wrote.
  # Uses the same instance for running and for locating the report, so the
  # path that is read is always the path that was written.
  def get_output_lines
    cluster_validator.run
    File.read(cluster_validator.output_path).split("\n")
  end

  it "makes an outfile when it runs" do
    expect(File.exist?(cluster_validator.output_path)).to be false
    cluster_validator.run
    expect(File.exist?(cluster_validator.output_path)).to be true
  end

  it "makes an empty-ish outfile if there are no clusters" do
    # empty-ish meaning it'll only have the header and footer, which begin with "#".
    lines = get_output_lines
    expect(lines.count).to eq empty_file_line_count
    expect(lines[0]).to start_with("#")
    expect(lines[1]).to start_with("#")
  end

  it "does NOT count valid clusters" do
    Loader::ClusterLoader.new.load(valid_cluster_fixt)
    # Verify we have exactly one cluster and that it is valid ...
    expect(Cluster.count).to eq 1
    expect(Cluster.first.valid?).to be true
    # ... and that it does not add a line to the report.
    expect(get_output_lines.count).to eq empty_file_line_count
  end

  it "DOES count invalid clusters" do
    Loader::ClusterLoader.new.load(invalid_cluster_fixt)
    # Verify we have exactly one cluster and that it is invalid ...
    expect(Cluster.count).to eq 1
    expect(Cluster.first.valid?).to be false
    # ... and that it adds one line to the report.
    expect(get_output_lines.count).to eq one_invalid_cluster_line_count
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
[ | ||
{ | ||
"ocns": [ | ||
5 | ||
], | ||
"last_modified": "2023-10-13 13:15:31 UTC", | ||
"holdings": [ | ||
{ | ||
"enum_chron": "", | ||
"n_enum": "", | ||
"n_chron": "", | ||
"n_enum_chron": "", | ||
"ocn": 5, | ||
"organization": "umich", | ||
"country_code": "us", | ||
"weight": 1.0, | ||
"local_id": "loc_1", | ||
"mono_multi_serial": "mix", | ||
"date_received": "2023-10-13 00:00:00 UTC", | ||
"condition": "", | ||
"issn": null, | ||
"status": null, | ||
"uuid": "3bbd8c32-7d53-42f2-9f92-52690b047881", | ||
"gov_doc_flag": false | ||
} | ||
], | ||
"ht_items": [ | ||
{ | ||
"ocns": [ | ||
5 | ||
], | ||
"enum_chron": "", | ||
"n_enum": "", | ||
"n_chron": "", | ||
"n_enum_chron": "", | ||
"item_id": "test.140236", | ||
"ht_bib_key": 486522, | ||
"rights": "pd", | ||
"access": "allow", | ||
"bib_fmt": "BK", | ||
"collection_code": "MIU", | ||
"billing_entity": "umich" | ||
} | ||
], | ||
"commitments": [ | ||
{ | ||
"policies": [ | ||
|
||
], | ||
"phase": 999, | ||
"facsimile": true, | ||
"uuid": "b5f06bf7-60f6-41b9-8101-59c7501a5ce9", | ||
"committed_date": "2021-08-18 00:00:00 UTC", | ||
"organization": "umich", | ||
"ocn": 5, | ||
"local_id": "loc_1", | ||
"oclc_sym": "uiu" | ||
} | ||
] | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
[ | ||
{ | ||
"ocns": [ | ||
5 | ||
], | ||
"last_modified": "2023-10-13 13:15:31 UTC", | ||
"holdings": [ | ||
{ | ||
"enum_chron": "", | ||
"n_enum": "", | ||
"n_chron": "", | ||
"n_enum_chron": "", | ||
"ocn": 5, | ||
"organization": "umich", | ||
"country_code": "us", | ||
"weight": 1.0, | ||
"local_id": "loc_1", | ||
"mono_multi_serial": "mix", | ||
"date_received": "2023-10-13 00:00:00 UTC", | ||
"condition": "", | ||
"issn": null, | ||
"status": null, | ||
"uuid": "3bbd8c32-7d53-42f2-9f92-52690b047881", | ||
"gov_doc_flag": false | ||
} | ||
], | ||
"ht_items": [ | ||
{ | ||
"ocns": [ | ||
5 | ||
], | ||
"enum_chron": "", | ||
"n_enum": "", | ||
"n_chron": "", | ||
"n_enum_chron": "", | ||
"item_id": "test.140236", | ||
"ht_bib_key": 486522, | ||
"rights": "pd", | ||
"access": "allow", | ||
"bib_fmt": "BK", | ||
"collection_code": "MIU", | ||
"billing_entity": "umich" | ||
} | ||
], | ||
"commitments": [ | ||
{ | ||
"policies": [ | ||
|
||
], | ||
"phase": 0, | ||
"facsimile": true, | ||
"uuid": "b5f06bf7-60f6-41b9-8101-59c7501a5ce9", | ||
"committed_date": "2021-08-18 00:00:00 UTC", | ||
"organization": "umich", | ||
"ocn": 5, | ||
"local_id": "loc_1", | ||
"oclc_sym": "uiu" | ||
} | ||
] | ||
} | ||
] |