
Merge pull request #288 from hathitrust/DEV-905
Dev 905: better understanding of splitting/merging
mwarin authored Sep 27, 2023
2 parents aeae17d + 58810af commit 23fb6f9
Showing 19 changed files with 259 additions and 176 deletions.
81 changes: 81 additions & 0 deletions spec/clustering/cluster_commitment_spec.rb
@@ -1,7 +1,11 @@
# frozen_string_literal: true

require "spec_helper"
require "cluster"
require "clustering/cluster_commitment"
require "clustering/cluster_ocn_resolution"
require "overlap/holding_commitment"
require "shared_print/finder"

RSpec.describe Clustering::ClusterCommitment do
let(:comm) { build(:commitment) }
@@ -69,4 +73,81 @@
expect(cc.uuids_in_cluster(cluster)).to include(comm.uuid)
end
end

describe "merging/splitting a cluster w/ commitments" do
before(:each) {
Cluster.collection.find.delete_many
}
let(:ocn1) { 1 }
let(:ocn2) { 2 }
let(:reso) { build(:ocn_resolution, deprecated: ocn1, resolved: ocn2) }

let(:finder) { SharedPrint::Finder.new(ocn: [ocn1, ocn2]) }
it "Puts commitments together when their clusters merge." do
# Start with 2 commitments on 2 clusters.
[ocn1, ocn2].each do |ocn|
cluster_tap_save(
build(:ht_item, ocns: [ocn]),
build(:holding, ocn: ocn, organization: "umich", mono_multi_serial: "spm", status: "CH", condition: ""),
build(:commitment, ocn: ocn, organization: "umich")
)
end
expect(finder.commitments.count).to eq 2
expect(Cluster.count).to eq 2

# Use reso as glue to merge the clusters.
cluster_tap_save reso

# Now find the 2 commitments on 1 merged cluster (with ocns [1, 2]).
expect(finder.clusters.count).to eq 1
expect(finder.commitments.count).to eq 2
expect(finder.clusters.first.ocns.sort).to eq [ocn1, ocn2]
end
it "puts commitments separately if the cluster splits them" do
# Here they start out in the same cluster (the OCN resolution glues their ht_item ocns together from the start)
expect(finder.clusters.count).to eq 0

[ocn1, ocn2].each do |ocn|
cluster_tap_save(
build(:ht_item, ocns: [ocn]),
build(:holding, ocn: ocn, organization: "umich", mono_multi_serial: "spm", status: "CH", condition: ""),
build(:commitment, ocn: ocn, organization: "umich"),
reso # this clusters them together from the start
)
end
expect(finder.commitments.count).to eq 2
expect(Cluster.count).to eq 1

# Remove the reso glue to split the clusters.
Clustering::ClusterOCNResolution.new(reso).delete

# Now find the 2 commitments on 2 separate clusters (with ocns [1] and [2] respectively).
clusters = finder.clusters.to_a
expect(clusters.count).to eq 2
expect(finder.commitments.count).to eq 2
expect(clusters.first.ocns).to eq [ocn1]
expect(clusters.last.ocns).to eq [ocn2]
end
it "can happen that splitting changes shared print eligibility" do
# This cluster should be eligible for commitments.
cluster_tap_save(
build(:ht_item, ocns: [ocn1], bib_fmt: "BK", enum_chron: ""),
build(:holding, ocn: ocn2, organization: "umich", mono_multi_serial: "spm", status: "CH", condition: ""),
reso # this clusters them together from the start
)

# Confirm that it is.
expect(Overlap::HoldingCommitment.new(ocn1).eligible_holdings.size).to eq 1
# And since ocn 1 and 2 are in the same cluster, ocn2 is also eligible.
expect(Overlap::HoldingCommitment.new(ocn2).eligible_holdings.size).to eq 1

# Remove the reso glue to split the clusters. This should remove the ht_item
# from the cluster, and break the commitment eligibility for ocn1.
Clustering::ClusterOCNResolution.new(reso).delete

# Glue gone, no longer eligible
expect(Overlap::HoldingCommitment.new(ocn1).eligible_holdings.size).to eq 0
expect(Overlap::HoldingCommitment.new(ocn2).eligible_holdings.size).to eq 0
end
end
end
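
The specs above (and the call-site changes in the files below) go through a cluster_tap_save test helper, which this PR calls with clusterables as separate arguments rather than a single array. The helper itself is not part of the diff; the following is a minimal sketch of a splat-style version, assuming the Clustering::Cluster* naming convention and a #cluster method analogous to the Clustering::ClusterOCNResolution usage above. The real helper lives in the repo's spec support code and may differ.

# Hypothetical sketch only; the constant lookup by naming convention is an assumption.
def cluster_tap_save(*clusterables)
  clusterables.flatten.each do |clusterable|
    # E.g. a commitment maps to Clustering::ClusterCommitment,
    # an OCN resolution to Clustering::ClusterOCNResolution.
    clusterer = Clustering.const_get(
      "Cluster#{clusterable.class.name.split("::").last}"
    )
    # Cluster the record and persist the resulting cluster;
    # the tap(&:save) is what gives the helper its name.
    clusterer.new(clusterable).cluster.tap(&:save)
  end
end

Flattening keeps any remaining array-style call sites, such as cluster_tap_save([hol1, hol2]), working alongside the splat form used in this PR.
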
6 changes: 3 additions & 3 deletions spec/data_sources/large_clusters_spec.rb
@@ -38,7 +38,7 @@
# setup
hol1 = build(:holding, ocn: lrg_ocn, organization: org1)
hol2 = build(:holding, ocn: lrg_ocn, organization: org1)
cluster_tap_save [hol1, hol2]
cluster_tap_save(hol1, hol2)
cluster_holdings_uuids = Cluster.where(ocns: lrg_ocn).first.holdings.map(&:uuid)
# execution
expect(hol1.uuid == hol2.uuid).to be false
@@ -49,7 +49,7 @@
# setup
hol1 = build(:holding, ocn: lrg_ocn, organization: org1)
hol2 = build(:holding, ocn: lrg_ocn, organization: org2) # <<< org diff
cluster_tap_save [hol1, hol2]
cluster_tap_save(hol1, hol2)
cluster_holdings_uuids = Cluster.where(ocns: lrg_ocn).first.holdings.map(&:uuid)
# execution
expect(hol1.uuid == hol2.uuid).to be false
@@ -64,7 +64,7 @@
batch = []
batch_loader = Loader::HoldingLoaderNDJ.new

cluster_tap_save [build(:ht_item, ocns: [lrg_ocn])]
cluster_tap_save build(:ht_item, ocns: [lrg_ocn])
File.open(fixt, "r") do |f|
f.each_line do |line|
batch << batch_loader.item_from_line(line)
4 changes: 2 additions & 2 deletions spec/integration_spec.rb
@@ -27,10 +27,10 @@ def phctl(*args)
it "loads tsv file with phase 3 commitments" do
# Setup, need populated clusters to load commitments
[2, 3].each do |ocn|
cluster_tap_save [
cluster_tap_save(
build(:ht_item, ocns: [ocn]),
build(:holding, ocn: ocn, organization: "umich")
]
)
end
expect { phctl("sp", "phase3load", fixture("phase_3_commitments.tsv")) }
.to change { cluster_count(:commitments) }.by(2)
2 changes: 1 addition & 1 deletion spec/keio_updater_spec.rb
@@ -17,7 +17,7 @@ def build_cluster(ocn)
ht_item = build(:ht_item, ocns: [ocn], collection_code: big_k, billing_entity: hword)
ht_item2 = build(:ht_item, ocns: [ocn], collection_code: "PU", billing_entity: upenn)
create(:cluster, ocns: [ocn])
cluster_tap_save [ht_item, ht_item2]
cluster_tap_save(ht_item, ht_item2)
end

def get_all
18 changes: 9 additions & 9 deletions spec/mongo_updater_spec.rb
@@ -31,7 +31,7 @@ def commitment_local_ids(ocn)
# Feel free to add tests for the other clusterables.
describe "clusterable = commitments" do
it "Requires a clusterable string" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
expect {
described_class.update_embedded(
matcher: {ocn: ocn1},
@@ -41,7 +41,7 @@ def commitment_local_ids(ocn)
end

it "Requires a matcher" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
expect {
described_class.update_embedded(
clusterable: "commitments",
@@ -51,7 +51,7 @@ def commitment_local_ids(ocn)
end

it "Requires an updater" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
expect {
described_class.update_embedded(
clusterable: "commitments",
@@ -61,7 +61,7 @@ def commitment_local_ids(ocn)
end

it "Updates matching embedded documents" do
cluster_tap_save [spc1, spc2, spc3, spc4]
cluster_tap_save(spc1, spc2, spc3, spc4)
described_class.update_embedded(
clusterable: "commitments",
matcher: {ocn: ocn1},
@@ -72,7 +72,7 @@ def commitment_local_ids(ocn)
end

it "Gets more specific with additional matching criteria" do
cluster_tap_save [spc1, spc2, spc3, spc4]
cluster_tap_save(spc1, spc2, spc3, spc4)
described_class.update_embedded(
clusterable: "commitments",
matcher: {ocn: ocn1, local_id: loc1},
@@ -83,7 +83,7 @@ def commitment_local_ids(ocn)
end

it "Allows updating multiple fields on the same embedded document" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
described_class.update_embedded(
clusterable: "commitments",
matcher: {ocn: ocn1, local_id: loc1},
@@ -96,7 +96,7 @@ def commitment_local_ids(ocn)
end

it "Allows updating multiple fields on multiple embedded document" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
described_class.update_embedded(
clusterable: "commitments",
matcher: {ocn: ocn1},
@@ -109,7 +109,7 @@ def commitment_local_ids(ocn)
end

it "can match on undefined fields being null" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
described_class.update_embedded(
clusterable: "commitments",
matcher: {field_missing: nil},
@@ -122,7 +122,7 @@ def commitment_local_ids(ocn)
end

it "can set a field that does not exist" do
cluster_tap_save [spc1, spc2]
cluster_tap_save(spc1, spc2)
expect(Cluster.first.commitments.first.local_item_id).to eq nil
described_class.update_embedded(
clusterable: "commitments",
10 changes: 5 additions & 5 deletions spec/reports/cost_report_spec.rb
@@ -40,21 +40,21 @@

describe "making sure that access and rights come out the way they go in" do
it "pd == allow" do
cluster_tap_save [build(:ht_item, access: alo, rights: pd, ocns: [111])]
cluster_tap_save build(:ht_item, access: alo, rights: pd, ocns: [111])
cluster = Cluster.find_by(ocns: 111)
expect(cluster.ht_items.count).to eq 1
expect(cluster.ht_items.first.rights).to eq pd
expect(cluster.ht_items.first.access).to eq alo
end
it "icus == allow" do
cluster_tap_save [build(:ht_item, access: alo, rights: icus, ocns: [222])]
cluster_tap_save build(:ht_item, access: alo, rights: icus, ocns: [222])
cluster = Cluster.find_by(ocns: 222)
expect(cluster.ht_items.count).to eq 1
expect(cluster.ht_items.first.rights).to eq icus
expect(cluster.ht_items.first.access).to eq alo
end
it "ic == deny" do
cluster_tap_save [build(:ht_item, access: dni, rights: ic, ocns: [333])]
cluster_tap_save build(:ht_item, access: dni, rights: ic, ocns: [333])
cluster = Cluster.find_by(ocns: 333)
expect(cluster.ht_items.count).to eq 1
expect(cluster.ht_items.first.rights).to eq ic
@@ -64,13 +64,13 @@

describe "#num_pd_volumes" do
it "counts the number of pd volumes" do
cluster_tap_save [ht_allow, build(:ht_item, access: alo, rights: pd, ocns: ht_allow.ocns)]
cluster_tap_save ht_allow, build(:ht_item, access: alo, rights: pd, ocns: ht_allow.ocns)
expect(cr.num_pd_volumes).to eq(2)
end
it "counts icus towards pd regardless of access" do
# Put 10 icus items in...
1.upto(10) do
cluster_tap_save [build(:ht_item, rights: icus)]
cluster_tap_save build(:ht_item, rights: icus)
end
# Expect all 10 when you call num_pd_volumes
expect(cr.num_pd_volumes).to eq(10)
24 changes: 12 additions & 12 deletions spec/reports/holdings_by_date_report_spec.rb
@@ -62,10 +62,10 @@ def bld_hol(ocn, org, mms, tim)
end
it "gets data from a populated db" do
# Add 2 records (different orgs) to db
cluster_tap_save [
cluster_tap_save(
bld_hol(ocn1, org1, spm, tim1),
bld_hol(ocn1, org2, spm, tim1)
]
)

# Get 2 records out
expect(rpt.data.count).to eq 2
@@ -81,10 +81,10 @@ def bld_hol(ocn, org, mms, tim)
end
it "reports the max date for a group" do
# Add 2 records (same org, diff dates) to db
cluster_tap_save [
cluster_tap_save(
bld_hol(ocn1, org1, spm, tim1),
bld_hol(ocn2, org1, spm, tim2)
]
)

# Get 1 record out (because select max, group on org+fmt)
res = rpt.data
@@ -95,28 +95,28 @@ def bld_hol(ocn, org, mms, tim)
expect(res.first).to eq expected
end
it "reports orgs separately" do
cluster_tap_save [
cluster_tap_save(
bld_hol(ocn1, org1, spm, tim1),
bld_hol(ocn1, org2, spm, tim1)
]
)
expect(rpt.data.count).to eq 2
end
it "reports mono_multi_serial separately" do
cluster_tap_save [
cluster_tap_save(
bld_hol(ocn1, org1, spm, tim1),
bld_hol(ocn1, org1, mpm, tim1)
]
)
expect(rpt.data.count).to eq 2
end
it "ignores ocn for grouping and reporting purposes" do
cluster_tap_save [
cluster_tap_save(
bld_hol(ocn1, org1, spm, tim1),
bld_hol(ocn2, org1, spm, tim1)
]
)
expect(rpt.data.count).to eq 1
end
it "writes a report file" do
cluster_tap_save [
cluster_tap_save(
# org1
bld_hol(ocn1, org1, spm, tim1),
bld_hol(ocn1, org1, mpm, tim1),
@@ -131,7 +131,7 @@ def bld_hol(ocn, org, mms, tim)
bld_hol(ocn1, org2, spm, tim2), # should count
bld_hol(ocn1, org2, mpm, tim2), # should count
bld_hol(ocn1, org2, ser, tim2) ## should count
]
)

expected_report = [
"organization\tformat\tmax_load_date",
