Skip to content

Commit

Permalink
Merge pull request #294 from hathitrust/DEV-562
Browse files Browse the repository at this point in the history
DEV-562, report count of commitments by phase & org
  • Loading branch information
mwarin authored Oct 23, 2023
2 parents 7952406 + 198668f commit cc0e52c
Show file tree
Hide file tree
Showing 8 changed files with 221 additions and 1 deletion.
7 changes: 7 additions & 0 deletions lib/phctl.rb
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,13 @@ def shared_print_newly_ingested
run_common_job(Reports::SharedPrintNewlyIngested, options)
end

# CLI command: tally commitments per organization for one shared print phase.
# --phase is numeric; when it is not given the option default declared here is nil.
desc "shared-print-phase-count (--phase=x)",
"Get tally of commitments per organization in the given phase"
option :phase, type: :numeric, default: nil
def shared_print_phase_count
# Passes the report class and `options` on to run_common_job.
run_common_job(Reports::SharedPrintPhaseCount, options)
end

desc "weeding-decision ORGANIZATION", "Generate a report to help ORG decide what to weed"
def weeding_decision(organization)
run_common_job(Reports::WeedingDecision, options, organization)
Expand Down
50 changes: 50 additions & 0 deletions lib/reports/shared_print_phase_count.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
require "shared_print/finder"
require "shared_print/phases"
require "utils/report_output"

# Output a .tsv report of which orgs have how many commitments in a phase.
# Defaults to latest (or, rather, _greatest_) phase.
module Reports
  # Writes a tab-separated report with one row per organization, giving the
  # number of commitments that organization has in a single phase.
  # Columns: organization, phase, commitment_count. Rows are sorted by
  # organization.
  class SharedPrintPhaseCount
    attr_reader :phase, :finder, :output

    # @param phase [#to_i, nil] the phase to report on. Defaults to the
    #   greatest phase in SharedPrint::Phases.list, also when nil is passed.
    def initialize(phase: SharedPrint::Phases.list.max)
      # An explicit nil would otherwise turn into phase 0 via nil.to_i,
      # silently reporting on a different phase than the documented default.
      @phase = (phase || SharedPrint::Phases.list.max).to_i
      # Potentially use Reports::Dynamic instead of SharedPrint::Finder
      # (but SP::Finder does the trick just fine for now)
      @finder = SharedPrint::Finder.new(phase: [@phase])
      @output = Utils::ReportOutput.new("sp_phase#{@phase}_count")
    end

    # Writes the header and one line per organization to the output file.
    # Progress messages go to stdout.
    def run
      handle = output.handle("w")
      puts "Started writing to #{output.file}"
      handle.puts(header)
      organization_tally.sort_by(&:first).each do |org, count|
        handle.puts [org, phase, count].join("\t")
      end
      puts "Finished writing to #{output.file}"
    ensure
      # handle is still nil if opening the file failed; calling close on it
      # would raise NoMethodError and hide the original error.
      handle&.close
    end

    # Yields each commitment returned by the finder.
    # Returns an Enumerator when called without a block.
    def commitments
      return enum_for(:commitments) unless block_given?

      finder.commitments.each do |commitment|
        yield commitment
      end
    end

    # @return [String] the tab-separated header line
    def header
      ["organization", "phase", "commitment_count"].join("\t")
    end

    private

    # Counts commitments per organization.
    # @return [Hash] organization => number of commitments
    def organization_tally
      tally = Hash.new(0)
      commitments { |commitment| tally[commitment.organization] += 1 }
      tally
    end
  end
end
1 change: 1 addition & 0 deletions lib/sidekiq_jobs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
require "reports/phase3_oclc_registration"
require "reports/rare_uncommitted"
require "reports/shared_print_newly_ingested"
require "reports/shared_print_phase_count"
require "reports/uncommitted_holdings"
require "reports/weeding_decision"
require "scrub/scrub_runner"
Expand Down
61 changes: 61 additions & 0 deletions lib/utils/report_output.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
require "fileutils"
require "securerandom"

# An attempt at simplifying the structure of report output dirs and files.
# Give report_name as input.
# The methods and their return:
# id: the "unique-ish" part of the filename
# dir: path to an output dir
# file: path to an output file
# handle("r"): an open read handle for file
# handle("w"): an open write handle for file
# Both dir and file will contain the report name,
# and the file a timestamp and a unique-ish string.
# dir is autovivified with mkdir_p.
# Example:
# output = Utils::ReportOutput.new("foo", ".txt")
# output.dir # -> "/local_reports/foo"
# output.file # -> "/local_reports/foo/foo_YYYYMMDD_HHMMSS_abcd1234.txt"

module Utils
  # Builds the output directory and file path for a named report and opens
  # IO handles on that file. dir, file and id are computed once per instance,
  # so one instance always refers to the same file.
  class ReportOutput
    attr_reader :report_name, :ext

    # @param report_name [String] e.g. "cost_report" or "cost_report_umich"
    # @param ext [String] file extension including the dot
    def initialize(report_name, ext = ".tsv")
      @report_name = report_name
      @ext = ext
    end

    # Opens the report file.
    # @param rw [String] File.open mode, "r", "w" etc
    # @yield [File] optional; with a block the file is closed when the block
    #   returns and the block's value is returned
    # @return [File] the open handle when no block is given
    def handle(rw, &block)
      File.open(file, rw, &block)
    end

    # Full path to the report file. Calling this creates the directory.
    def file
      @file ||= File.join(dir, id) + ext
    end

    # Path to the output directory, created (with parents) on first call.
    def dir
      @dir ||= dir_path.tap { |d| FileUtils.mkdir_p(d) }
    end

    # Just the directory path; does not touch the file system.
    # Falls back to "/tmp" when Settings.local_report_path is not set.
    def dir_path
      File.join(Settings.local_report_path || "/tmp", report_name)
    end

    # The file's base name, e.g. foo_YYYYMMDD_HHMMSS_abcd1234.
    def id
      @id ||= [
        report_name,
        Time.now.strftime("%Y%m%d_%H%M%S"),
        # Random suffix in case 2 identical jobs that write a file are
        # started in the same second. hex(4) gives exactly 8 hex characters.
        SecureRandom.hex(4)
      ].join("_")
    end
  end
end
8 changes: 8 additions & 0 deletions spec/integration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,14 @@ def cleanup(output)
rpt_out = File.read(glob)
expect(rpt_out).to match(/contributor/)
end

it "SharedPrintPhaseCount produces output" do
  # A single phase-0 commitment is enough to get one body row.
  cluster_tap_save(build(:commitment, phase: 0))
  phctl("report", "shared-print-phase-count", "--phase", "0")
  report_pattern = "#{ENV["TEST_TMP"]}/local_reports/sp_phase0_count/sp_phase0_count_*"
  report_path = Dir.glob(report_pattern).first
  report_lines = File.read(report_path).split("\n")
  # Header row plus the one organization row.
  expect(report_lines.count).to eq 2
end
end

describe "Scrub" do
Expand Down
2 changes: 1 addition & 1 deletion spec/phctl_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
%w[report uncommitted-holdings] => Jobs::Common,
%w[report oclc-registration instid] => Jobs::Common,
%w[report phase3-oclc-registration instid] => Jobs::Common,

%w[report shared-print-phase-count --phase 1] => Jobs::Common,
# Has wrappers in holdings/jobs
%w[report member-counts infile outpath] => Jobs::Common,
%w[report costreport] => Jobs::Common,
Expand Down
30 changes: 30 additions & 0 deletions spec/reports/shared_print_phase_count_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
require "reports/shared_print_phase_count"
require "spec_helper"

RSpec.describe Reports::SharedPrintPhaseCount do
  let(:org) { "umich" }
  let(:phase) { 3 }
  let(:report) { described_class.new(phase: phase) }
  let(:commitment_count) { 5 }

  # Start every example from an empty cluster collection.
  before(:each) do
    Cluster.collection.find.delete_many
  end

  it "gets commitments per phase, puts them in a file" do
    # For each of the 5 rounds: one umich commitment in the wanted phase,
    # immediately followed by one phase-0 decoy.
    fixtures = commitment_count.times.flat_map do
      [
        build(:commitment, organization: org, phase: phase),
        build(:commitment, organization: org, phase: 0)
      ]
    end
    cluster_tap_save(*fixtures)

    # The report must only see the wanted phase, none of the decoys.
    expect(report.commitments.count).to eq 5

    # Run for real and inspect the file that was written.
    report.run
    written = File.read(report.output.file).split("\n")
    expected_header = ["organization", "phase", "commitment_count"].join("\t")
    expected_row = [org, phase, commitment_count].join("\t")
    expect(written.count).to eq 2
    expect(written.first).to eq expected_header
    expect(written.last).to eq expected_row
  end
end
63 changes: 63 additions & 0 deletions spec/utils/report_output_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
require "spec_helper"
require "utils/report_output"

RSpec.describe Utils::ReportOutput do
  let(:report_name) { "foo" }
  let(:ext) { ".bar" }
  let(:default_ext) { ".tsv" }
  let(:report_output) { described_class.new(report_name, ext) }

  it "has working attr_readers" do
    expect(report_output.report_name).to eq report_name
    expect(report_output.ext).to eq ext
  end

  it "has a default ext: .tsv" do
    without_ext = described_class.new(report_name)
    expect(without_ext.ext).to eq default_ext
  end

  it "makes dir if missing" do
    # dir_path only computes the path; dir is what creates it.
    path = report_output.dir_path
    expect(Dir.exist?(path)).to be false
    report_output.dir
    expect(Dir.exist?(path)).to be true
  end

  it "gives a predictable singleton-ish file" do
    first_call = report_output.file
    second_call = report_output.file
    expect(first_call).to eq second_call
    # A new object is how you get a new file.
    other_output = described_class.new(report_name, ext)
    third_call = report_output.file
    other_file = other_output.file
    expect(second_call).to eq third_call
    # The first three are the same by transitivity;
    # only the other object's file should differ.
    expect(third_call).to_not eq other_file
  end

  it "gives a predictable singleton-ish id" do
    first_call = report_output.id
    second_call = report_output.id
    expect(first_call).to eq second_call
    # A new object is how you get a new id.
    other_output = described_class.new(report_name, ext)
    third_call = report_output.id
    other_id = other_output.id
    expect(second_call).to eq third_call
    # The first three are the same by transitivity;
    # only the other object's id should differ.
    expect(third_call).to_not eq other_id
  end

  it "gives readable/writable handle" do
    # Write one line through a write handle ...
    writer = report_output.handle("w")
    sentence = "It was a dark and stormy #{report_output.id} ..."
    writer.puts(sentence)
    writer.close
    # ... then read it back through a read handle.
    reader = report_output.handle("r")
    read_back = reader.read.split("\n")
    reader.close
    expect(read_back.count).to eq 1
    expect(read_back).to eq [sentence]
  end
end

0 comments on commit cc0e52c

Please sign in to comment.