-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
DEV-967 make bib rights date independent (#28)
* DEV-967 make bib rights date independent
  - Add optional date argument to run_process_zephir_incremental.sh
  - Add run_incremental_zephir.rb script to calculate a set of dates for run_process_zephir_incremental.sh
  - Fix DATA_ROOT vs DATAROOT apparent typo
  - Add nonzero result codes to error `exit`s
  - Add `PostZephirProcessing::MonthlyInventory` class, integration and unit tests
  - `POST_ZEPHIR_LOGGER_LEVEL`
* DEV-1059 Resolve wide character issue writing to rights_dbm
  - UTF-8 encode the TSV written to DBM in `bld_rights_db.pl`
  - Add `t/bld_rights_db.t` to call the build script with a crafted value in rights_current.
* Use `system` rather than backticks to execute `run_process_zephir_incremental.sh`.
* Replace `docker-compose` with `docker compose` in the GitHub actions, README, and `test_full_file.sh`
- Loading branch information
Showing
15 changed files
with
474 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ prof_library | |
config/.env | ||
config/config.pl | ||
config/.netrc | ||
coverage/ | ||
zephir_full_daily_a* | ||
*.gz | ||
local | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
--color | ||
--require spec_helper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# frozen_string_literal: true | ||
|
||
require "date" | ||
require "logger" | ||
|
||
module PostZephirProcessing
  # Inventory of the datestamped Zephir-derived files found on disk for the
  # month containing a given date. For each of five file categories it records
  # which dates have a file, so the earliest date with anything missing can be
  # identified.
  class MonthlyInventory
    # Filename patterns; capture group 1 is the datestamp. Anchored with
    # \A and \z so the whole filename must match (^ and $ only anchor lines).
    UPDATE_REGEXP = /\Azephir_upd_(\d{8})\.json\.gz\z/
    DELETE_REGEXP = /\Azephir_upd_(\d{8})_delete\.txt\.gz\z/
    RIGHTS_REGEXP = /\Azephir_upd_(\d{8})\.rights\z/
    GROOVE_REGEXP = /\Agroove_incremental_(\d{4}-\d{2}-\d{2})\.tsv\.gz\z/
    TOUCHED_REGEXP = /\Adaily_touched_(\d{4}-\d{2}-\d{2})\.tsv\.gz\z/
    attr_reader :date, :logger, :inventory

    # Reads the directory locations from the environment (with hard-coded
    # fallbacks) and immediately builds the inventory for the month of `date`.
    # @param logger [Logger] defaults to a STDOUT logger whose level is
    #   POST_ZEPHIR_LOGGER_LEVEL converted with #to_i (Logger::INFO when unset)
    # @param date [Date] the file datestamp date, not the "run date"
    def initialize(logger: nil, date: (Date.today - 1))
      @logger = logger || Logger.new($stdout, level: ENV.fetch("POST_ZEPHIR_LOGGER_LEVEL", Logger::INFO).to_i)
      @date = date
      @logger.info("MonthlyInventory using date #{@date}")
      # TODO: these should go in .env/Canister
      @catalog_prep_dir = ENV["CATALOG_PREP"] || "/htsolr/catalog/prep/"
      @rights_dir = ENV["RIGHTS_DIR"] || "/htapps/babel/feed/var/rights"
      @rights_archive_dir = File.join(@rights_dir, "archive")
      @ingest_bibrecords_dir = ENV["INGEST_BIBRECORDS"] || "/htapps/babel/feed/var/bibrecords"
      @ingest_bibrecords_archive_dir = File.join(@ingest_bibrecords_dir, "archive")
      @inventory = {
        zephir_update_files: zephir_update_files,
        zephir_delete_files: zephir_delete_files,
        zephir_rights_files: zephir_rights_files,
        zephir_groove_files: zephir_groove_files,
        zephir_touched_files: zephir_touched_files
      }
    end

    # zephir_upd_YYYYMMDD.json.gz files for the current month.
    # @return [Array<Date>] sorted ASC
    def zephir_update_files
      directory_inventory(directory: @catalog_prep_dir, regexp: UPDATE_REGEXP)
    end

    # zephir_upd_YYYYMMDD_delete.txt.gz files for the current month.
    # @return [Array<Date>] sorted ASC
    def zephir_delete_files
      directory_inventory(directory: @catalog_prep_dir, regexp: DELETE_REGEXP)
    end

    # zephir_upd_YYYYMMDD.rights files for the current month,
    # from the rights directory and its archive subdirectory.
    # @return [Array<Date>] sorted ASC
    def zephir_rights_files
      directory_inventory(directory: @rights_dir, archive_directory: @rights_archive_dir, regexp: RIGHTS_REGEXP)
    end

    # groove_incremental_YYYY-MM-DD.tsv.gz files for the current month,
    # from the bibrecords directory and its archive subdirectory.
    # @return [Array<Date>] sorted ASC
    def zephir_groove_files
      directory_inventory(directory: @ingest_bibrecords_dir, archive_directory: @ingest_bibrecords_archive_dir, regexp: GROOVE_REGEXP)
    end

    # daily_touched_YYYY-MM-DD.tsv.gz files for the current month,
    # from the bibrecords directory and its archive subdirectory.
    # @return [Array<Date>] sorted ASC
    def zephir_touched_files
      directory_inventory(directory: @ingest_bibrecords_dir, archive_directory: @ingest_bibrecords_archive_dir, regexp: TOUCHED_REGEXP)
    end

    # Look at each inventory category separately and find the earliest (min)
    # date it is missing, if any. A date missing from any one category makes
    # it a do-over candidate, so the overall earliest is returned.
    # @return [Date,nil] nil when every category covers all of #all_dates
    def earliest_missing_date
      inventory.each_value
        .filter_map { |dates| (all_dates - dates).min }
        .min
    end

    # Beginning of month to "present" (i.e. `date`), inclusive.
    # @return [Array<Date>] sorted ASC
    def all_dates
      # A Range enumerates in ascending order, so no extra sort is needed.
      @all_dates ||= (Date.new(date.year, date.month, 1)..date).to_a
    end

    private

    # Run regexp against the contents of directory and collect the datestamps
    # of matching files that fall in the month of interest.
    # Do the same for the archive directory if it is supplied.
    # Does not attempt to iterate nonexistent directories, and ignores files
    # whose datestamp is not a real calendar date.
    # @param directory [String] primary directory to scan
    # @param regexp [Regexp] filename pattern with the datestamp in group 1
    # @param archive_directory [String,nil] optional second directory to scan
    # @return [Array<Date>] sorted ASC, without duplicates
    def directory_inventory(directory:, regexp:, archive_directory: nil)
      [directory, archive_directory].compact.uniq
        .select { |dir| File.directory?(dir) }
        .flat_map { |dir| Dir.children(dir) }
        .filter_map { |filename| datestamp_date(filename, regexp) }
        .select { |file_date| file_date.month == date.month && file_date.year == date.year }
        .uniq
        .sort
    end

    # Extract the date encoded in a filename.
    # @param filename [String] bare filename (no directory)
    # @param regexp [Regexp] pattern capturing YYYYMMDD or YYYY-MM-DD in group 1
    # @return [Date,nil] nil if the name does not match or the captured
    #   datestamp is not a valid calendar date (e.g. 20231131)
    def datestamp_date(filename, regexp)
      match = regexp.match(filename)
      return unless match

      digits = match[1].delete("-")
      year = digits[0, 4].to_i
      month = digits[4, 2].to_i
      day = digits[6, 2].to_i
      # Validate first so one stray file cannot raise and abort the inventory.
      Date.new(year, month, day) if Date.valid_date?(year, month, day)
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
# frozen_string_literal: true | ||
|
||
require "date" | ||
require "logger" | ||
require_relative "lib/monthly_inventory" | ||
|
||
# Driver: inventory this month's Zephir files as of yesterday and, if any date
# is missing a file, re-run the incremental script for every date from the
# earliest missing one through yesterday. Stops at the first failing run.
logger = Logger.new($stdout)
HOME = File.expand_path(__dir__).freeze
INCREMENTAL_SCRIPT = File.join(HOME, "run_process_zephir_incremental.sh")
YESTERDAY = Date.today - 1

inventory = PostZephirProcessing::MonthlyInventory.new(logger: logger, date: YESTERDAY)
logger.debug "all existing Zephir updates: #{inventory.inventory}"

# Compute once; each call recomputes the per-category differences.
first_missing = inventory.earliest_missing_date
if first_missing.nil?
  logger.info "no Zephir files to process, exiting"
  exit 0
end

dates = (first_missing..YESTERDAY)
logger.debug "Processing Zephir files for: #{dates}"
dates.each do |date|
  date_str = date.strftime("%Y%m%d")
  # Used for log messages only; the command itself is run without a shell.
  cmd = "#{INCREMENTAL_SCRIPT} #{date_str}"
  logger.debug "Calling '#{cmd}'"
  # Pass the program and its argument separately so a script path containing
  # spaces or shell metacharacters is not split or interpreted.
  # Bail out if `system` returns false (nonzero exit) or nil (could not run).
  unless system(INCREMENTAL_SCRIPT, date_str)
    logger.error "exitstatus #{$?.exitstatus} from '#{cmd}'"
    exit 1
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# frozen_string_literal: true | ||
|
||
require "date" | ||
require "fileutils" | ||
require "tmpdir" | ||
|
||
RSpec.describe "MonthlyInventory Integration" do
  # Datestamp handed to every inventory built in this group.
  let(:inventory_date) { Date.parse("2023-11-30") }

  # Build a fresh inventory against the current state of the temp directories
  # and report its earliest missing date.
  def earliest_missing
    PostZephirProcessing::MonthlyInventory.new(date: inventory_date).earliest_missing_date
  end

  # Each example runs inside its own throwaway directory tree.
  # NOTE(review): setup_test_dirs / setup_test_files are defined outside this
  # file; presumably they create a complete set of files for November 2023.
  around(:each) do |example|
    Dir.mktmpdir do |tmpdir|
      setup_test_dirs(parent_dir: tmpdir)
      setup_test_files(date: Date.parse("2023-11-01"))
      example.run
    end
  end

  describe "all files present" do
    it "returns nil" do
      expect(earliest_missing).to be_nil
    end
  end

  describe "one date missing" do
    it "returns the earliest" do
      removed_on = Date.parse("2023-11-03")
      FileUtils.rm rights_file_for_date(date: removed_on)
      expect(earliest_missing).to eq removed_on
    end
  end

  describe "different date in each category missing" do
    it "returns the earliest" do
      # One file removed per category, each on a different day; the touched
      # file on the 16th is the earliest gap.
      doomed = [
        delete_file_for_date(date: Date.parse("2023-11-20")),
        update_file_for_date(date: Date.parse("2023-11-19")),
        rights_file_for_date(date: Date.parse("2023-11-18")),
        groove_file_for_date(date: Date.parse("2023-11-17")),
        touched_file_for_date(date: Date.parse("2023-11-16"))
      ]
      FileUtils.rm doomed
      expect(earliest_missing).to eq Date.parse("2023-11-16")
    end
  end

  describe "multiple dates missing" do
    it "returns the earliest" do
      first_gone = Date.parse("2023-11-26")
      last_gone = Date.parse("2023-11-30")
      (first_gone..last_gone).each { |day| FileUtils.rm delete_file_for_date(date: day) }
      expect(earliest_missing).to eq first_gone
    end
  end

  describe "find file not yet moved to archive directory" do
    it "returns nil" do
      day = Date.parse("2023-11-10")
      archived = groove_file_for_date(date: day)
      unarchived = groove_file_for_date(date: day, archive: false)
      FileUtils.mv archived, unarchived
      expect(earliest_missing).to be_nil
    end
  end
end
Oops, something went wrong.