Skip to content

Commit

Permalink
DEV-967 make bib rights date independent (#28)
Browse files Browse the repository at this point in the history
* DEV-967 make bib rights date independent
- Add optional date argument to run_process_zephir_incremental.sh
- Add run_incremental_zephir.rb script to calculate a set of dates for run_process_zephir_incremental.sh
- Fix DATA_ROOT vs DATAROOT apparent typo
- Add nonzero result codes to error `exit`s
- Add `PostZephirProcessing::MonthlyInventory` class, integration and unit tests
- Add `POST_ZEPHIR_LOGGER_LEVEL` environment variable to set the default `MonthlyInventory` logger level
* DEV-1059 Resolve wide character issue writing to rights_dbm
- UTF-8 encode the TSV written to DBM in `bld_rights_db.pl`
- Add `t/bld_rights_db.t` to call the build script with a crafted value in rights_current.
* Use `system` rather than backticks to execute `run_process_zephir_incremental.sh`.
* Replace `docker-compose` with `docker compose` in the GitHub actions, README, and `test_full_file.sh`
  • Loading branch information
moseshll authored Jan 31, 2024
1 parent 7e812a7 commit e2359f4
Show file tree
Hide file tree
Showing 15 changed files with 474 additions and 21 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,12 @@ jobs:

- name: Set up tests
run: |
docker-compose build
docker compose build
- name: Run tests
run: docker-compose run test bin/test_and_cover.sh
run: docker compose run test bin/test_and_cover.sh
env:
GITHUB_TOKEN: ${{ secrets.github_token }}

- name: Run Ruby tests
run: docker compose run test rspec
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ prof_library
config/.env
config/config.pl
config/.netrc
coverage/
zephir_full_daily_a*
*.gz
local
Expand Down
2 changes: 2 additions & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
--color
--require spec_helper
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ RUN apt-get update && apt-get install -y \
bsd-mailx \
msmtp \
netcat-traditional \
pigz
pigz \
ruby-rspec \
ruby-simplecov \
ruby-simplecov-html

RUN cpanm -n \
Data::Dumper \
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,12 @@ Running Tests
Tests with limited coverage can be run with Docker.

```bash
docker-compose build
docker-compose up -d
docker-compose run --rm pz perl t/test_postZephir.t
docker compose build
docker compose up -d
docker compose run --rm pz perl t/test_postZephir.t
```

For test coverage, replace the previous `docker-compose run` with
For test coverage, replace the previous `docker compose run` with
```bash
docker-compose run --rm pz bash -c "perl -MDevel::Cover=-silent,1 t/*.t && cover -nosummary /usr/src/app/cover_db"
docker compose run --rm pz bash -c "perl -MDevel::Cover=-silent,1 t/*.t && cover -nosummary /usr/src/app/cover_db"
```
4 changes: 3 additions & 1 deletion bld_rights_db.pl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
use File::Basename;
use POSIX qw(strftime);
use DBI qw(:sql_types);
use Encode qw(encode);
use Getopt::Std;
use rightsDB;
use strict;
Expand Down Expand Up @@ -60,7 +61,8 @@
$reason_code = $reason_codes->{$reason_num};
$access_profile_code = $access_profile_codes->{$access_profile_num};
my $mdp_id = join(".", $namespace, $id);
$INDEX{"$mdp_id"} = join("\t", $attr_code, $reason_code, $source_code, $timestamp, $note, $access_profile_code);
my $data = join("\t", $attr_code, $reason_code, $source_code, $timestamp, $note, $access_profile_code);
$INDEX{"$mdp_id"} = encode("UTF-8", $data);
$outcnt++;
}
dbmclose(%INDEX);
Expand Down
2 changes: 1 addition & 1 deletion config/defaults
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export RIGHTS_DBM=${RIGHTS_DBM:-$DATA_ROOT/rights_dbm}
export INGEST_BIBRECORDS=${INGEST_BIBRECORDS:-$DATA_ROOT/ingest_bibrecords}
export RIGHTS_DIR=${RIGHTS_DIR:-$DATA_ROOT/rights}
export CATALOG_PREP=${CATALOG_PREP:-$DATA_ROOT/catalog_prep}
export CATALOG_ARCHIVE=${CATALOG_ARCHIVE:-$DATAROOT/catalog_archive}
export CATALOG_ARCHIVE=${CATALOG_ARCHIVE:-$DATA_ROOT/catalog_archive}

for dir in $TMPDIR $DATA_ROOT $ZEPHIR_DATA $INGEST_BIBRECORDS $RIGHTS_DIR $CATALOG_PREP $CATALOG_ARCHIVE;
do mkdir -pv $dir;
Expand Down
104 changes: 104 additions & 0 deletions lib/monthly_inventory.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# frozen_string_literal: true

require "date"
require "logger"

module PostZephirProcessing
  # Inventory of the datestamped files found on disk for a single month.
  #
  # For the month containing `date`, records which dates have a file in each of
  # five categories (updates, deletes, rights, groove, touched) and reports the
  # earliest date between the first of that month and `date` for which any
  # category has no file.
  class MonthlyInventory
    # Filename patterns; capture group 1 is the datestamp.
    # Anchored with \A/\z so the whole filename must match, not just one line of it.
    UPDATE_REGEXP = /\Azephir_upd_(\d{8})\.json\.gz\z/
    DELETE_REGEXP = /\Azephir_upd_(\d{8})_delete\.txt\.gz\z/
    RIGHTS_REGEXP = /\Azephir_upd_(\d{8})\.rights\z/
    GROOVE_REGEXP = /\Agroove_incremental_(\d{4}-\d{2}-\d{2})\.tsv\.gz\z/
    TOUCHED_REGEXP = /\Adaily_touched_(\d{4}-\d{2}-\d{2})\.tsv\.gz\z/
    attr_reader :date, :logger, :inventory

    # Reads the directories of interest from the environment (with hard-coded
    # fallbacks) and builds the inventory immediately.
    #
    # @param logger [Logger] defaults to STDOUT at the level given by
    #   POST_ZEPHIR_LOGGER_LEVEL (Logger::INFO when unset)
    # @param date [Date] the file datestamp date, not the "run date"
    def initialize(logger: nil, date: (Date.today - 1))
      @logger = logger || Logger.new($stdout, level: ENV.fetch("POST_ZEPHIR_LOGGER_LEVEL", Logger::INFO).to_i)
      @date = date
      @logger.info("MonthlyInventory using date #{@date}")
      # TODO: these should go in .env/Canister
      @catalog_prep_dir = ENV["CATALOG_PREP"] || "/htsolr/catalog/prep/"
      @rights_dir = ENV["RIGHTS_DIR"] || "/htapps/babel/feed/var/rights"
      @rights_archive_dir = File.join(@rights_dir, "archive")
      @ingest_bibrecords_dir = ENV["INGEST_BIBRECORDS"] || "/htapps/babel/feed/var/bibrecords"
      @ingest_bibrecords_archive_dir = File.join(@ingest_bibrecords_dir, "archive")
      @inventory = {
        zephir_update_files: zephir_update_files,
        zephir_delete_files: zephir_delete_files,
        zephir_rights_files: zephir_rights_files,
        zephir_groove_files: zephir_groove_files,
        zephir_touched_files: zephir_touched_files
      }
    end

    # Dates of zephir_upd_YYYYMMDD.json.gz files in the month of `date`.
    # @return [Array<Date>] sorted ASC
    def zephir_update_files
      directory_inventory(directory: @catalog_prep_dir, regexp: UPDATE_REGEXP)
    end

    # Dates of zephir_upd_YYYYMMDD_delete.txt.gz files in the month of `date`.
    # @return [Array<Date>] sorted ASC
    def zephir_delete_files
      directory_inventory(directory: @catalog_prep_dir, regexp: DELETE_REGEXP)
    end

    # Dates of zephir_upd_YYYYMMDD.rights files in the month of `date`,
    # from the rights directory and its archive subdirectory.
    # @return [Array<Date>] sorted ASC
    def zephir_rights_files
      directory_inventory(directory: @rights_dir, archive_directory: @rights_archive_dir, regexp: RIGHTS_REGEXP)
    end

    # Dates of groove_incremental_YYYY-MM-DD.tsv.gz files in the month of `date`,
    # from the bibrecords directory and its archive subdirectory.
    # @return [Array<Date>] sorted ASC
    def zephir_groove_files
      directory_inventory(directory: @ingest_bibrecords_dir, archive_directory: @ingest_bibrecords_archive_dir, regexp: GROOVE_REGEXP)
    end

    # Dates of daily_touched_YYYY-MM-DD.tsv.gz files in the month of `date`,
    # from the bibrecords directory and its archive subdirectory.
    # @return [Array<Date>] sorted ASC
    def zephir_touched_files
      directory_inventory(directory: @ingest_bibrecords_dir, archive_directory: @ingest_bibrecords_archive_dir, regexp: TOUCHED_REGEXP)
    end

    # Iterate over the parts of the inventory separately.
    # Find the earliest (min) date missing (if any) from each.
    # If a date is missing in any one of them then it is a do-over candidate.
    # @return [Date,nil] nil when no category is missing any date
    def earliest_missing_date
      # `min` of an empty difference is nil, which filter_map drops.
      inventory.values.filter_map { |dates| (all_dates - dates).min }.min
    end

    # Beginning of month to "present" (i.e. `date`), inclusive.
    # @return [Array<Date>] sorted ASC
    def all_dates
      @all_dates ||= (Date.new(date.year, date.month, 1)..date).to_a
    end

    private

    # Run regexp against the contents of directory and collect the datestamps
    # of matching files that fall in the month of interest.
    # Do the same for the archive directory if it is supplied.
    # Does not attempt to iterate nonexistent directory.
    # @param directory [String] primary directory to scan
    # @param regexp [Regexp] filename pattern whose first capture is the datestamp
    # @param archive_directory [String,nil] optional second directory to scan
    # @return [Array<Date>] sorted ASC, without duplicates
    def directory_inventory(directory:, regexp:, archive_directory: nil)
      [directory, archive_directory].compact.uniq
        .select { |dir| File.directory?(dir) }
        .flat_map { |dir| Dir.children(dir) }
        .filter_map { |filename| datestamp(filename: filename, regexp: regexp) }
        .select { |file_date| in_month_of_interest?(file_date) }
        .sort
        .uniq
    end

    # Extract the datestamp from a filename.
    # A filename that matches the pattern but carries an impossible date
    # (e.g. month 13) is logged and skipped rather than aborting the inventory.
    # @return [Date,nil] nil when the filename does not match or the date is invalid
    def datestamp(filename:, regexp:)
      match = regexp.match(filename)
      return unless match

      Date.parse(match[1])
    rescue ArgumentError
      # Date::Error is a subclass of ArgumentError.
      logger.warn("MonthlyInventory ignoring #{filename}: invalid datestamp #{match[1]}")
      nil
    end

    # @return [Boolean] true when file_date is in the same month and year as `date`
    def in_month_of_interest?(file_date)
      file_date.month == date.month && file_date.year == date.year
    end
  end
end
31 changes: 31 additions & 0 deletions run_incremental_zephir.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# frozen_string_literal: true

require "date"
require "logger"
require_relative "lib/monthly_inventory"

# Driver script: works out which datestamps in the month containing yesterday
# are missing output files, then runs run_process_zephir_incremental.sh once
# per date from the earliest missing date through yesterday.
# Exits 0 when nothing is missing, 1 as soon as one run fails.
logger = Logger.new($stdout)
HOME = File.expand_path(__dir__).freeze
INCREMENTAL_SCRIPT = File.join(HOME, "run_process_zephir_incremental.sh")
YESTERDAY = Date.today - 1

inventory = PostZephirProcessing::MonthlyInventory.new(logger: logger, date: YESTERDAY)
logger.debug "all existing Zephir updates: #{inventory.inventory}"

# Compute once; the same value decides whether to run and where to start.
earliest_missing_date = inventory.earliest_missing_date
if earliest_missing_date.nil?
  logger.info "no Zephir files to process, exiting"
  exit 0
end

dates = (earliest_missing_date..YESTERDAY)
logger.debug "Processing Zephir files for: #{dates}"
dates.each do |date|
  date_str = date.strftime("%Y%m%d")
  # Argument-list form: the script path is never interpreted by a shell,
  # so a checkout path containing spaces or metacharacters still works.
  cmd = [INCREMENTAL_SCRIPT, date_str]
  cmd_display = cmd.join(" ")
  logger.debug "Calling '#{cmd_display}'"
  # Bail out if `system` returns false or nil
  unless system(*cmd)
    logger.error "exitstatus #{$?.exitstatus} from '#{cmd_display}'"
    exit 1
  end
end
33 changes: 23 additions & 10 deletions run_process_zephir_incremental.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,26 @@
#!/bin/bash

# Call this script with:
# - exactly one date argument of the form YYYYMMDD
# OR
# - no arguments to use yesterday's date

if [[ $# -eq 0 ]]; then
YESTERDAY=`date --date="yesterday" +%Y%m%d`
else
if [[ "$1" =~ ^[0-9]{8}$ ]]; then
YESTERDAY=$1
else
echo "Invalid date format '$1', need YYYYMMDD"
exit 1
fi
fi

source $ROOTDIR/config/defaults
cd $TMPDIR

SCRIPTNAME=`basename $0`
zephir_date=`date --date="yesterday" +%Y-%m-%d`
YESTERDAY=`date --date="yesterday" +%Y%m%d`
TODAY=`date +%Y%m%d`
today_dash=`date +%Y-%m-%d`
zephir_date="$(echo $YESTERDAY | sed 's/\(....\)\(..\)/\1-\2-/')"

export us_fed_pub_exception_file="$FEDDOCS_HOME/feddocs_oclc_filter/oclcs_removed_from_registry.txt"

Expand Down Expand Up @@ -40,7 +53,7 @@ if [ ! -e $ZEPHIR_VUFIND_EXPORT ]; then
message="file $ZEPHIR_VUFIND_EXPORT not found, exitting"
echo "error, message is $message"
echo $message | mailx -s"error in $SCRIPTNAME" $EMAIL
exit
exit 1
fi

echo "`date`: retrieve $ZEPHIR_VUFIND_DELETE"
Expand All @@ -52,14 +65,14 @@ if [ $cmdstatus != "0" ]; then
message="Problem getting file ${ZEPHIR_VUFIND_DELETE} from zephir: rc is $cmdstatus"
echo "error, message is $message"
echo $message | mailx -s"error in $SCRIPTNAME" $EMAIL
exit
exit 1
fi

if [ ! -e $ZEPHIR_VUFIND_DELETE ]; then
message="file $ZEPHIR_VUFIND_DELETE not found, exitting"
echo "error, message is $message"
echo $message | mailx -s"error in $SCRIPTNAME" $EMAIL
exit
exit 1
fi

echo "`date`: retrieve $ZEPHIR_GROOVE_INCREMENTAL"
Expand Down Expand Up @@ -113,7 +126,7 @@ cmdstatus=$?
if [ $cmdstatus != "0" ]; then
message="Problem transferring file ${BASENAME}.json.gz to $CATALOG_PREP: rc is $cmdstatus"
echo $message
exit
exit 1
fi

# copy to ht archive directory
Expand All @@ -126,7 +139,7 @@ cmdstatus=$?
if [ $cmdstatus != "0" ]; then
message="Problem transferring file $ZEPHIR_VUFIND_DELETE to $CATALOG_PREP: rc is $cmdstatus"
echo $message
exit
exit 1
fi

echo "`date`: compress dollar dup files and send to zephir"
Expand All @@ -138,7 +151,7 @@ cmdstatus=$?
if [ $cmdstatus != "0" ]; then
message="Problem sending file ${ZEPHIR_VUFIND_DOLL_D}.gz to zephir: rc is $cmdstatus"
echo $message
exit
exit 1
fi

# This should have already been copied to the archive/catalog
Expand Down
67 changes: 67 additions & 0 deletions spec/integration/monthly_inventory_integration_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# frozen_string_literal: true

require "date"
require "fileutils"
require "tmpdir"

# Integration examples for MonthlyInventory#earliest_missing_date.
# Each example runs inside a throwaway temp directory that the setup helpers
# populate, removes or relocates selected files, and then checks the result
# for an inventory dated 2023-11-30.
# NOTE(review): setup_test_dirs, setup_test_files and the *_file_for_date
# helpers are defined outside this file (presumably spec_helper) - confirm.
RSpec.describe "MonthlyInventory Integration" do
  # Evaluated lazily, so the inventory is built only after an example has
  # finished rearranging its fixture files.
  let(:earliest_missing_date) do
    PostZephirProcessing::MonthlyInventory
      .new(date: Date.parse("2023-11-30"))
      .earliest_missing_date
  end

  around do |example|
    Dir.mktmpdir do |scratch_dir|
      setup_test_dirs(parent_dir: scratch_dir)
      setup_test_files(date: Date.parse("2023-11-01"))
      example.run
    end
  end

  describe "all files present" do
    it "returns nil" do
      expect(earliest_missing_date).to be_nil
    end
  end

  describe "one date missing" do
    it "returns the earliest" do
      missing = Date.parse("2023-11-03")
      FileUtils.rm rights_file_for_date(date: missing)
      expect(earliest_missing_date).to eq missing
    end
  end

  describe "different date in each category missing" do
    it "returns the earliest" do
      # One hole per category; the touched file has the earliest date.
      doomed_files = [
        delete_file_for_date(date: Date.parse("2023-11-20")),
        update_file_for_date(date: Date.parse("2023-11-19")),
        rights_file_for_date(date: Date.parse("2023-11-18")),
        groove_file_for_date(date: Date.parse("2023-11-17")),
        touched_file_for_date(date: Date.parse("2023-11-16"))
      ]
      doomed_files.each { |path| FileUtils.rm path }
      expect(earliest_missing_date).to eq Date.parse("2023-11-16")
    end
  end

  describe "multiple dates missing" do
    it "returns the earliest" do
      missing_range = (Date.parse("2023-11-26")..Date.parse("2023-11-30"))
      missing_range.each { |day| FileUtils.rm delete_file_for_date(date: day) }
      expect(earliest_missing_date).to eq missing_range.first
    end
  end

  describe "find file not yet moved to archive directory" do
    it "returns nil" do
      day = Date.parse("2023-11-10")
      archived_path = groove_file_for_date(date: day)
      unarchived_path = groove_file_for_date(date: day, archive: false)
      FileUtils.mv archived_path, unarchived_path
      expect(earliest_missing_date).to be_nil
    end
  end
end
Loading

0 comments on commit e2359f4

Please sign in to comment.