From c9b805adf3cd55dc7eaee6b34fc6f2150c35f075 Mon Sep 17 00:00:00 2001 From: Brian Moses Hall Date: Tue, 30 Apr 2024 13:46:42 -0400 Subject: [PATCH 1/2] TTO-207 investigate discrepancy in hathifiles full vs. incremental `cutoff` variable used in `generate_hathifile.rb` was excluding HTIDs that arguably changed due to changes in the catalog record. This patch gets rid of that variable and treats all HTIDs as having been affected by the record change (thus including them in the resulting hathifile) regardless of their 974(d) update date. Tests should be refactored at some point but I consider it out of scope for a TTO issue. --- jobs/generate_hathifile.rb | 16 +--------------- spec/data/000018677-20220808-upd.tsv | 1 + 2 files changed, 2 insertions(+), 15 deletions(-) diff --git a/jobs/generate_hathifile.rb b/jobs/generate_hathifile.rb index b107f41..2c78b32 100644 --- a/jobs/generate_hathifile.rb +++ b/jobs/generate_hathifile.rb @@ -37,27 +37,13 @@ def run_file(zephir_file) File.open(infile) end - # We only want to write some of the items in the zephr records: - # For upd files we only include entries dated on or after the - # datestamp in the Zephir file name. - # For full files we want everything; cutoff defaults to nil, which - # short-circuits the cutoff check. - cutoff = if zephir_file.type == "upd" - zephir_file.date - end - outfile = File.join(Settings.hathifiles_dir, zephir_file.hathifile) - Services[:logger].info "Outfile: #{outfile}" - Services[:logger].info "Cutoff: #{cutoff.inspect}" Tempfile.create do |fout| fin.each do |line| BibRecord.new(line).hathifile_records.each do |rec| - record_date = Date.parse rec[:update_date] - if cutoff.nil? || record_date >= cutoff - fout.puts record_from_bib_record(rec).join("\t") - end + fout.puts record_from_bib_record(rec).join("\t") end tracker.increment_and_log_batch_line end diff --git a/spec/data/000018677-20220808-upd.tsv b/spec/data/000018677-20220808-upd.tsv index 800b211..e3bbc7c 100644 --- a/spec/data/000018677-20220808-upd.tsv +++ b/spec/data/000018677-20220808-upd.tsv @@ -1 +1,2 @@ mdp.39015027625402 deny ic 000018677 MIU 990000186770106381 1613293 66014593 Go up for glory, by Bill Russell, as told to William McSweeny. Coward-McCann [1966] bib 0 1966 eng BK MIU umich umich google Russell, Bill, 1934-2022. +mdp.39015003746396 deny ic 000018677 MIU 990000186770106381 1613293 66014593 Go up for glory, by Bill Russell, as told to William McSweeny. Coward-McCann [1966] bib 0 1966 eng BK MIU umich umich google Russell, Bill, 1934-2022. From 42eda5954d1a0fa1b85fbd9a09b95a6eacee3859 Mon Sep 17 00:00:00 2001 From: Brian Moses Hall Date: Tue, 30 Apr 2024 22:45:51 -0400 Subject: [PATCH 2/2] Appease Coveralls --- spec/jobs/update_hathifile_listing_spec.rb | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/spec/jobs/update_hathifile_listing_spec.rb b/spec/jobs/update_hathifile_listing_spec.rb index a344336..2749a94 100644 --- a/spec/jobs/update_hathifile_listing_spec.rb +++ b/spec/jobs/update_hathifile_listing_spec.rb @@ -94,5 +94,19 @@ expect(metrics).to match(/^job_last_success\S*job="update_hathifile_listing"\S* \S+/m) .and match(/^job_records_processed\S*job="update_hathifile_listing"\S* [^0]\d*$/m) end + + it "removes existing files that are too old" do + # Make some files that are about half a year old + old_files = [] + 5.times do |i| + old_file = File.join(@tmp_web_dir, "/hathi_upd_#{(Date.today - (180 + i)).strftime("%Y%m%d")}.txt.gz") + FileUtils.touch(old_file) + old_files << old_file + end + hflist.run + old_files.each do |old_file| + expect(File.exist?(old_file)).to eq(false) + end + end end end