Addresses #52 profile full monthly generation (#55)
* Addresses #52 Profile full monthly generation
- Experiment with removing one-by-one rights retrieval in favor of a Sequel batch query over multiple HTIDs (see the sketch below)
  - Add `HathifileWriter` class with a limit on the number of HTIDs per batch rights query
- Use Ettin Settings for the DB connection string instead of reading `ENV` directly
  - Connect to `ht` instead of `ht_rights` so `ht.ht_collections` can be read
- Add `mariadb-client` to both dockerfiles
- Update to `hathifiles_database` 0.3.0
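
A rough sketch of what the batch approach means in Sequel terms (illustrative only; the `rights_current` table and its `namespace`/`id`/`attr`/`reason`/`time` columns are assumptions about the rights schema, not code from this PR):

```ruby
require "sequel"

# Resolve rights for a whole slice of HTIDs with one query instead of
# one query per HTID. Table and column names are assumed for illustration.
def batch_rights(db, htids)
  # "mdp.39015012345678" -> ["mdp", "39015012345678"]
  pairs = htids.map { |htid| htid.split(".", 2) }

  db[:rights_current]
    .where([:namespace, :id] => pairs) # (namespace, id) IN ((...), (...), ...)
    .select(:namespace, :id, :attr, :reason, :time)
    .all
    .to_h { |row| ["#{row[:namespace]}.#{row[:id]}", row] }
end
```

The actual batching, including the cap on HTIDs per query, lives in the new `HathifileWriter` class.
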
moseshll authored Aug 19, 2024
1 parent 2957946 commit bcbfa9f
Showing 24 changed files with 232 additions and 473 deletions.
4 changes: 1 addition & 3 deletions Dockerfile
@@ -4,9 +4,7 @@ ARG UID=1000
ARG GID=1000

RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends \
nodejs \
netcat-traditional

mariadb-client

# COPY Gemfile* /usr/src/app/
WORKDIR /usr/src/app
3 changes: 3 additions & 0 deletions Dockerfile.prod
@@ -5,6 +5,9 @@ ARG UNAME=holdings
ARG UID=1000
ARG GID=1000

RUN apt-get update -yqq && apt-get install -yqq --no-install-recommends \
mariadb-client

RUN gem install bundler
RUN groupadd -g $GID -o $UNAME
RUN useradd -m -d /usr/src/app -u $UID -g $GID -o -s /bin/bash $UNAME
1 change: 1 addition & 0 deletions Gemfile
@@ -8,6 +8,7 @@ gem "canister"
gem "date_named_file"
gem "dotenv"
gem "ettin"
gem "fast_jsonparser"
gem "httpclient"
gem "marc"
gem "milemarker"
10 changes: 6 additions & 4 deletions Gemfile.lock
@@ -7,10 +7,10 @@ GIT

GIT
remote: https://github.com/hathitrust/hathifiles_database.git
revision: 9c33e648ffc171d665bd46999d00e968560b2451
revision: 66cb21e0e31c7f491ce2669f70fee79b79db3def
branch: main
specs:
hathifiles_database (0.2.2)
hathifiles_database (0.3.0)
date_named_file
dotenv
ettin
@@ -74,6 +74,7 @@ GEM
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
faraday-net_http (3.0.2)
fast_jsonparser (0.6.0)
ffi (1.16.3)
ffi-compiler (1.0.1)
ffi (>= 1.0.0)
@@ -168,7 +169,7 @@ GEM
ruby-progressbar (1.13.0)
ruby2_keywords (0.0.5)
scrub_rb (1.0.1)
sequel (5.75.0)
sequel (5.82.0)
bigdecimal
simplecov (0.22.0)
docile (~> 1.1)
@@ -238,6 +239,7 @@ DEPENDENCIES
ettin
factory_bot
faraday
fast_jsonparser
filter!
hathifiles_database!
httpclient
@@ -255,4 +257,4 @@ DEPENDENCIES
zinzout

BUNDLED WITH
2.4.22
2.5.15
2 changes: 1 addition & 1 deletion docker-compose.yml
@@ -18,7 +18,7 @@ services:
mariadb: *healthy
pushgateway: *healthy
environment:
DB_CONNECTION_STRING: "mysql2://ht_rights:ht_rights@mariadb/ht"
MYSQL_CONNECTION_STRING: "mysql2://ht_rights:ht_rights@mariadb/ht"
PUSHGATEWAY: http://pushgateway:9091

mariadb:
74 changes: 14 additions & 60 deletions jobs/generate_hathifile.rb
@@ -1,18 +1,21 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bib_record"
require "date"
require "settings"
require "services"
$LOAD_PATH.unshift File.expand_path(Pathname.new("#{File.dirname(__FILE__)}/../lib"))

require "push_metrics"

require "hathifile_writer"
require "services"
require "settings"
require "zephir_files"

class GenerateHathifile
attr_reader :tracker

def initialize
@tracker = PushMetrics.new(batch_size: 10_000, job_name: "generate_hathifiles")
@tracker = PushMetrics.new(batch_size: 10_000, job_name: "generate_hathifiles",
logger: Services["logger"])
end

def run
@@ -36,64 +39,15 @@ def run_file(zephir_file)
else
File.open(infile)
end

outfile = File.join(Settings.hathifiles_dir, zephir_file.hathifile)
Services[:logger].info "Outfile: #{outfile}"

Tempfile.create("hathifiles") do |fout|
Services[:logger].info "writing to tempfile #{fout.path}"
fin.each_with_index do |line, i|
if i % 100_000 == 0
Services[:logger].info "writing line #{i}"
end
BibRecord.new(line).hathifile_records.each do |rec|
fout.puts record_from_bib_record(rec).join("\t")
end
tracker.increment_and_log_batch_line
end
fout.flush
Services[:logger].info "Gzipping: #{fout.path}"
system("gzip #{fout.path}")
gzfile = fout.path + ".gz"
# Move tempfile into place
Services[:logger].info "Moving tempfile #{gzfile} -> #{outfile}"
FileUtils.mv(gzfile, outfile)
Services[:logger].info "Setting 0644 permissions on #{outfile}"
FileUtils.chmod(0o644, outfile)
writer = HathifileWriter.new(hathifile: zephir_file.hathifile)
fin.each do |line|
records = BibRecord.new(line).hathifile_records.to_a
writer.add records
tracker.increment_and_log_batch_line
end
writer.finish
fin.close
end

def record_from_bib_record(rec)
[
rec[:htid],
rec[:access],
rec[:rights],
rec[:ht_bib_key],
rec[:description],
(rec[:source] || ""),
(rec[:source_bib_num].join(",") || ""),
rec[:oclc_num].join(","),
rec[:isbn].join(","),
rec[:issn].join(","),
rec[:lccn].join(","),
rec[:title].join(","),
rec[:imprint].join(", "),
(rec[:rights_reason_code] || ""),
(rec[:rights_timestamp]&.strftime("%Y-%m-%d %H:%M:%S") || ""),
rec[:us_gov_doc_flag],
rec[:rights_date_used],
rec[:pub_place],
rec[:lang],
rec[:bib_fmt],
(rec[:collection_code] || ""),
(rec[:content_provider_code] || ""),
(rec[:responsible_entity_code] || ""),
(rec[:digitization_agent_code] || ""),
(rec[:access_profile] || ""),
(rec[:author].join(", ") || "")
]
end
end

# Force logger to flush STDOUT on write so we can see what our Argo Workflows are doing.
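
The loop above only needs `add` and `finish` from the writer. A hypothetical sketch of that interface (not the contents of `lib/hathifile_writer.rb`, which is among the files not shown here; `BATCH_SIZE` and the flush internals are invented for illustration):

```ruby
class HathifileWriter
  BATCH_SIZE = 10_000 # assumed cap on HTIDs per batch rights query

  def initialize(hathifile:)
    @hathifile = hathifile # output name under Settings.hathifiles_dir
    @batch = []
  end

  # Buffer records; flush whenever the batch reaches the HTID cap.
  def add(records)
    @batch.concat(records)
    flush if @batch.size >= BATCH_SIZE
  end

  # Flush the remainder, then gzip the tempfile and move it into place --
  # the work the old inline code in generate_hathifile.rb used to do.
  def finish
    flush
  end

  private

  # Look up rights for every HTID in the batch with a single Sequel query,
  # then write one tab-separated hathifile line per record.
  def flush
    return if @batch.empty?
    # rights = batch_rights(Services.db, @batch.map { |rec| rec[:htid] })
    @batch.clear
  end
end
```
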
3 changes: 2 additions & 1 deletion lib/bib_record.rb
@@ -3,6 +3,7 @@
require "marc"
require "traject"
require "traject/macros/marc21_semantics"
require "fast_jsonparser"
require "json"
require "place_of_publication"
require "us_fed_doc"
@@ -39,7 +40,7 @@ def self.bib_fmt(rec_type:, bib_level:)
end

def initialize(marc_in_json)
@marc = MARC::Record.new_from_hash(JSON.parse(marc_in_json))
@marc = MARC::Record.new_from_hash FastJsonparser.parse(marc_in_json, symbolize_keys: false)
end

def ht_bib_key
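
The parser swap keeps string keys, so `MARC::Record.new_from_hash` receives the same structure `JSON.parse` produced; `fast_jsonparser` (a simdjson binding) is presumably adopted for speed on large MARC-in-JSON lines. A minimal equivalence check:

```ruby
require "json"
require "fast_jsonparser"

line = '{"leader":"00000cam a2200000 a 4500","fields":[]}'
JSON.parse(line) == FastJsonparser.parse(line, symbolize_keys: false) # => true
```
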
2 changes: 1 addition & 1 deletion lib/collections_database/collections.rb
@@ -36,7 +36,7 @@ def initialize(collections = load_from_db)
end

def load_from_db
Services.collections_db[:ht_collections]
Services.db[:ht_collections]
.select(:collection,
:content_provider_cluster,
:responsible_entity,
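
The change from `Services.collections_db` to `Services.db` implies a single shared connection to the `ht` database. A hypothetical Canister wiring (`lib/services.rb` is not among the lines shown here):

```ruby
require "canister"
require "logger"
require "database"

Services = Canister.new
# One connection to `ht` serves both rights and ht_collections lookups.
Services.register(:db) { Database.new }
Services.register(:logger) { Logger.new($stdout) }
```
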
64 changes: 0 additions & 64 deletions lib/collections_database/collections_db.rb

This file was deleted.

62 changes: 62 additions & 0 deletions lib/database.rb
@@ -0,0 +1,62 @@
# frozen_string_literal: true

require "dotenv"
Dotenv.load(".env")

require "delegate"
require "mysql2"
require "sequel"
require "settings"

# Backend for connection to MySQL database for production information about
# rights
class Database < SimpleDelegator
attr_reader :rawdb
attr_accessor :connection_string

def initialize(connection_string = Settings.database.url, **)
@rawdb = self.class.connection(connection_string, **)
super(@rawdb)
end

# #connection will take
# * a full connection string (passed here OR in Settings which is generally the environment
# variable MYSQL_CONNECTION_STRING)
# * a set of named arguments, drawn from those passed in and the
# environment. Arguments are those supported by Sequel.
#
# Environment variables are mapped as follows:
#
# user: DB_USER
# password: DB_PASSWORD
# host: DB_HOST
# port: DB_PORT
# database: DB_DATABASE
# adapter: DB_ADAPTER
def self.connection(connection_string = Settings.database.url, **kwargs)
if connection_string.nil?
db_args = gather_db_args(kwargs).merge(
config_local_infile: true
)
Sequel.connect(**db_args, logger: Logger.new($stdout, level: Logger::WARN))
else
Sequel.connect(connection_string, logger: Logger.new($stdout, level: Logger::WARN))
end
end

class << self
private

def gather_db_args(args)
%i[user password host
port database adapter].each do |db_arg|
args[db_arg] ||= ENV["DB_#{db_arg.to_s.upcase}"]
end

args[:host] ||= "localhost"
args[:adapter] ||= :mysql2
args[:database] ||= "ht"
args
end
end
end
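
A quick usage sketch for the new class: with `MYSQL_CONNECTION_STRING` set (or the `DB_*` variables as a fallback), the delegator behaves like an ordinary Sequel database handle.

```ruby
require "database"

db = Database.new                            # reads Settings.database.url by default
db[:ht_collections].select(:collection).all  # SimpleDelegator forwards to the Sequel DB
```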