From 7e3ef9de2b1bbb83655a728058fe3f7299f73ea4 Mon Sep 17 00:00:00 2001 From: Brian Moses Hall Date: Thu, 18 Jul 2024 11:29:49 -0400 Subject: [PATCH] Addresses #52 Profile full monthly generation - Experiment with removing one-by-one rights retrieval in favor of Sequel batch query for multiple htids. --- Gemfile | 1 + Gemfile.lock | 2 + jobs/generate_hathifile.rb | 67 +++++++++++++++++-- lib/bib_record.rb | 3 +- lib/collections_database/collections.rb | 5 +- lib/collections_database/collections_db.rb | 64 ------------------ lib/database.rb | 63 +++++++++++++++++ lib/hathifiles.rb | 2 +- lib/item_record.rb | 31 ++------- lib/rights_database/access_profiles.rb | 39 ----------- lib/rights_database/rights.rb | 43 ------------ lib/rights_database/rights_attributes.rb | 39 ----------- lib/rights_database/rights_db.rb | 67 ------------------- lib/rights_database/rights_reasons.rb | 37 ---------- lib/services.rb | 25 +++---- .../collections_db_spec.rb | 39 ----------- .../rights_db_spec.rb => database_spec.rb} | 4 +- spec/item_record_spec.rb | 19 ------ spec/jobs/generate_hathifile_spec.rb | 10 +++ spec/rights_database/rights_spec.rb | 14 ---- 20 files changed, 162 insertions(+), 412 deletions(-) delete mode 100644 lib/collections_database/collections_db.rb create mode 100644 lib/database.rb delete mode 100644 lib/rights_database/access_profiles.rb delete mode 100644 lib/rights_database/rights.rb delete mode 100644 lib/rights_database/rights_attributes.rb delete mode 100644 lib/rights_database/rights_db.rb delete mode 100644 lib/rights_database/rights_reasons.rb delete mode 100644 spec/collections_database/collections_db_spec.rb rename spec/{rights_database/rights_db_spec.rb => database_spec.rb} (91%) delete mode 100644 spec/rights_database/rights_spec.rb diff --git a/Gemfile b/Gemfile index 5875ddb..9b4167a 100644 --- a/Gemfile +++ b/Gemfile @@ -8,6 +8,7 @@ gem "canister" gem "date_named_file" gem "dotenv" gem "ettin" +gem "fast_jsonparser" gem "httpclient" gem "marc" gem "milemarker" diff --git a/Gemfile.lock b/Gemfile.lock index 1c812cd..1d4ad1b 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -74,6 +74,7 @@ GEM faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) faraday-net_http (3.0.2) + fast_jsonparser (0.6.0) ffi (1.16.3) ffi-compiler (1.0.1) ffi (>= 1.0.0) @@ -238,6 +239,7 @@ DEPENDENCIES ettin factory_bot faraday + fast_jsonparser filter! hathifiles_database! httpclient diff --git a/jobs/generate_hathifile.rb b/jobs/generate_hathifile.rb index eb50c72..935cdda 100644 --- a/jobs/generate_hathifile.rb +++ b/jobs/generate_hathifile.rb @@ -6,13 +6,15 @@ require "settings" require "services" require "push_metrics" +require "sequel" require "zephir_files" class GenerateHathifile attr_reader :tracker def initialize - @tracker = PushMetrics.new(batch_size: 10_000, job_name: "generate_hathifiles") + @tracker = PushMetrics.new(batch_size: 10_000, job_name: "generate_hathifiles", + logger: Services["logger"]) end def run @@ -42,14 +44,24 @@ def run_file(zephir_file) Tempfile.create("hathifiles") do |fout| Services[:logger].info "writing to tempfile #{fout.path}" - fin.each_with_index do |line, i| - if i % 100_000 == 0 - Services[:logger].info "writing line #{i}" + fin.each_slice(1000) do |lines| + recs = [] + lines.each do |line| + recs += BibRecord.new(line).hathifile_records.to_a end - BibRecord.new(line).hathifile_records.each do |rec| + htids = recs.map { |rec| rec[:htid] } + htids_to_rights = batch_extract_rights(htids) + last_bib_key = nil + recs.each do |rec| + rights = htids_to_rights[rec[:htid]] || {} + rec[:rights_timestamp] = rights[:rights_timestamp] + rec[:access_profile] = rights[:access_profile] fout.puts record_from_bib_record(rec).join("\t") + if last_bib_key != rec[:ht_bib_key] + tracker.increment_and_log_batch_line + last_bib_key = rec[:ht_bib_key] + end end - tracker.increment_and_log_batch_line end fout.flush Services[:logger].info "Gzipping: #{fout.path}" @@ -94,6 +106,49 @@ def record_from_bib_record(rec) (rec[:author].join(", ") || "") ] end + + # Map htid -> rights for this batch + def batch_extract_rights(htids) + htids_to_rights = {} + Services.db[:rights_current] + .join(:access_profiles, id: Sequel[:rights_current][:access_profile]) + .select( + Sequel.as(qualified_rights_current_htid, :htid), + Sequel.as(qualified_rights_current_time, :rights_timestamp), + Sequel.as(Sequel.qualify(:access_profiles, :name), :access_profile) + ) + .where(qualified_rights_current_htid => htids) + .each do |record| + htids_to_rights[record[:htid]] = { + rights_timestamp: record[:rights_timestamp], + access_profile: record[:access_profile] + } + end + htids_to_rights + end + + private + + def qualified_rights_current_namespace + @qualified_rights_current_namespace ||= Sequel.qualify(:rights_current, :namespace) + end + + def qualified_rights_current_id + @qualified_rights_current_id ||= Sequel.qualify(:rights_current, :id) + end + + def qualified_rights_current_htid + @qualified_rights_current_htid = Sequel.function( + :concat, + qualified_rights_current_namespace, + ".", + qualified_rights_current_id + ) + end + + def qualified_rights_current_time + @qualified_rights_current_time ||= Sequel.qualify(:rights_current, :time) + end end # Force logger to flush STDOUT on write so we can see what out Argo Workflows are doing. diff --git a/lib/bib_record.rb b/lib/bib_record.rb index e67c78f..c23f5fe 100644 --- a/lib/bib_record.rb +++ b/lib/bib_record.rb @@ -3,6 +3,7 @@ require "marc" require "traject" require "traject/macros/marc21_semantics" +require "fast_jsonparser" require "json" require "place_of_publication" require "us_fed_doc" @@ -39,7 +40,7 @@ def self.bib_fmt(rec_type:, bib_level:) end def initialize(marc_in_json) - @marc = MARC::Record.new_from_hash(JSON.parse(marc_in_json)) + @marc = MARC::Record.new_from_hash FastJsonparser.parse(marc_in_json, symbolize_keys: false) end def ht_bib_key diff --git a/lib/collections_database/collections.rb b/lib/collections_database/collections.rb index 426c91b..d495099 100644 --- a/lib/collections_database/collections.rb +++ b/lib/collections_database/collections.rb @@ -36,7 +36,7 @@ def initialize(collections = load_from_db) end def load_from_db - Services.collections_db[:ht_collections] + Services.db[:ht_collections] .select(:collection, :content_provider_cluster, :responsible_entity, @@ -50,7 +50,8 @@ def [](collection) if @collections.key?(collection) @collections[collection] else - raise KeyError, "No collection data for collection:#{collection}" + nil + # raise KeyError, "No collection data for collection:#{collection}" end end diff --git a/lib/collections_database/collections_db.rb b/lib/collections_database/collections_db.rb deleted file mode 100644 index 9df0bf3..0000000 --- a/lib/collections_database/collections_db.rb +++ /dev/null @@ -1,64 +0,0 @@ -# frozen_string_literal: true - -require "delegate" -require "mysql2" -require "sequel" -require "services" -require "tempfile" - -module CollectionsDatabase - # Backend for connection to MySQL database for production information about - # collections - class CollectionsDB < SimpleDelegator - attr_reader :rawdb - attr_accessor :connection_string - - def initialize(connection_string = ENV["DB_CONNECTION_STRING"], **) - @rawdb = self.class.connection(connection_string, **) - super(@rawdb) - end - - # #connection will take - # * a full connection string (passed here OR in the environment - # variable MYSQL_CONNECTION_STRING) - # * a set of named arguments, drawn from those passed in and the - # environment. Arguments are those supported by Sequel. - # - # Environment variables are mapped as follows: - # - # user: DB_USER - # password: DB_PASSWORD - # host: DB_HOST - # port: DB_PORT - # database: DB_DATABASE - # adapter: DB_ADAPTER - def self.connection(connection_string = ENV["DB_CONNECTION_STRING"], - **kwargs) - - if connection_string.nil? - db_args = gather_db_args(kwargs).merge( - config_local_infile: true - ) - Sequel.connect(**db_args) - else - Sequel.connect(connection_string) - end - end - - class << self - private - - def gather_db_args(args) - %i[user password host - port database adapter].each do |db_arg| - args[db_arg] ||= ENV["DB_#{db_arg.to_s.upcase}"] - end - - args[:host] ||= "localhost" - args[:adapter] ||= :mysql2 - args[:database] ||= "ht_repository" - args - end - end - end -end diff --git a/lib/database.rb b/lib/database.rb new file mode 100644 index 0000000..5f9201d --- /dev/null +++ b/lib/database.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true + +require "dotenv" +Dotenv.load(".env") + +require "delegate" +require "mysql2" +require "sequel" + +# Backend for connection to MySQL database for production information about +# rights +class Database < SimpleDelegator + attr_reader :rawdb + attr_accessor :connection_string + + def initialize(connection_string = ENV["DB_CONNECTION_STRING"], **) + @rawdb = self.class.connection(connection_string, **) + super(@rawdb) + end + + # #connection will take + # * a full connection string (passed here OR in the environment + # variable MYSQL_CONNECTION_STRING) + # * a set of named arguments, drawn from those passed in and the + # environment. Arguments are those supported by Sequel. + # + # Environment variables are mapped as follows: + # + # user: DB_USER + # password: DB_PASSWORD + # host: DB_HOST + # port: DB_PORT + # database: DB_DATABASE + # adapter: DB_ADAPTER + def self.connection(connection_string = ENV["DB_CONNECTION_STRING"], + **kwargs) + + if connection_string.nil? + db_args = gather_db_args(kwargs).merge( + config_local_infile: true + ) + Sequel.connect(**db_args, logger: Logger.new($stdout, level: Logger::WARN)) + else + Sequel.connect(connection_string, logger: Logger.new($stdout, level: Logger::WARN)) + end + end + + class << self + private + + def gather_db_args(args) + %i[user password host + port database adapter].each do |db_arg| + args[db_arg] ||= ENV["DB_#{db_arg.to_s.upcase}"] + end + + args[:host] ||= "localhost" + args[:adapter] ||= :mysql2 + args[:database] ||= "ht_rights" + args + end + end +end diff --git a/lib/hathifiles.rb b/lib/hathifiles.rb index 757ee41..383e109 100644 --- a/lib/hathifiles.rb +++ b/lib/hathifiles.rb @@ -1,8 +1,8 @@ # frozen_string_literal: true require_relative "bib_record" -require_relative "collections_database/collections_db" require_relative "collections_database/collections" +require_relative "database" require_relative "hathifile_history/records" require_relative "hathifile_history/record" require_relative "hathifile_history/htid_history_entry" diff --git a/lib/item_record.rb b/lib/item_record.rb index cb2a42c..940d47a 100644 --- a/lib/item_record.rb +++ b/lib/item_record.rb @@ -59,10 +59,6 @@ def update_date @update_date ||= marc["d"] end - def rights_timestamp - @rights_timestamp ||= rights_from_db.time - end - def rights_determination_note @rights_determination_note ||= marc["t"] end @@ -77,27 +73,18 @@ def collection_code end def responsible_entity_code - @responsible_entity_code ||= Services.collections[collection_code].responsible_entity + @responsible_entity_code ||= Services.collections[collection_code]&.responsible_entity end def digitization_agent_code @digitization_agent_code ||= marc["s"] end - # From the rights database + # From the database def content_provider_code @content_provider_code ||= Services.collections[collection_code]&.content_provider_cluster || collection_code end - # From the rights database - def access_profile_code - rights_from_db.access_profile - end - - def access_profile - Services.access_profiles[access_profile_code]&.name || access_profile_code - end - def to_h {htid: htid, access: access, @@ -106,20 +93,14 @@ def to_h source: source, source_bib_num: source_bib_num, rights_reason_code: rights_reason_code, - rights_timestamp: rights_timestamp, rights_date_used: rights_date_used, collection_code: collection_code, content_provider_code: content_provider_code, responsible_entity_code: responsible_entity_code, digitization_agent_code: digitization_agent_code, - access_profile_code: access_profile_code, - access_profile: access_profile, - update_date: update_date} - end - - private - - def rights_from_db - @rights_from_db ||= Services.rights.new(item_id: htid) + update_date: update_date, + # rights_timestamp and access_profile must be filled in the caller + rights_timestamp: nil, + access_profile: nil} end end diff --git a/lib/rights_database/access_profiles.rb b/lib/rights_database/access_profiles.rb deleted file mode 100644 index f48999f..0000000 --- a/lib/rights_database/access_profiles.rb +++ /dev/null @@ -1,39 +0,0 @@ -# frozen_string_literal: true - -require "services" - -module RightsDatabase - # Access Profile - # Pulled from the access_profiles table. - # A join when pulling an items rights would also work? (better) - class AccessProfile - attr_reader :id, :name, :dscr - - def initialize(id:, name:, dscr:) - @id = id - @name = name - @description = dscr - end - end - - class AccessProfiles - attr_accessor :profiles - - def initialize(profiles = load_from_db) - @profiles = profiles - end - - def load_from_db - Services.rights_db[:access_profiles] - .select(:id, - :name, - :dscr) - .as_hash(:id) - .transform_values { |h| AccessProfile.new(**h) } - end - - def [](profile_id) - @profiles[profile_id] - end - end -end diff --git a/lib/rights_database/rights.rb b/lib/rights_database/rights.rb deleted file mode 100644 index d33326b..0000000 --- a/lib/rights_database/rights.rb +++ /dev/null @@ -1,43 +0,0 @@ -# frozen_string_literal: true - -require "services" - -module RightsDatabase - # Rights for an individual HT item - class Rights - attr_accessor :item_id, :attribute, :reason, :source, :time, :note, :access_profile, :user, :namespace, :id - - def initialize(item_id:, attribute: nil, reason: nil, source: nil, time: nil, note: nil, access_profile: nil, - user: nil) - @item_id = item_id - @namespace, @id = @item_id.split(".", 2) - if @attribute.nil? - load_from_db - else - @attribute = attribute - @reason = reason - @source = source - @time = time - @note = note - @access_profile = access_profile - @user = user - end - end - - def load_from_db - rights = Services.rights_db[:rights_current] - .where(:namespace => namespace, Sequel.qualify(:rights_current, :id) => id) - .first - rights&.each do |k, v| - case k - when :reason - @reason = Services.rights_reasons[v] - when :attr - @attribute = Services.rights_attributes[v] - else - public_send("#{k}=", v) - end - end - end - end -end diff --git a/lib/rights_database/rights_attributes.rb b/lib/rights_database/rights_attributes.rb deleted file mode 100644 index 44aa4e5..0000000 --- a/lib/rights_database/rights_attributes.rb +++ /dev/null @@ -1,39 +0,0 @@ -# frozen_string_literal: true - -require "services" - -module RightsDatabase - class Attribute - attr_reader :id, :type, :name, :description - - def initialize(id:, type:, name:, dscr:) - @id = id - @type = type - @name = name - @description = dscr - end - end - - # Rights Attributes - class RightsAttributes - attr_accessor :attributes - - def initialize(attributes = load_from_db) - @attributes = attributes - end - - def load_from_db - Services.rights_db[:attributes] - .select(:id, - :type, - :name, - :dscr) - .as_hash(:id) - .transform_values { |h| Attribute.new(**h) } - end - - def [](attr) - @attributes[attr] - end - end -end diff --git a/lib/rights_database/rights_db.rb b/lib/rights_database/rights_db.rb deleted file mode 100644 index a998dff..0000000 --- a/lib/rights_database/rights_db.rb +++ /dev/null @@ -1,67 +0,0 @@ -# frozen_string_literal: true - -require "dotenv" -Dotenv.load(".env") - -require "delegate" -require "mysql2" -require "sequel" -require "services" -require "tempfile" - -module RightsDatabase - # Backend for connection to MySQL database for production information about - # rights - class RightsDB < SimpleDelegator - attr_reader :rawdb - attr_accessor :connection_string - - def initialize(connection_string = ENV["DB_CONNECTION_STRING"], **) - @rawdb = self.class.connection(connection_string, **) - super(@rawdb) - end - - # #connection will take - # * a full connection string (passed here OR in the environment - # variable MYSQL_CONNECTION_STRING) - # * a set of named arguments, drawn from those passed in and the - # environment. Arguments are those supported by Sequel. - # - # Environment variables are mapped as follows: - # - # user: DB_USER - # password: DB_PASSWORD - # host: DB_HOST - # port: DB_PORT - # database: DB_DATABASE - # adapter: DB_ADAPTER - def self.connection(connection_string = ENV["DB_CONNECTION_STRING"], - **kwargs) - - if connection_string.nil? - db_args = gather_db_args(kwargs).merge( - config_local_infile: true - ) - Sequel.connect(**db_args) - else - Sequel.connect(connection_string) - end - end - - class << self - private - - def gather_db_args(args) - %i[user password host - port database adapter].each do |db_arg| - args[db_arg] ||= ENV["DB_#{db_arg.to_s.upcase}"] - end - - args[:host] ||= "localhost" - args[:adapter] ||= :mysql2 - args[:database] ||= "ht_rights" - args - end - end - end -end diff --git a/lib/rights_database/rights_reasons.rb b/lib/rights_database/rights_reasons.rb deleted file mode 100644 index ae47c28..0000000 --- a/lib/rights_database/rights_reasons.rb +++ /dev/null @@ -1,37 +0,0 @@ -# frozen_string_literal: true - -require "services" - -module RightsDatabase - # Reason description - class Reason - attr_reader :id, :name, :dscr - - def initialize(id:, name:, dscr:) - @id = id - @name = name - @description = dscr - end - end - - class RightsReasons - attr_accessor :reasons - - def initialize(reasons = load_from_db) - @reasons = reasons - end - - def load_from_db - Services.rights_db[:reasons] - .select(:id, - :name, - :dscr) - .as_hash(:id) - .transform_values { |h| Reason.new(**h) } - end - - def [](reason) - @reasons[reason] - end - end -end diff --git a/lib/services.rb b/lib/services.rb index 75aaf30..7d87b07 100644 --- a/lib/services.rb +++ b/lib/services.rb @@ -2,27 +2,24 @@ require "canister" require "sequel" -require "rights_database/rights_db" -require "rights_database/rights" -require "rights_database/rights_attributes" -require "rights_database/rights_reasons" -require "rights_database/access_profiles" -require "collections_database/collections_db" +require "database" require "collections_database/collections" require "sdr_num_prefixes" require "logger" Services = Canister.new -Services.register(:rights_db) { RightsDatabase::RightsDB.new } -Services.register(:rights) { RightsDatabase::Rights } -Services.register(:rights_attributes) { RightsDatabase::RightsAttributes.new } -Services.register(:rights_reasons) { RightsDatabase::RightsReasons.new } -Services.register(:access_profiles) { RightsDatabase::AccessProfiles.new } -Services.register(:collections_db) { CollectionsDatabase::CollectionsDB.new } -Services.register(:collections) { CollectionsDatabase::Collections.new } +Services.register(:db) do + Database.new +end + +Services.register(:collections) do + CollectionsDatabase::Collections.new +end -Services.register(:sdrnum_prefix_map) { SdrNumPrefixes.new } +Services.register(:sdrnum_prefix_map) do + SdrNumPrefixes.new +end Services.register(:logger) do Logger.new($stdout, level: ENV.fetch("HATHIFILE_LOGGER_LEVEL", Logger::INFO).to_i) diff --git a/spec/collections_database/collections_db_spec.rb b/spec/collections_database/collections_db_spec.rb deleted file mode 100644 index df56fc1..0000000 --- a/spec/collections_database/collections_db_spec.rb +++ /dev/null @@ -1,39 +0,0 @@ -# frozen_string_literal: true - -require "spec_helper" - -require "collections_database/collections" - -RSpec.describe CollectionsDatabase::CollectionsDB do - let(:connection_string) { "mysql2://ht_collections:ht_collections@mariadb/ht_collections" } - let(:user) { "ht_collections" } - let(:password) { "ht_collections" } - let(:database) { "ht_collections" } - let(:host) { "mariadb" } - let(:connection) do - described_class.new(user: user, - password: password, - database: database, - host: host) - end - - let(:opts) do - {user: user, - password: password, - host: host, - database: database, - adapter: "mysql2"} - end - - describe "Connecting" do - it "connects with url" do - c = described_class.connection(url: connection_string) - expect(c.tables).to include(:reasons) - end - - it "connects with opts" do - c = described_class.connection(opts: opts) - expect(c.tables).to include(:reasons) - end - end -end diff --git a/spec/rights_database/rights_db_spec.rb b/spec/database_spec.rb similarity index 91% rename from spec/rights_database/rights_db_spec.rb rename to spec/database_spec.rb index e4550da..78485c6 100644 --- a/spec/rights_database/rights_db_spec.rb +++ b/spec/database_spec.rb @@ -2,9 +2,9 @@ require "spec_helper" -require "rights_database/rights" +require "database" -RSpec.describe RightsDatabase::RightsDB do +RSpec.describe Database do let(:connection_string) { "mysql2://ht_rights:ht_rights@mariadb/ht_rights" } let(:user) { "ht_rights" } let(:password) { "ht_rights" } diff --git a/spec/item_record_spec.rb b/spec/item_record_spec.rb index 6512ca2..7cd2098 100644 --- a/spec/item_record_spec.rb +++ b/spec/item_record_spec.rb @@ -22,10 +22,6 @@ expect(ir.access).to eq("allow") end - it "retrives the rights code" do - expect(ir.rights).to eq("pd") - end - it "extracts the description" do expect(ir.description).to eq("v.1") end @@ -45,11 +41,6 @@ expect(ir.rights_reason_code).to eq("bib") end - it "extracts the rights_timestamp" do - ir.htid = "test.pd_google" - expect(ir.rights_timestamp).to eq(DateTime.parse("2009-01-01 05:00:00").to_time) - end - it "extracts the update date from 974d" do expect(ir.update_date).to eq("20210912") end @@ -79,16 +70,6 @@ end end - it "retrieves the access_profile_code" do - ir.htid = "test.pd_google" - expect(ir.access_profile_code).to eq(2) - end - - it "retrieves the access_profile" do - ir.htid = "test.pd_google" - expect(ir.access_profile).to eq("google") - end - describe "#rights_date_used" do it "fills non-existent rdus with '9999'" do m = MARC::Record.new_from_hash( diff --git a/spec/jobs/generate_hathifile_spec.rb b/spec/jobs/generate_hathifile_spec.rb index ffe9317..7035b2c 100644 --- a/spec/jobs/generate_hathifile_spec.rb +++ b/spec/jobs/generate_hathifile_spec.rb @@ -107,4 +107,14 @@ .and match(/^job_records_processed\S*job="generate_hathifiles"\S* 1$/m) end end + + describe "#batch_extract_rights" do + it "extracts rights_timestamp and access_profile" do + rights = GenerateHathifile.new.batch_extract_rights("test.pd_google") + expect(rights).to be_a(Hash) + expect(rights["test.pd_google"]).not_to be_nil + expect(rights["test.pd_google"][:rights_timestamp]).to be_a(Time) + expect(rights["test.pd_google"][:access_profile]).to be_a(String) + end + end end diff --git a/spec/rights_database/rights_spec.rb b/spec/rights_database/rights_spec.rb deleted file mode 100644 index 20604b8..0000000 --- a/spec/rights_database/rights_spec.rb +++ /dev/null @@ -1,14 +0,0 @@ -# frozen_string_literal: true - -require "spec_helper" -require "rights_database/rights" - -RSpec.describe RightsDatabase::Rights do - describe "#new" do - it "retrieves a rights record for an item id" do - item_rights = RightsDatabase::Rights.new(item_id: "test.cc-by-nc-nd-4.0_page") - expect(item_rights.attribute.id).to eq(22) - expect(item_rights.reason.id).to eq(1) - end - end -end