diff --git a/lib/oai_solr/basic_marc_extractor.rb b/lib/oai_solr/basic_marc_extractor.rb new file mode 100644 index 0000000..ca4a69b --- /dev/null +++ b/lib/oai_solr/basic_marc_extractor.rb @@ -0,0 +1,87 @@ +# frozen_string_literal: true + +require "set" +require_relative "basic_marc_single_extractor" + +module OAISolr + # A collection of BasicMARCSingleExtractors that will collect their combined values from + # a MARC::Record. + class BasicMARCExtractor + # Create a new object, optionally passing tags/codes to add a first BasicMARCSingleExtractor + # @param [String,Array,Range] tags A single tag, an array of tags, or a range of 3-digit MARC tags + # @param [String, Range] subfield_codes Either a single string with all the desired subfield codes + # e.g., "abcek", or a range, e.g., "a".."m". Optional. + # @example + # bme = BasicMARCExtractor.new; bme << BasicMARCSingleExtractor.new("245", "ab") + # bme = BasicMARCExtractor.new("245", "ab") + # bme = BasicMARCExtractor.new("600".."699", "a".."z") + def initialize(tags = nil, subfield_codes = nil) + @single_extractors = [] + if tags + self << BasicMARCSingleExtractor.new(tags, subfield_codes) + end + end + + # Given an array of duples (as from config), build up an extractor using `#<<` + # @param [Array] tag_code_pairs Array of arrays of the form [ [tags, subfield_codes], ...] 
+ # @example + # bme = BasicMARCExtractor.from_pairs([["245", "ab"], ["100".."111", "abd"]]) + # @see OAISolr::BasicMARCSingleExtractor#initialize for supported syntax + def self.from_pairs(tag_code_pairs) + unless tag_code_pairs.first&.is_a?(Array) + raise "#{self}.from_pairs takes an array of arrays" + end + basic_marc_extractor = new + tag_code_pairs.each { |tag, codes| basic_marc_extractor << BasicMARCSingleExtractor.new(tag, codes) } + basic_marc_extractor + end + + # Add a previously constructed single extractor, and re-compute the set of interesting tags + # @param [OAISolr::BasicMARCSingleExtractor] basic_marc_single_extractor + # @return [OAISolr::BasicMARCExtractor] + def <<(basic_marc_single_extractor) + @single_extractors << basic_marc_single_extractor + set_interesting_tags! + self + end + + # For efficiency, keep track of which field tags are "interesting" to this specific extractor, + # so we don't have to check the whole list of field tags for every BasicMARCSingleExtractor + # @see set_interesting_tags! + # @param [String] tag The field tag + # @return [Boolean] + def interesting_tag?(tag) + @interesting_ranges.any? { |rng| rng.cover?(tag) } or @interesting_single_tags.include?(tag) + end + + # Get a list of the "interesting" fields (by tag), and run each single extractor in turn + # on them. Flatten, compact, and uniq the resulting strings and return + # @param [MARC::Record] rec The record from which to extract data + # @return [Array] array of extracts + def values(rec) + rec.select { |field| interesting_tag?(field.tag) } + .flat_map { |f| @single_extractors.flat_map { |extractor| extractor.value(f) } } + .compact.uniq + end + + private + + # We want to efficiently determine if the tag is one that we're interested in. + # We support single tags, arrays of (single) tags, and tag ranges. The first two + # merge into one set; the ranges we handle separately for efficiency (no sense in + # turning '600'..'699' into an array) + def set_interesting_tags! 
+ @interesting_single_tags = ::Set.new + @interesting_ranges = ::Set.new + @single_extractors.map(&:computed_tags).each do |tags| + case tags + when Range + @interesting_ranges << tags + else + @interesting_single_tags += Array(tags) + end + @interesting_single_tags.flatten! + end + end + end +end diff --git a/lib/oai_solr/basic_marc_single_extractor.rb b/lib/oai_solr/basic_marc_single_extractor.rb new file mode 100644 index 0000000..97775c9 --- /dev/null +++ b/lib/oai_solr/basic_marc_single_extractor.rb @@ -0,0 +1,161 @@ +module OAISolr + # Build up a simple object to quasi-efficiently extract values from MARC tag/subfield-codes + # based on a simplistic query specification. + # + # A single BasicMARCSingleExtractor will extract a specific set of subfields from the + # given tag specification. + # + # The set (or single) of tags_to_match you want can be passed in as: + # * A single string. `"245"` + # * A three digit integer, which will be coerced into a string. `245` + # * Note that if you want a zero-led field (e.g., "050") you can't use the integer option + # * An array of tags_to_match. ["245", "100", "111"] + # * A range of Strings that encompass all the tags you want. "600".."699" + # + # Subfield codes can be expressed as: + # * A string containing all the subfields you want. "abdek" + # * A range of one-character strings. "a".."n" + # + # Control field: for "codes", pass a range of characters to fetch + # * When dealing with a control field, the "codes" passed should actually be a range of integers + # corresponding to the indexes (zero-based) of the characters you want from that value. + class BasicMARCSingleExtractor + # Generally, MARC fields have the data in alphabetical subfields, and metadata (e.g., links to + # other fields) in numbered subfields. We'll use all the "letter" subfields as the + # default for which subfields to use. 
+ ALPHA = "a".."z" + + attr_reader :tags, :codes, :computed_tags + + # Create a new extractor for the given tag(s) and subfield code(s) + # Note that this code just creates a method to determine if a field matches the desired tags_to_match, + # and another to actually extract data from the subfields of those matched fields. + # + # Everything else in this class is just support to create the #matches_tag? and + # #extract methods. + # + # @param [String, Array, Range] tags + # @param [String, Range, nil] codes Subfield codes to extract (a string of codes or a range of one-character strings), or an Integer range of character positions for a control field; nil or omitted means ALPHA + # @example One field tag, two subfield codes + # extractor = BasicMARCSingleExtractor.new("245", "ab") + # @example An array of tags_to_match + # extractor = BasicMARCSingleExtractor.new(["100", "110", "111"], "abd") + # @example A range of tags_to_match, and the default (all alphabetic) subfield codes + # extractor = BasicMARCSingleExtractor.new("600".."699") # subfield codes defaults to ALPHA + # @example A single tag, with a range of subfields + # extractor = BasicMARCSingleExtractor.new("245", "a".."e") + # @example Get the "date1" characters from the 008 field + # extractor = BasicMARCSingleExtractor.new("008", 7..10) + def initialize(tags, codes = nil) + @tags = tags + @codes = codes || ALPHA + define_singleton_method(:matches_tag?, tag_matcher(@tags)) + define_singleton_method(:extract, value_extractor(@codes)) + end + + # @!method matches_tag?(tag) + # Determines if the passed field tag (e.g., "245") is one that this extractor + # cares about. + # @param [String] tag + # @return [Boolean] + + # @!method extract(field) + # Takes a MARC::DataField or MARC::ControlField and: + # * get the values of the subfields with the wanted codes and + # return them as a single, space-delimited string + # * Get a range of characters from a control field, when the "codes" specified was + # actually an integer range. 
+ # @param [MARC::DataField, MARC::ControlField] field + # @return [String] the desired value(s), with subfield values joined with a space + + # If the "codes" that were passed was actually an integer range, we assume that we're dealing + # with a control field. + def control_field? + codes.is_a?(Range) and codes.begin.is_a?(Integer) + end + + # Try to extract strings from the desired subfield values. If none match, or we end + # up with an empty string, return nil + # @param [MARC::DataField] field + # @return [String, nil] Space-delimited values of the wanted subfields + def value(field) + val = if matches_tag?(field.tag) + extract(field) # defined dynamically in the constructor + else + return nil + end + + val.empty? ? nil : val + end + + # To decide what values to extract, we first need to decide if a given field's tag + # is one of the ones we care about for this extractor. + # + # Use the tag specification passed in the constructor and figure out + # the best way to test if a field tag string (e.g., "245") matches the tags + # covered by this extractor. Then build a lambda that will do that test. + # + # The returned lambda is used in the constructor to create the #matches_tag? method + # @param [String, Array, Range] tags_to_match + # @return [Proc] a lambda that takes a single tag and sees if it matches this extractor + def tag_matcher(tags_to_match) + case tags_to_match + when Integer, String + @computed_tags = tags_to_match.to_s + ->(t) { t.to_s == @computed_tags } + when Array + @computed_tags = tags_to_match.map(&:to_s).uniq + ->(t) { @computed_tags.include? t } + when Range + @computed_tags = tags_to_match + ->(t) { @computed_tags.cover?(t) } + else + raise "Illegal argument '#{tags_to_match.inspect}'" + end + end + + # Given a subfield codes specification from the constructor, build an efficient + # lambda to pull out the data from the given code(s) as a string. Used in the + # constructor to make the #extract method. 
+ # @param [String, Range] codes_or_control_field_range A string of subfield codes, a range of one-character strings, or an Integer range for a control field + # @return [Proc] lambda that takes a MARC::ControlField or MARC::DataField and pulls + # out the requested data. + def value_extractor(codes_or_control_field_range) + if control_field? + control_field_extractor(codes_or_control_field_range) + else + datafield_extractor(codes_or_control_field_range) + end + end + + private + + # A control field extractor just gets the characters in the given range + # @param [Range] integer_range Integer range (zero-based) of the chars you want + # @return [Proc] lambda that will take a control field and extract the right characters (an empty string when the value is too short for the range) + def control_field_extractor(integer_range) + ->(control_field) { control_field.value.slice(integer_range).to_s } + end + + # Subfield extraction for when the codes are specified as a single char, a bunch of chars, + # or a char range. Each is treated separately to get the best performance for + # each situation, because these things can add up when doing lots and lots of records. + # @param [String, Range] codes Subfield codes to extract: a string of codes or a range of one-character strings + # @return [Proc] lambda that will correctly do the extraction and joining of values on the passed field. + def datafield_extractor(codes) + case codes + when String + if codes.size == 1 + ->(data_field) { data_field.select { |sf| sf.code == codes }.map(&:value).join(" ").strip } + else + codesarray = codes.chars + ->(data_field) { data_field.select { |sf| codesarray.include? sf.code }.map(&:value).join(" ").strip } + end + when Range + ->(data_field) { data_field.select { |sf| codes.cover? 
sf.code }.map(&:value).join(" ").strip } + else + raise "Subfield codes must be either a string of chars, a range of chars, or a range of ints for control field extraction" + end + end + end +end diff --git a/lib/oai_solr/dublin_core.rb b/lib/oai_solr/dublin_core.rb index 7c6a210..2d3ad14 100644 --- a/lib/oai_solr/dublin_core.rb +++ b/lib/oai_solr/dublin_core.rb @@ -1,8 +1,13 @@ require "oai" require "rights_database" +require "oai_solr/dublin_core_crosswalk" module OAISolr class DublinCore < OAI::Provider::Metadata::DublinCore + # A Dublin Core crosswalk object for translating MARC records into + # the dublin core fields. + CROSSWALK = OAISolr::DublinCoreCrosswalk.new + def encode _, record dc_hash = dublin_core_hash(record) @@ -33,41 +38,53 @@ def self.rights_statement(record, statements = access_statements(record)) private + # @param [OAISolr::Record] record def dublin_core_hash(record) - # TODO: to_dublin_core doesn't do much useful in the current release of - # ruby-marc - the only things we're keeping from it are "source" and - # "relation" - record.marc_record.to_dublin_core.compact.tap do |dc| - dc.default_proc = proc { |hash, key| hash[key] = [] } - - dc["type"] = "text" - dc["date"] = record.solr_document["display_date"] - dc["description"] = description(record) - dc["rights"] = self.class.rights_statement(record) - - %w[publisher language format subject_display authorStr] - .reject { |k| record.solr_document[k].nil? 
} - .each { |k| dc[k] = [record.solr_document[k]].flatten } - - dc["subject"] = dc.delete("subject_display") - dc["creator"] = dc.delete("authorStr") - - # the old OAI provider doesn't include dc:coverage, and what rubymarc - # gives is as badly-formatted as the authors & subjects - dc.delete("coverage") - - record.solr_document["oclc"]&.each { |o| dc["identifier"] << "(OCoLC)#{o}" } - record.solr_document["ht_id"].each { |htid| dc["identifier"] << "#{Settings.handle}#{htid}" } - record.solr_document["isbn"]&.each { |isbn| dc["identifier"] << isbn } - end.reject { |_k, v| v.nil? || v.empty? } - end + dc = {} + + # Set stuff that's constant for HT items + dc["type"] = "text" + dc["rights"] = self.class.rights_statement(record) + + # Get stuff out of the solr document + dc["date"] = record.first_solr_value("display_date") + dc["language"] = record.first_solr_value("language") + dc["publisher"] = record.first_solr_value("publisher") + dc["subject"] = record.solr_value("subject_display") + dc["format"] = record.first_solr_value("format") - # Current implementation appears to use 300 - # ruby-marc's next release will likely use 500 - def description(record) - return unless record.marc_record["300"] + marc = record.marc_record + + # The LoC spec says to NOT use creator, and instead use contributor, but our users + # have asked that we keep this the same as before, using creator. 
+ dc["creator"] = CROSSWALK.contributor(marc) + + # Pull the rest from the record according to the Library of Congress crosswalk + dc["publisher"] ||= CROSSWALK.publisher(marc) + dc["coverage"] = CROSSWALK.coverage(marc) + dc["description"] = CROSSWALK.description(marc) + dc["format"] ||= CROSSWALK.format(marc) + dc["relation"] = CROSSWALK.relation(marc) + dc["source"] = CROSSWALK.source(marc) + dc["title"] = CROSSWALK.title(marc) + + # Get the identifiers + dc["identifier"] = record.solr_array("oclc").map { |id| "(OCoLC)#{id}" } + .concat(record.solr_array("ht_id").map { |htid| "#{Settings.handle}#{htid}" }) + .concat(record.solr_array("isbn").map { |isbn| "ISBN #{isbn}" }) + .concat(record.solr_array("issn").map { |issn| "ISSN #{issn}" }) + .concat(record.solr_array("lccn").map { |lccn| "LCCN #{lccn}" }) + # Flatten it all out and get rid of nils and duplicates + dc.select { |k, v| v.is_a?(Array) }.each_pair do |_field, values| + values.flatten! + values.compact! + values.uniq! + values.reject! { |x| x == "".freeze } + end - record.marc_record["300"].subfields.select { |sub| %w[a b c].include? sub.code }.map { |sub| sub.value }.join(" ") + # Ditch everything that's empty or nil + dc.reject! { |_k, v| v.nil? || v.empty? } + dc end # Returns an array of unique access statements for each HTID on record diff --git a/lib/oai_solr/dublin_core_crosswalk.rb b/lib/oai_solr/dublin_core_crosswalk.rb new file mode 100644 index 0000000..6330d6f --- /dev/null +++ b/lib/oai_solr/dublin_core_crosswalk.rb @@ -0,0 +1,138 @@ +require "set" +require_relative "basic_marc_extractor" + +module OAISolr + # Create an instance that will map MARC records to Dublin Core fields. + # Generally taken from the crosswalk at https://www.loc.gov/marc/marc2dc.html + # Mappings that can be easily specified as an OAISolr::BasicMARCExtractor are defined + # in the MAPPINGS constant. Anything more complex has its own method. 
+ class DublinCoreCrosswalk + MAPPINGS = { + + contributor: [ + [%w[100 700], "abcdjq"], + [%w[110 710], "abcd"], + [%w[111 711], "acden"], + ["720", "a"] + ], + + coverage: [ + [651, nil], + [662, nil], + [751, nil], + [752, nil] + ], + + # date -- see below + + description: [ + [["300"] + ("500".."599").to_a - %w[506 530 538 540 546], nil] + ], + + format: [ + [340, nil], + [856, "q"] + ], + + identifier: [ + [%w[020 022 024], "a"], + [856, "u"], + [%w[050 080 060], nil], + ["082", "ab"] + ], + + language: [ + ["008", 35..37], + ["041", "abdefghj"] + ], + + publisher: [ + ["260", "ab"] + ], + + relation: [ + ["530", nil], + [("760".."787"), "ot"] + ], + + rights: [ + ["506", nil], + ["540", nil] + ], + + source: [ + ["534", "t"], + ["540", nil], + ["786", "ot"] + ], + subject: [ + ["600", "abcdefghjklmnopqrstuvxyz"], + ["610", "abcdefghklmnoprstuvxyz"], + ["611", "acdefghjklnpqstuvxyz"], + ["630", "adefghklmnoprstvxyz"], + ["650", "abcdevxyz"], + ["653", "abevyz"] + ], + + title: [ + ["245", "abdefgknp"], + ["246", "abdefgknp"] + ] + + # type -- see below + } + + MAPPINGS.each do |key, spec_pairs| + bme = BasicMARCExtractor.from_pairs(spec_pairs) + define_method(key.to_sym, ->(rec) { bme.values(rec) }) + end + + # If it's necessary to add a field that does not have an identically-named + # accessor, or is not in MAPPINGS, some adjustment may be necessary, + def full_map(rec) + fields = MAPPINGS.keys + %i[type date] + fields.map { |field| [field, send(field, rec)] } + .to_h.reject { |k, v| v.empty? 
} + end + + # Get the best date possible, looking for four digits in the 008, then + # falling back to the 260cg + # @param [MARC::Record] rec + def date(rec) + possible_year = date_008(rec) + return possible_year if /\A\d{4}\Z/.match?(possible_year) + + other_possible_date = date_260cg(rec) + + if /\S/.match?(other_possible_date) + other_possible_date + else + possible_year + end + end + + def type(rec) + leader6 = rec.leader[6] + leader7 = rec.leader[7] + types = [] + types << "text" if %w[a c d t].include?(leader6) + types << "image" if %w[e f g k].include?(leader6) + types << "sound" if %w[i j].include?(leader6) + types << "collection" if (leader6 == "p") || %w[c s].include?(leader7) + types + end + + private + + def date_008(rec) + rec["008"].value[7..10] + end + + def date_260cg(rec) + two_sixty = rec["260"] + if two_sixty + [two_sixty["c"], two_sixty["g"]].join(" ") + end + end + end +end diff --git a/lib/oai_solr/record.rb b/lib/oai_solr/record.rb index 419035d..89997b1 100644 --- a/lib/oai_solr/record.rb +++ b/lib/oai_solr/record.rb @@ -68,6 +68,18 @@ def solr_value(field) solr_document.has_key?(field) ? 
solr_document[field] : nil end + # @param [String] field Name of the solr field + # @return [String, Numeric, NilClass] The first found value, or nil if not found + def first_solr_value(field) + return nil unless solr_document.has_key?(field) + val = solr_document[field] + if val.is_a?(Array) + val.first + else + val + end + end + # @param [String] field Name of the field # @return [Array, Numeric, NilClass] The found value, or nil if not found def solr_array(field) diff --git a/spec/oai_solr_record_spec.rb b/spec/oai_solr_record_spec.rb index fbcf48e..bd2e309 100644 --- a/spec/oai_solr_record_spec.rb +++ b/spec/oai_solr_record_spec.rb @@ -75,6 +75,10 @@ expect(parsed.css("dc|identifier").map { |c| c.text }).to include("(OCoLC)562083") end + it "has the LCCN as an identifier" do + expect(parsed.css("dc|identifier").map(&:text)).to include("LCCN agr48000285") + end + it "has item handle as an dc:identifier" do handle = "http://hdl.handle.net/2027/uc1.31822013347232" expect(parsed.css("dc|identifier").map { |c| c.text }).to include(handle) @@ -92,7 +96,19 @@ let(:sdoc) { JSON.parse(File.read("spec/data/008553258.json")) } it "has ISBN as a dc:identifier" do - expect(parsed.css("dc|identifier").map { |c| c.text }).to include("9806741242") + expect(parsed.css("dc|identifier").map { |c| c.text }).to include("ISBN 9806741242") + end + end + + context "with record with more complex data" do + let(:sdoc) { JSON.parse(File.read("spec/data/001718542.json")) } + + it "gets the full title" do + expect(parsed.css("dc|title").first.text).to eq("Local government ... comprising statutes, orders, forms, cases, and local decisions of the Local government board ; 1908-.") + end + + it "gets multiple creators" do + expect(parsed.css("dc|creator").size).to eq(2) end end end