Merge pull request #26 from hathitrust/marc_to_dublin_core_crosswalk

Adding new class to convert marc to Dublin Core crosswalk
hathitrust · Apr 10, 2023 · 39d5aa0 · 39d5aa0
2 parents fd511bb + 7313245
commit 39d5aa0
Show file tree

Hide file tree

Showing 6 changed files with 464 additions and 33 deletions.
diff --git a/lib/oai_solr/basic_marc_extractor.rb b/lib/oai_solr/basic_marc_extractor.rb
@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+
+require "set"
+require_relative "basic_marc_single_extractor"
+
+module OAISolr
+  # A collection of BasicMARCSingleExtractors that will collect their combined values from
+  # a MARC::Record.
+  class BasicMARCExtractor
+    # Create a new object, optionally passing tags/codes to add a first BasicMARCSingleExtractor
+    # @param [String,Array<String>,Range<String>] tags Single tag, array of tags, or range over
+    #   3-digit marc tags. When nil (the default) no single extractor is added.
+    # @param [String, Range<String>] subfield_codes Either a single string with all the desired subfield codes
+    #   e.g., "abcek", or a range, e.g., "a".."m". Optional.
+    # @example
+    #   bme = BasicMARCExtractor.new; bme << BasicMARCSingleExtractor.new("245", "ab")
+    #   bme = BasicMARCExtractor.new("245", "ab")
+    #   bme = BasicMARCExtractor.new("600".."699", "a".."z")
+    def initialize(tags = nil, subfield_codes = nil)
+      @single_extractors = []
+      # Build the (empty) lookup sets up front so #interesting_tag? and #values work on an
+      # extractor that has not had anything added to it yet.
+      set_interesting_tags!
+      self << BasicMARCSingleExtractor.new(tags, subfield_codes) if tags
+    end
+
+    # Given an array of duples (as from config), build up an extractor using `#<<`
+    # @param [Array<Array>] tag_code_pairs Array of arrays of the form [ [tags, subfield_codes], ...]
+    # @return [OAISolr::BasicMARCExtractor] a new extractor holding one single extractor per pair
+    # @raise [RuntimeError] if the first element of tag_code_pairs is not an Array (this includes
+    #   an empty tag_code_pairs)
+    # @example
+    #   bme = BasicMARCExtractor.from_pairs([["245", "ab"], ["100".."111", "abd"]])
+    # @see OAISolr::BasicMARCSingleExtractor#initialize for supported syntax
+    def self.from_pairs(tag_code_pairs)
+      unless tag_code_pairs.first.is_a?(Array)
+        # `self` is the class here; `self.class` would just print "Class"
+        raise "#{self}.from_pairs takes an array of arrays"
+      end
+      tag_code_pairs.each_with_object(new) do |(tag, codes), basic_marc_extractor|
+        basic_marc_extractor << BasicMARCSingleExtractor.new(tag, codes)
+      end
+    end
+
+    # Add a previously constructed single extractor, and re-compute the set of interesting tags
+    # @param [OAISolr::BasicMARCSingleExtractor] basic_marc_single_extractor
+    # @return [OAISolr::BasicMARCExtractor] self, so calls can be chained
+    def <<(basic_marc_single_extractor)
+      @single_extractors << basic_marc_single_extractor
+      set_interesting_tags!
+      self
+    end
+
+    # For efficiency, keep track of which field tags are "interesting" to this specific extractor,
+    # so we don't have to check the whole list of field tags for every BasicMARCSingleExtractor
+    # @see set_interesting_tags!
+    # @param [String] tag The field tag
+    # @return [Boolean]
+    def interesting_tag?(tag)
+      # Set membership first; only walk the ranges when that misses
+      @interesting_single_tags.include?(tag) || @interesting_ranges.any? { |rng| rng.cover?(tag) }
+    end
+
+    # Get a list of the "interesting" fields (by tag), and run each single extractor in turn
+    # on them. Flatten, compact, and uniq the resulting strings and return
+    # @param [MARC::Record] rec The record from which to extract data
+    # @return [Array<String>] array of extracts
+    def values(rec)
+      rec.select { |field| interesting_tag?(field.tag) }
+        .flat_map { |f| @single_extractors.flat_map { |extractor| extractor.value(f) } }
+        .compact.uniq
+    end
+
+    private
+
+    # We want to efficiently determine if the tag is one that we're interested in.
+    # We support single tags, arrays of (single) tags, and tag ranges. The first two
+    # merge into one set; the ranges we handle separately for efficiency (no sense in
+    # turning '600'..'699' into an array)
+    #
+    # Rebuilds both lookup sets from scratch from the current list of single extractors.
+    def set_interesting_tags!
+      @interesting_single_tags = ::Set.new
+      @interesting_ranges = ::Set.new
+      @single_extractors.each do |extractor|
+        tags = extractor.computed_tags
+        if tags.is_a?(Range)
+          @interesting_ranges << tags
+        else
+          # Array() turns a lone tag string into a one-element array
+          @interesting_single_tags.merge(Array(tags))
+        end
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/basic_marc_single_extractor.rb b/lib/oai_solr/basic_marc_single_extractor.rb
@@ -0,0 +1,161 @@
+module OAISolr
+  # Build up a simple object to quasi-efficiently extract values from MARC tag/subfield-codes
+  # based on a simplistic query specification.
+  #
+  # A single BasicMARCSingleExtractor will extract a specific set of subfields from the
+  # given tag specification.
+  #
+  # The set (or single) of tags_to_match you want can be passed in as:
+  #   * A single string. `"245"`
+  #   * A three digit integer, which will be coerced into a string. `245`
+  #     * Note that if you want a zero-led field (e.g., "050") you can't use the integer option
+  #   * An array of tags_to_match. ["245", "100", "111"]
+  #   * A range of Strings that encompass all the tags you want. "600".."699"
+  #
+  # Subfield codes can be expressed as:
+  #   * A string containing all the subfields you want. "abdek"
+  #   * A range of one-character strings. "a".."n"
+  #
+  # Control field: for "codes", pass a range of characters to fetch
+  #   * When dealing with a control field, the "codes" passed should actually be a range of integers
+  #     corresponding to the indexes (zero-based) of the characters you want from that value.
+  class BasicMARCSingleExtractor
+    # Generally, MARC fields have the data in alphabetical subfields, and metadata (e.g., links to
+    # other fields) in numbered subfields.  We'll use all the "letter" subfields as the
+    # default for which subfields to use.
+    ALPHA = "a".."z"
+
+    attr_reader :tags, :codes, :computed_tags
+
+    # Create a new extractor for the given tag(s) and subfield code(s)
+    # Note that this code just creates a method to determine if a field matches the desired tags_to_match,
+    # and another to actually extract data from the subfields of those matched fields.
+    #
+    # Everything else in this class is just support to create the #matches_tag? and
+    # #extract methods.
+    #
+    # @param [String, Integer, Array<String>, Range<String>] tags
+    # @param [String, Range<String>, Range<Integer>, nil] codes The subfield codes to extract, or an
+    #   integer range of character positions for a control field. Defaults to ALPHA when nil or omitted.
+    # @raise [RuntimeError] if tags or codes are not one of the supported kinds
+    # @example One field tag, two subfield codes
+    #   extractor = BasicMARCSingleExtractor.new("245", "ab")
+    # @example An array of tags_to_match
+    #   extractor = BasicMARCSingleExtractor.new(["100", "110", "111"], "abd")
+    # @example A range of tags_to_match, and the default (all alphabetic) subfield codes
+    #   extractor = BasicMARCSingleExtractor.new("600".."699") # subfield codes defaults to ALPHA
+    # @example A single tag, with a range of subfields
+    #   extractor = BasicMARCSingleExtractor.new("245", "a".."e")
+    # @example Get the "date1" characters from the 008 field
+    #   extractor = BasicMARCSingleExtractor.new("008", 7..10)
+    def initialize(tags, codes = nil)
+      @tags = tags
+      @codes = codes || ALPHA
+      define_singleton_method(:matches_tag?, tag_matcher(@tags))
+      define_singleton_method(:extract, value_extractor(@codes))
+    end
+
+    # @!method matches_tag?(tag)
+    #   Determines if the passed field tag (e.g., "245") is one that this extractor
+    #   cares about.
+    #   @param [String] tag
+    #   @return [Boolean]
+
+    # @!method extract(field)
+    #   Takes a MARC::DataField or MARC::ControlField and:
+    #     * get the values of the subfields with the wanted codes and
+    #       return them as a single, space-delimited string
+    #     * Get a range of characters from a control field, when the "codes" specified was
+    #       actually an integer range.
+    #   @param [MARC::DataField, MARC::ControlField] field
+    #   @return [String] the desired value(s), with subfield values joined with a space
+
+    # If the "codes" that were passed was actually an integer range, we assume that we're dealing
+    # with a control field.
+    # @return [Boolean]
+    def control_field?
+      codes.is_a?(Range) && codes.begin.is_a?(Integer)
+    end
+
+    # Try to extract strings from the desired subfield values. If the tag doesn't match, nothing
+    # is extracted, or we end up with an empty string, return nil
+    # @param [MARC::DataField, MARC::ControlField] field
+    # @return [String, nil] Space-delimited values of the wanted subfields
+    def value(field)
+      return nil unless matches_tag?(field.tag)
+
+      val = extract(field) # defined dynamically in the constructor
+      # String#slice gives nil when a control field is shorter than the requested range
+      (val.nil? || val.empty?) ? nil : val
+    end
+
+    # To decide what values to extract, we first need to decide if a given field's tag
+    # is one of the ones we care about for this extractor.
+    #
+    # Use the tag specification passed in the constructor and figure out
+    # the best way to test if a field tag string (e.g., "245") matches the tags
+    # covered by this extractor. Then build a lambda that will do that test.
+    # As a side effect, sets @computed_tags to the normalized tag specification.
+    #
+    # The returned lambda is used in the constructor to create the #matches_tag? method
+    # @param  [String, Integer, Array<String>, Range<String>] tags_to_match
+    # @return [Proc] a lambda that takes a single tag and sees if it matches this extractor
+    # @raise [RuntimeError] if tags_to_match is none of the supported kinds
+    def tag_matcher(tags_to_match)
+      case tags_to_match
+      when Integer, String
+        @computed_tags = tags_to_match.to_s
+        ->(t) { t.to_s == @computed_tags }
+      when Array
+        @computed_tags = tags_to_match.map(&:to_s).uniq
+        # Stringify the incoming tag, same as the single-tag matcher does
+        ->(t) { @computed_tags.include?(t.to_s) }
+      when Range
+        @computed_tags = tags_to_match
+        ->(t) { @computed_tags.cover?(t.to_s) }
+      else
+        raise "Illegal argument '#{tags_to_match.inspect}'"
+      end
+    end
+
+    # Given a subfield codes specification from the constructor, build an efficient
+    # lambda to pull out the data from the given code(s) as a string. Used in the
+    # constructor to make the #extract method.
+    # @param [String, Range<String>, Range<Integer>] codes_or_control_field_range
+    # @return [Proc] lambda that take a MARC::ControlField or MARC::DataField and pulls
+    #   out the requested data.
+    def value_extractor(codes_or_control_field_range)
+      if control_field?
+        control_field_extractor(codes_or_control_field_range)
+      else
+        datafield_extractor(codes_or_control_field_range)
+      end
+    end
+
+    private
+
+    # A control field extractor just gets the characters in the given range
+    # @param [Range] integer_range  Integer range (zero-based) of the chars you want
+    # @return [Proc] lambda that will take a control field and extract the right characters
+    def control_field_extractor(integer_range)
+      ->(control_field) { control_field.value.slice(integer_range) }
+    end
+
+    # Subfield extraction for when the codes are specified as a single char, a bunch of chars,
+    # or a char range.  Each is treated separately to get the best performance for
+    # each situation, because these things can add up when doing lots and lots of records.
+    # @param [String, Range<String>] codes Which subfield codes to extract
+    # @return [Proc] lambda that will correctly do the extraction and joining of values on the passed field.
+    # @raise [RuntimeError] if codes is neither a String nor a Range
+    def datafield_extractor(codes)
+      case codes
+      when String
+        if codes.size == 1
+          ->(data_field) { data_field.select { |sf| sf.code == codes }.map(&:value).join(" ").strip }
+        else
+          codesarray = codes.chars
+          ->(data_field) { data_field.select { |sf| codesarray.include? sf.code }.map(&:value).join(" ").strip }
+        end
+      when Range
+        ->(data_field) { data_field.select { |sf| codes.cover? sf.code }.map(&:value).join(" ").strip }
+      else
+        raise "Subfield codes must be either a string of chars, a range of chars, or a range of ints for control field extraction"
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/dublin_core.rb b/lib/oai_solr/dublin_core.rb
@@ -1,8 +1,13 @@
 require "oai"
 require "rights_database"
+require "oai_solr/dublin_core_crosswalk"
 
 module OAISolr
   class DublinCore < OAI::Provider::Metadata::DublinCore
+    # A Dublin Core crosswalk object for translating MARC records into
+    # the Dublin Core fields.
+    CROSSWALK = OAISolr::DublinCoreCrosswalk.new
+
     def encode _, record
       dc_hash = dublin_core_hash(record)
 
@@ -33,41 +38,53 @@ def self.rights_statement(record, statements = access_statements(record))
 
     private
 
+    # Build the hash of Dublin Core field name => value(s) for a record, using the solr
+    # document where it has the data and the MARC crosswalk for the rest.
+    # @param [OAISolr::Record] record
+    # @return [Hash] the Dublin Core fields, with nil and empty entries removed
     def dublin_core_hash(record)
-      # TODO: to_dublin_core doesn't do much useful in the current release of
-      # ruby-marc - the only things we're keeping from it are "source" and
-      # "relation"
-      record.marc_record.to_dublin_core.compact.tap do |dc|
-        dc.default_proc = proc { |hash, key| hash[key] = [] }
-
-        dc["type"] = "text"
-        dc["date"] = record.solr_document["display_date"]
-        dc["description"] = description(record)
-        dc["rights"] = self.class.rights_statement(record)
-
-        %w[publisher language format subject_display authorStr]
-          .reject { |k| record.solr_document[k].nil? }
-          .each { |k| dc[k] = [record.solr_document[k]].flatten }
-
-        dc["subject"] = dc.delete("subject_display")
-        dc["creator"] = dc.delete("authorStr")
-
-        # the old OAI provider doesn't include dc:coverage, and what rubymarc
-        # gives is as badly-formatted as the authors & subjects
-        dc.delete("coverage")
-
-        record.solr_document["oclc"]&.each { |o| dc["identifier"] << "(OCoLC)#{o}" }
-        record.solr_document["ht_id"].each { |htid| dc["identifier"] << "#{Settings.handle}#{htid}" }
-        record.solr_document["isbn"]&.each { |isbn| dc["identifier"] << isbn }
-      end.reject { |_k, v| v.nil? || v.empty? }
-    end
+      dc = {}
+
+      # Set stuff that's constant for HT items
+      dc["type"] = "text"
+      dc["rights"] = self.class.rights_statement(record)
+
+      # Get stuff out of the solr document
+      dc["date"] = record.first_solr_value("display_date")
+      dc["language"] = record.first_solr_value("language")
+      dc["publisher"] = record.first_solr_value("publisher")
+      dc["subject"] = record.solr_value("subject_display")
+      dc["format"] = record.first_solr_value("format")
 
-    # Current implementation appears to use 300
-    # ruby-marc's next release will likely use 500
-    def description(record)
-      return unless record.marc_record["300"]
+      marc = record.marc_record
+
+      # The LoC spec says to NOT use creator, and instead use contributor, but our users
+      # have asked that we keep this the same as before, using creator.
+      dc["creator"] = CROSSWALK.contributor(marc)
+
+      # Pull the rest from the record according to the Library of Congress crosswalk.
+      # publisher and format only fall back to the crosswalk when solr had nothing.
+      dc["publisher"] ||= CROSSWALK.publisher(marc)
+      dc["coverage"] = CROSSWALK.coverage(marc)
+      dc["description"] = CROSSWALK.description(marc)
+      dc["format"] ||= CROSSWALK.format(marc)
+      dc["relation"] = CROSSWALK.relation(marc)
+      dc["source"] = CROSSWALK.source(marc)
+      dc["title"] = CROSSWALK.title(marc)
+
+      # Get the identifiers, each with the prefix that says what kind it is
+      dc["identifier"] = record.solr_array("oclc").map { |id| "(OCoLC)#{id}" }
+        .concat(record.solr_array("ht_id").map { |htid| "#{Settings.handle}#{htid}" })
+        .concat(record.solr_array("isbn").map { |isbn| "ISBN #{isbn}" })
+        .concat(record.solr_array("issn").map { |issn| "ISSN #{issn}" })
+        .concat(record.solr_array("lccn").map { |lccn| "LCCN #{lccn}" })
+      # Flatten it all out and get rid of nils, duplicates and empty strings. Build fresh
+      # arrays rather than using the bang methods, so arrays handed to us by the record or
+      # the crosswalk are never modified in place.
+      dc.each do |field, values|
+        next unless values.is_a?(Array)
+        dc[field] = values.flatten.compact.uniq.reject { |x| x == "" }
+      end
 
-      record.marc_record["300"].subfields.select { |sub| %w[a b c].include? sub.code }.map { |sub| sub.value }.join(" ")
+      # Ditch everything that's empty or nil
+      dc.reject! { |_k, v| v.nil? || v.empty? }
+      dc
     end
 
     # Returns an array of unique access statements for each HTID on record