From 05168ab543dbba9e6fc4cdaa8c24fa2402eddd72 Mon Sep 17 00:00:00 2001
From: Bill Dueber <bill@dueber.com>
Date: Mon, 13 Mar 2023 22:46:38 -0400
Subject: [PATCH 1/3] Add LoC(-inspired) MARC -> Dublin Core crosswalk

Add a new class to convert MARC to Dublin Core, and use it to
build up the Dublin Core record in addition to stuff pulled
directly from the solr document.

  * Add DC Crosswalk
  * Retool dublin_core.rb to use it. This eliminates any uses of
    MARC::Record.to_dublin_core that were present.
  * Add tests that reflect the new fields being pulled out.

This mostly follows the definition laid out by the Library of Congress
at https://www.loc.gov/marc/marc2dc.html, with deviations based on
what we used to do and requests from our most active user.

Use of the new crosswalk object is as follows.

```
require "oai_solr/dublin_core_crosswalk"

dcc = OAISolr::DublinCoreCrosswalk.new

hash_of_element_value_pairs = dcc.full_map(rec)

title = dcc.title(rec)
identifier = dcc.identifier(rec)

```
---
 docker-compose.yml                          |  13 ++
 lib/oai_solr/basic_marc_extractor.rb        |  87 +++++++++++
 lib/oai_solr/basic_marc_single_extractor.rb | 161 ++++++++++++++++++++
 lib/oai_solr/dublin_core.rb                 |  81 ++++++----
 lib/oai_solr/dublin_core_crosswalk.rb       | 157 +++++++++++++++++++
 lib/oai_solr/record.rb                      |  12 ++
 spec/oai_solr_record_spec.rb                |  18 ++-
 7 files changed, 496 insertions(+), 33 deletions(-)
 create mode 100644 lib/oai_solr/basic_marc_extractor.rb
 create mode 100644 lib/oai_solr/basic_marc_single_extractor.rb
 create mode 100644 lib/oai_solr/dublin_core_crosswalk.rb
diff --git a/docker-compose.yml b/docker-compose.yml
index 9316df5..5769116 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -31,6 +31,19 @@ services:
       - solr-sdr-catalog
       - mariadb
 
+  test-persist:
+    build: .
+    volumes:
+      - .:/usr/src/app
+      - gem_cache:/gems
+    command: bash -c "bin/wait-for solr-sdr-catalog:9033 mariadb:3306"
+    environment:
+      SOLR_URL: http://solr-sdr-catalog:9033/solr/catalog
+      RIGHTS_DATABASE_CONNECTION_STRING: "mysql2://ht_rights:ht_rights@mariadb/ht"
+    depends_on:
+      - solr-sdr-catalog
+      - mariadb
+
   solr-sdr-catalog:
     image: ghcr.io/hathitrust/catalog-solr-sample
     ports:
diff --git a/lib/oai_solr/basic_marc_extractor.rb b/lib/oai_solr/basic_marc_extractor.rb
new file mode 100644
index 0000000..73e87e1
--- /dev/null
+++ b/lib/oai_solr/basic_marc_extractor.rb
@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+
+require "set"
+require_relative "basic_marc_single_extractor"
+
+module OAISolr
+  # A collection of BasicMARCSingleExtractors that will collect their combined values from
+  # a MARC::Record.
+  class BasicMARCExtractor
+    # Create a new object, optionally passing tags/codes to add a first BasicMARCSingleExtractor
+    # @param [String,Array<String>,Range<String>] tags Single tag, array of tags, or range over 3-digit MARC tags
+    # @param [String, Range<String>] subfield_codes Either a single string with all the desired subfield codes
+    #   e.g., "abcek", or a range, e.g., "a".."m". Optional.
+    # @example
+    #   bme = BasicMARCExtractor.new; bme << BasicMARCSingleExtractor.new("245", "ab")
+    #   bme = BasicMARCExtractor.new("245", "ab")
+    #   bme = BasicMARCExtractor.new("600".."699", "a".."z")
+    def initialize(tags = nil, subfield_codes = nil)
+      @single_extractors = []
+      if tags
+        self << BasicMARCSingleExtractor.new(tags, subfield_codes)
+      end
+    end
+
+    # Given an array of duples (as from config), build up an extractor using `#<<`
+    # @param [Array<Array<String>>] tag_code_pairs Array of arrays of the form [ [tags, subfield_codes], ...]
+    # @return [OAISolr::BasicMARCExtractor] a new extractor with one single-extractor per pair
+    # @raise [RuntimeError] if the first element of tag_code_pairs is not an Array
+    # @example
+    #   bme = BasicMARCExtractor.from_pairs([["245", "ab"], ["100".."111", "abd"]])
+    # @see OAISolr::BasicMARCSingleExtractor#initialize for supported syntax
+    def self.from_pairs(tag_code_pairs)
+      unless tag_code_pairs.first.is_a?(Array)
+        raise "#{self}.from_pairs takes an array of arrays" # self is the class here; self.class would print "Class"
+      end
+      tag_code_pairs.each_with_object(new) { |(tag, codes), bme| bme << BasicMARCSingleExtractor.new(tag, codes) }
+    end
+
+    # Add a previously constructed single extractor, and re-compute the set of interesting tags
+    # @param [OAISolr::BasicMARCSingleExtractor] basic_marc_single_extractor
+    # @return [OAISolr::BasicMARCExtractor]
+    def <<(basic_marc_single_extractor)
+      @single_extractors << basic_marc_single_extractor
+      set_interesting_tags!
+      self
+    end
+
+    # For efficiency, keep track of which field tags are "interesting" to this specific extractor,
+    # so we don't have to check the whole list of field tags for every BasicMARCSingleExtractor
+    # @see set_interesting_tags!
+    # @param [String] tag The field tag
+    # @return [Boolean]
+    def interesting_tag?(tag)
+      @interesting_ranges.any? { |rng| rng.cover?(tag) } or @interesting_single_tags.include?(tag)
+    end
+
+    # Get a list of the "interesting" fields (by tag), and run each single extractor in turn
+    # on them. Flatten, compact, and uniq the resulting strings and return
+    # @param [MARC::Record] rec The record from which to extract data
+    # @return [Array<String>] array of extracts
+    def values(rec)
+      rec.select { |field| interesting_tag?(field.tag) }
+        .flat_map { |f| @single_extractors.flat_map { |extractor| extractor.value(f) } }
+        .compact.uniq
+    end
+
+    private
+
+    # We want to efficiently determine if the tag is one that we're interested in.
+    # We support single tags, arrays of (single) tags, and tag ranges. The first two
+    # merge into one set; the ranges we handle separately for efficiency (no sense in
+    # turning '600'..'699' into an array)
+    def set_interesting_tags!
+      @interesting_single_tags = Set.new
+      @interesting_ranges = Set.new
+      @single_extractors.map(&:computed_tags).each do |tags|
+        case tags
+        when Range
+          @interesting_ranges << tags
+        else
+          @interesting_single_tags += Array(tags)
+        end
+        @interesting_single_tags.flatten!
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/basic_marc_single_extractor.rb b/lib/oai_solr/basic_marc_single_extractor.rb
new file mode 100644
index 0000000..6081f60
--- /dev/null
+++ b/lib/oai_solr/basic_marc_single_extractor.rb
@@ -0,0 +1,161 @@
+module OAISolr
+  # Build up a simple object to quasi-efficiently extract values from MARC tag/subfield-codes
+  # based on a simplistic query specification.
+  #
+  # A single BasicMARCSingleExtractor will extract a specific set of subfields from the
+  # given tag specification.
+  #
+  # The set (or single) of tags_to_match you want can be passed in as:
+  #   * A single string. `"245"`
+  #   * A three digit integer, which will be coerced into a string. `245`
+  #     * Note that if you want a zero-led field (e.g., "050") you can't use the integer option
+  #   * An array of tags_to_match. ["245", "100", "111"]
+  #   * A range of Strings that encompasses all the tags you want. "600".."699"
+  #
+  # Subfield codes can be expressed as:
+  #   * A string containing all the subfields you want. "abdek"
+  #   * A range of one-character strings. "a".."n"
+  #
+  # Control field: for "codes", pass a range of characters to fetch
+  #   * When dealing with a control field, the "codes" passed should actually be a range of integers
+  #     corresponding to the indexes (zero-based) of the characters you want from that value.
+  class BasicMARCSingleExtractor
+    # Generally, MARC fields have the data in alphabetic subfields, and metadata (e.g., links to
+    # other fields) in numbered subfields.  We'll use all the "letter" subfields as the
+    # default for which subfields to use.
+    ALPHA = "a".."z"
+
+    attr_reader :tags, :codes, :computed_tags
+
+    # Create a new extractor for the given tag(s) and subfield code(s)
+    # Note that this code just creates a method to determine if a field matches the desired tags_to_match,
+    # and another to actually extract data from the subfields of those matched fields.
+    #
+    # Everything else in this class is just support to create the #matches_tag? and
+    # #extract methods.
+    #
+    # @param [String, Array<String>, Range<String>] tags
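+    # @param [String, Range<String>, Range<Integer>, nil] codes Subfield codes to extract, or an Integer range of character positions for a control field; nil means ALPHA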
+    # @example One field tag, two subfield codes
+    #   extractor = BasicMARCSingleExtractor.new("245", "ab")
+    # @example An array of tags_to_match
+    #   extractor = BasicMARCSingleExtractor.new(["100", "110", "111"], "abd")
+    # @example A range of tags_to_match, and the default (all alphabetic) subfield codes
+    #   extractor = BasicMARCSingleExtractor.new("600".."699") # subfield codes defaults to ALPHA
+    # @example A single tag, with a range of subfields
+    #   extractor = BasicMARCSingleExtractor.new("245", "a".."e")
+    # @example Get the "date1" characters from the 008 field
+    #   extractor = BasicMARCSingleExtractor.new("008", 7..10)
+    def initialize(tags, codes)
+      @tags = tags
+      @codes = codes || ALPHA
+      define_singleton_method(:matches_tag?, tag_matcher(@tags))
+      define_singleton_method(:extract, value_extractor(@codes))
+    end
+
+    # @!method matches_tag?(tag)
+    #   Determines if the passed field tag (e.g., "245") is one that this extractor
+    #   cares about.
+    #   @param [String] tag
+    #   @return [Boolean]
+
+    # @!method extract(field)
+    #   Takes a MARC::DataField or MARC::ControlField and:
+    #     * get the values of the subfields with the wanted codes and
+    #       return them as a single, space-delimited string
+    #     * Get a range of characters from a control field, when the "codes" specified was
+    #       actually an integer range.
+    #   @param [MARC::DataField, MARC::ControlField] field
+    #   @return [String] the desired value(s), with subfield values joined with a space
+
+    # If the "codes" that were passed was actually an integer range, we assume that we're dealing
+    # with a control field.
+    def control_field?
+      codes.is_a?(Range) and codes.begin.is_a?(Integer)
+    end
+
+    # Try to extract strings from the desired subfield values (or control-field characters).
+    # Returns nil when the tag doesn't match, nothing is extracted, or the result is empty.
+    # @param [MARC::DataField, MARC::ControlField] field
+    # @return [String, nil] Space-delimited values of the wanted subfields
+    def value(field)
+      return nil unless matches_tag?(field.tag)
+
+      # #extract is defined dynamically in the constructor. For a control field it is a
+      # String#slice, which returns nil when the range starts beyond the end of the value,
+      # so check for nil before checking for emptiness.
+      val = extract(field)
+      val.nil? || val.empty? ? nil : val
+    end
+
+    # To decide what values to extract, we first need to decide if a given field's tag
+    # is one of the ones we care about for this extractor.
+    #
+    # Use the tag specification passed in the constructor and figure out
+    # the best way to test if a field tag string (e.g., "245") matches the tags
+    # covered by this extractor. Then build a lambda that will do that test.
+    #
+    # The returned lambda is used in the constructor to create the #matches_tag? method
+    # @param  [String, Array<String>, Range<String>] tags_to_match
+    # @return [Proc] a lambda that takes a single tag and sees if it matches this extractor
+    def tag_matcher(tags_to_match)
+      case tags_to_match
+      when Integer, String
+        @computed_tags = tags_to_match.to_s
+        ->(t) { t.to_s == @computed_tags }
+      when Array
+        @computed_tags = tags_to_match.map(&:to_s).uniq
+        ->(t) { @computed_tags.include? t }
+      when Range
+        @computed_tags = tags_to_match
+        ->(t) { @computed_tags.cover?(t) }
+      else
+        raise "Illegal argumrnt '#{tags_to_match.inspect}'"
+      end
+    end
+
+    # Given a subfield codes specification from the constructor, build an efficient
+    # lambda to pull out the data from the given code(s) as a string. Used in the
+    # constructor to make the #extract method.
+    # @param [String, Range<String>, Range<Integer>] codes_or_control_field_range
+    # @return [Proc] lambda that takes a MARC::ControlField or MARC::DataField and pulls
+    #   out the requested data.
+    def value_extractor(codes_or_control_field_range)
+      if control_field?
+        control_field_extractor(codes_or_control_field_range)
+      else
+        datafield_extractor(codes_or_control_field_range)
+      end
+    end
+
+    private
+
+    # A control field extractor just gets the characters in the given range
+    # @param [Range] integer_range  Integer range (zero-based) of the chars you want
+    # @return [Proc] lambda that will take a control field and extract the right characters
+    def control_field_extractor(integer_range)
+      ->(control_field) { control_field.value.slice(integer_range) }
+    end
+
+    # Subfield extraction for when the codes are specified as a single char, a bunch of chars,
+    # or a char range.  Each is treated separately to get the best performance for
+    # each situation, because these things can add up when doing lots and lots of records.
+    # @param [String] codes A string of which subfield codes to extract
+    # @return [Proc] lambda that will correctly do the extraction and joining of values on the passed field.
+    def datafield_extractor(codes)
+      case codes
+      when String
+        if codes.size == 1
+          ->(data_field) { data_field.select { |sf| sf.code == codes }.map(&:value).join(" ").strip }
+        else
+          codesarray = codes.chars
+          ->(data_field) { data_field.select { |sf| codesarray.include? sf.code }.map(&:value).join(" ").strip }
+        end
+      when Range
+        ->(data_field) { data_field.select { |sf| codes.cover? sf.code }.map(&:value).join(" ").strip }
+      else
+        raise "Subfield codes must be either a string of chars, a range of chars, or a range of ints for control field extraction"
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/dublin_core.rb b/lib/oai_solr/dublin_core.rb
index 7c6a210..2d3ad14 100644
--- a/lib/oai_solr/dublin_core.rb
+++ b/lib/oai_solr/dublin_core.rb
@@ -1,8 +1,13 @@
 require "oai"
 require "rights_database"
+require "oai_solr/dublin_core_crosswalk"
 
 module OAISolr
   class DublinCore < OAI::Provider::Metadata::DublinCore
+    # A Dublin Core crosswalk object for translating MARC records into
+    # the Dublin Core fields.
+    CROSSWALK = OAISolr::DublinCoreCrosswalk.new
+
     def encode _, record
       dc_hash = dublin_core_hash(record)
 
@@ -33,41 +38,53 @@ def self.rights_statement(record, statements = access_statements(record))
 
     private
 
+    # @param [OAISolr::Record] record
+    # @return [Hash] Dublin Core element name => value(s); nil and empty values are dropped
     def dublin_core_hash(record)
-      # TODO: to_dublin_core doesn't do much useful in the current release of
-      # ruby-marc - the only things we're keeping from it are "source" and
-      # "relation"
-      record.marc_record.to_dublin_core.compact.tap do |dc|
-        dc.default_proc = proc { |hash, key| hash[key] = [] }
-
-        dc["type"] = "text"
-        dc["date"] = record.solr_document["display_date"]
-        dc["description"] = description(record)
-        dc["rights"] = self.class.rights_statement(record)
-
-        %w[publisher language format subject_display authorStr]
-          .reject { |k| record.solr_document[k].nil? }
-          .each { |k| dc[k] = [record.solr_document[k]].flatten }
-
-        dc["subject"] = dc.delete("subject_display")
-        dc["creator"] = dc.delete("authorStr")
-
-        # the old OAI provider doesn't include dc:coverage, and what rubymarc
-        # gives is as badly-formatted as the authors & subjects
-        dc.delete("coverage")
-
-        record.solr_document["oclc"]&.each { |o| dc["identifier"] << "(OCoLC)#{o}" }
-        record.solr_document["ht_id"].each { |htid| dc["identifier"] << "#{Settings.handle}#{htid}" }
-        record.solr_document["isbn"]&.each { |isbn| dc["identifier"] << isbn }
-      end.reject { |_k, v| v.nil? || v.empty? }
-    end
+      dc = {}
+      # Set stuff that's constant for HT items
+      dc["type"] = "text"
+      dc["rights"] = self.class.rights_statement(record)
+
+      # Get stuff out of the solr document
+      dc["date"] = record.first_solr_value("display_date")
+      dc["language"] = record.first_solr_value("language")
+      dc["publisher"] = record.first_solr_value("publisher")
+      dc["subject"] = record.solr_value("subject_display")
+      dc["format"] = record.first_solr_value("format")
 
-    # Current implementation appears to use 300
-    # ruby-marc's next release will likely use 500
-    def description(record)
-      return unless record.marc_record["300"]
+      marc = record.marc_record
+
+      # The LoC spec says to NOT use creator, and instead use contributor, but our users
+      # have asked that we keep this the same as before, using creator.
+      dc["creator"] = CROSSWALK.contributor(marc)
+
+      # Pull the rest from the record according to the Library of Congress crosswalk
+      dc["publisher"] ||= CROSSWALK.publisher(marc)
+      dc["coverage"] = CROSSWALK.coverage(marc)
+      dc["description"] = CROSSWALK.description(marc)
+      dc["format"] ||= CROSSWALK.format(marc)
+      dc["relation"] = CROSSWALK.relation(marc)
+      dc["source"] = CROSSWALK.source(marc)
+      dc["title"] = CROSSWALK.title(marc)
+
+      # Get the identifiers, each labelled with its own scheme
+      dc["identifier"] = record.solr_array("oclc").map { |id| "(OCoLC)#{id}" }
+        .concat(record.solr_array("ht_id").map { |htid| "#{Settings.handle}#{htid}" })
+        .concat(record.solr_array("isbn").map { |isbn| "ISBN #{isbn}" })
+        .concat(record.solr_array("issn").map { |issn| "ISSN #{issn}" })
+        .concat(record.solr_array("lccn").map { |lccn| "LCCN #{lccn}" })
+      # Flatten it all out and get rid of nils and duplicates
+      dc.select { |k, v| v.is_a?(Array) }.each_pair do |_field, values|
+        values.flatten!
+        values.compact!
+        values.uniq!
+        values.reject! { |x| x == "".freeze }
+      end
+
-      record.marc_record["300"].subfields.select { |sub| %w[a b c].include? sub.code }.map { |sub| sub.value }.join(" ")
+      # Ditch everything that's empty or nil
+      dc.reject! { |_k, v| v.nil? || v.empty? }
+      dc
     end
 
     # Returns an array of unique access statements for each HTID on record
diff --git a/lib/oai_solr/dublin_core_crosswalk.rb b/lib/oai_solr/dublin_core_crosswalk.rb
new file mode 100644
index 0000000..0f0a51b
--- /dev/null
+++ b/lib/oai_solr/dublin_core_crosswalk.rb
@@ -0,0 +1,157 @@
+require "set"
+require_relative "basic_marc_extractor"
+
+module OAISolr
+  # Create an instance that will map MARC records to Dublin Core fields.
+  # Generally taken from the crosswalk at https://www.loc.gov/marc/marc2dc.html
+  # Mappings that can be easily specified as an OAISolr::BasicMARCExtractor are defined
+  # in the MAPPINGS constant. Anything more complex has its own method.
+  class DublinCoreCrosswalk
+    MAPPINGS = {
+
+      contributor: [
+        [%w[100 700], "abcdjq"],
+        [%w[110 710], "abcd"],
+        [%w[111 711], "acden"],
+        ["720", "a"]
+      ],
+
+      coverage: [
+        [651, nil],
+        [662, nil],
+        [751, nil],
+        [752, nil]
+      ],
+
+      # date -- see below
+
+      description: [
+        [["300"] + ("500".."599").to_a - %w[506 530 538 540 546], nil]
+      ],
+
+      format: [
+        [340, nil],
+        [856, "q"]
+      ],
+
+      identifier: [
+        [%w[020 022 024], "a"],
+        [856, "u"],
+        [%w[050 080 060], nil],
+        ["082", "ab"]
+      ],
+
+      language: [
+        ["008", 35..37],
+        ["041", "abdefghj"]
+      ],
+
+      publisher: [
+        ["260", "ab"]
+      ],
+
+      relation: [
+        ["530", nil],
+        [("760".."787"), "ot"]
+      ],
+
+      rights: [
+        ["506", nil],
+        ["540", nil]
+      ],
+
+      source: [
+        ["534", "t"],
+        ["540", nil],
+        ["786", "ot"]
+      ],
+      subject: [
+        ["600", "abcdefghjklmnopqrstuvxyz"],
+        ["610", "abcdefghklmnoprstuvxyz"],
+        ["611", "acdefghjklnpqstuvxyz"],
+        ["630", "adefghklmnoprstvxyz"],
+        ["650", "abcdevxyz"],
+        ["653", "abevyz"]
+      ],
+
+      title: [
+        ["245", "abdefgknp"],
+        ["246", "abdefgknp"]
+      ]
+
+      # type -- see below
+    }
+
+    # Build the instance that will do the data extraction based on the mappings
+    # in MAPPINGS.
+    def initialize
+      MAPPINGS.each_pair do |key, spec_pairs|
+        define_singleton_method(key.to_sym, basic_marc_extractor_proc(spec_pairs))
+      end
+    end
+
+    def full_map(rec)
+      {
+        contributor: contributor(rec),
+        coverage: coverage(rec),
+        date: date(rec),
+        description: description(rec),
+        format: self.format(rec), # need self to avoid keyword conflict
+        identifier: identifier(rec),
+        language: language(rec),
+        publisher: publisher(rec),
+        relation: relation(rec),
+        rights: rights(rec),
+        source: source(rec),
+        subject: subject(rec),
+        title: title(rec),
+        type: type(rec)
+      }.reject { |k, v| v.empty? }
+    end
+
+    # Get the best date possible, looking for four digits in the 008, then
+    # falling back to the 260cg
+    # @param [MARC::Record] rec
+    def date(rec)
+      possible_year = date_008(rec)
+      return possible_year if /\A\d{4}\Z/.match?(possible_year)
+
+      other_possible_date = date_260cg(rec)
+
+      if /\S/.match?(other_possible_date)
+        other_possible_date
+      else
+        possible_year
+      end
+    end
+
+    # Derive DC types from leader/06 and leader/07; "sound" is i or j (k is already an image type).
+    def type(rec)
+      leader6, leader7 = rec.leader[6], rec.leader[7]
+      types = []
+      types << "text" if %w[a c d t].include?(leader6)
+      types << "image" if %w[e f g k].include?(leader6)
+      types << "sound" if %w[i j].include?(leader6)
+      types << "collection" if (leader6 == "p") || %w[c s].include?(leader7)
+      types
+    end
+
+    private
+
+    def basic_marc_extractor_proc(pairs)
+      bme = BasicMARCExtractor.from_pairs(pairs)
+      ->(rec) { bme.values(rec) }
+    end
+
+    def date_008(rec)
+      rec["008"].value[7..10]
+    end
+
+    def date_260cg(rec)
+      two_sixty = rec["260"]
+      if two_sixty
+        [two_sixty["c"], two_sixty["g"]].join(" ")
+      end
+    end
+  end
+end
diff --git a/lib/oai_solr/record.rb b/lib/oai_solr/record.rb
index 419035d..89997b1 100644
--- a/lib/oai_solr/record.rb
+++ b/lib/oai_solr/record.rb
@@ -68,6 +68,18 @@ def solr_value(field)
       solr_document.has_key?(field) ? solr_document[field] : nil
     end
 
+    # @param [String] field Name of the solr field
+    # @return [String, Numeric, NilClass] The first found value, or nil if not found
+    def first_solr_value(field)
+      return nil unless solr_document.has_key?(field)
+      val = solr_document[field]
+      if val.is_a?(Array)
+        val.first
+      else
+        val
+      end
+    end
+
     # @param [String] field Name of the field
     # @return [Array<String>, Numeric, NilClass] The found value, or nil if not found
     def solr_array(field)
diff --git a/spec/oai_solr_record_spec.rb b/spec/oai_solr_record_spec.rb
index fbcf48e..bd2e309 100644
--- a/spec/oai_solr_record_spec.rb
+++ b/spec/oai_solr_record_spec.rb
@@ -75,6 +75,10 @@
       expect(parsed.css("dc|identifier").map { |c| c.text }).to include("(OCoLC)562083")
     end
 
+    it "has the LCCN as an identifier" do
+      expect(parsed.css("dc|identifier").map(&:text)).to include("LCCN agr48000285")
+    end
+
     it "has item handle as an dc:identifier" do
       handle = "http://hdl.handle.net/2027/uc1.31822013347232"
       expect(parsed.css("dc|identifier").map { |c| c.text }).to include(handle)
@@ -92,7 +96,19 @@
       let(:sdoc) { JSON.parse(File.read("spec/data/008553258.json")) }
 
       it "has ISBN as a dc:identifier" do
-        expect(parsed.css("dc|identifier").map { |c| c.text }).to include("9806741242")
+        expect(parsed.css("dc|identifier").map { |c| c.text }).to include("ISBN 9806741242")
+      end
+    end
+
+    context "with record with more complex data" do
+      let(:sdoc) { JSON.parse(File.read("spec/data/001718542.json")) }
+
+      it "gets the full title" do
+        expect(parsed.css("dc|title").first.text).to eq("Local government ... comprising statutes, orders, forms, cases, and local decisions of the Local government board ; 1908-.")
+      end
+
+      it "gets multiple creators" do
+        expect(parsed.css("dc|creator").size).to eq(2)
       end
     end
   end

From 132496a2938a6e49f498b54b837e8f1bf84d8cb3 Mon Sep 17 00:00:00 2001
From: "Brian \"Moses\" Hall" <moseshll@umich.edu>
Date: Tue, 28 Mar 2023 16:58:32 -0400
Subject: [PATCH 2/3] Address feedback on PR.

---
 docker-compose.yml                          | 13 --------
 lib/oai_solr/basic_marc_extractor.rb        |  4 +--
 lib/oai_solr/basic_marc_single_extractor.rb |  2 +-
 lib/oai_solr/dublin_core_crosswalk.rb       | 35 +++++----------------
 4 files changed, 11 insertions(+), 43 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 5769116..9316df5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -31,19 +31,6 @@ services:
       - solr-sdr-catalog
       - mariadb
 
-  test-persist:
-    build: .
-    volumes:
-      - .:/usr/src/app
-      - gem_cache:/gems
-    command: bash -c "bin/wait-for solr-sdr-catalog:9033 mariadb:3306"
-    environment:
-      SOLR_URL: http://solr-sdr-catalog:9033/solr/catalog
-      RIGHTS_DATABASE_CONNECTION_STRING: "mysql2://ht_rights:ht_rights@mariadb/ht"
-    depends_on:
-      - solr-sdr-catalog
-      - mariadb
-
   solr-sdr-catalog:
     image: ghcr.io/hathitrust/catalog-solr-sample
     ports:
diff --git a/lib/oai_solr/basic_marc_extractor.rb b/lib/oai_solr/basic_marc_extractor.rb
index 73e87e1..ca4a69b 100644
--- a/lib/oai_solr/basic_marc_extractor.rb
+++ b/lib/oai_solr/basic_marc_extractor.rb
@@ -71,8 +71,8 @@ def values(rec)
     # merge into one set; the ranges we handle separately for efficiency (no sense in
     # turning '600'..'699' into an array)
     def set_interesting_tags!
-      @interesting_single_tags = Set.new
-      @interesting_ranges = Set.new
+      @interesting_single_tags = ::Set.new
+      @interesting_ranges = ::Set.new
       @single_extractors.map(&:computed_tags).each do |tags|
         case tags
         when Range
diff --git a/lib/oai_solr/basic_marc_single_extractor.rb b/lib/oai_solr/basic_marc_single_extractor.rb
index 6081f60..97775c9 100644
--- a/lib/oai_solr/basic_marc_single_extractor.rb
+++ b/lib/oai_solr/basic_marc_single_extractor.rb
@@ -110,7 +110,7 @@ def tag_matcher(tags_to_match)
         @computed_tags = tags_to_match
         ->(t) { @computed_tags.cover?(t) }
       else
-        raise "Illegal argumrnt '#{tags_to_match.inspect}'"
+        raise "Illegal argument '#{tags_to_match.inspect}'"
       end
     end
 
diff --git a/lib/oai_solr/dublin_core_crosswalk.rb b/lib/oai_solr/dublin_core_crosswalk.rb
index 0f0a51b..114c23d 100644
--- a/lib/oai_solr/dublin_core_crosswalk.rb
+++ b/lib/oai_solr/dublin_core_crosswalk.rb
@@ -82,31 +82,17 @@ class DublinCoreCrosswalk
       # type -- see below
     }
 
-    # Build the instance that will do the data extraction based on the mappings
-    # in MAPPINGS.
-    def initialize
-      MAPPINGS.each_pair do |key, spec_pairs|
-        define_singleton_method(key.to_sym, basic_marc_extractor_proc(spec_pairs))
-      end
+    MAPPINGS.each do |key, spec_pairs|
+      bme = BasicMARCExtractor.from_pairs(spec_pairs)
+      define_method(key.to_sym, ->(rec) { bme.values(rec) })
     end
 
+    # If it's necessary to add a field that does not have an identically-named
+    # accessor, or is not in MAPPINGS, some adjustment may be necessary,
     def full_map(rec)
-      {
-        contributor: contributor(rec),
-        coverage: coverage(rec),
-        date: date(rec),
-        description: description(rec),
-        format: self.format(rec), # need self to avoid keyword conflict
-        identifier: identifier(rec),
-        language: language(rec),
-        publisher: publisher(rec),
-        relation: relation(rec),
-        rights: rights(rec),
-        source: source(rec),
-        subject: subject(rec),
-        title: title(rec),
-        type: type(rec)
-      }.reject { |k, v| v.empty? }
+      fields = MAPPINGS.keys + %i(type date)
+      Hash[fields.map {|field| [ field, self.send(field, rec)] }]
+        .reject { |k, v| v.empty? }
     end
 
     # Get the best date possible, looking for four digits in the 008, then
@@ -138,11 +124,6 @@ def type(rec)
 
     private
 
-    def basic_marc_extractor_proc(pairs)
-      bme = BasicMARCExtractor.from_pairs(pairs)
-      ->(rec) { bme.values(rec) }
-    end
-
     def date_008(rec)
       rec["008"].value[7..10]
     end

From 7313245617da14f584cb5da1a962dd78caf5a1fd Mon Sep 17 00:00:00 2001
From: "Brian \"Moses\" Hall" <moseshll@umich.edu>
Date: Wed, 29 Mar 2023 11:14:50 -0400
Subject: [PATCH 3/3] Appease standardrb

---
 lib/oai_solr/dublin_core_crosswalk.rb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/oai_solr/dublin_core_crosswalk.rb b/lib/oai_solr/dublin_core_crosswalk.rb
index 114c23d..6330d6f 100644
--- a/lib/oai_solr/dublin_core_crosswalk.rb
+++ b/lib/oai_solr/dublin_core_crosswalk.rb
@@ -90,9 +90,9 @@ class DublinCoreCrosswalk
     # If it's necessary to add a field that does not have an identically-named
     # accessor, or is not in MAPPINGS, some adjustment may be necessary,
     def full_map(rec)
-      fields = MAPPINGS.keys + %i(type date)
-      Hash[fields.map {|field| [ field, self.send(field, rec)] }]
-        .reject { |k, v| v.empty? }
+      fields = MAPPINGS.keys + %i[type date]
+      fields.map { |field| [field, send(field, rec)] }
+        .to_h.reject { |k, v| v.empty? }
     end
 
     # Get the best date possible, looking for four digits in the 008, then