Skip to content

Commit

Permalink
Merge pull request #26 from hathitrust/marc_to_dublin_core_crosswalk
Browse files Browse the repository at this point in the history
Adding new class to convert marc to Dublin Core crosswalk
  • Loading branch information
aelkiss authored Apr 10, 2023
2 parents fd511bb + 7313245 commit 39d5aa0
Show file tree
Hide file tree
Showing 6 changed files with 464 additions and 33 deletions.
87 changes: 87 additions & 0 deletions lib/oai_solr/basic_marc_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# frozen_string_literal: true

require "set"
require_relative "basic_marc_single_extractor"

module OAISolr
  # A collection of BasicMARCSingleExtractors that will collect their combined values from
  # a MARC::Record.
  class BasicMARCExtractor
    # Create a new object, optionally passing tags/codes to add a first BasicMARCSingleExtractor.
    # @param [String,Array<String>,Range<String>] tags Single tag, array of tags, or range over 3-digit MARC tags
    # @param [String, Range<String>] subfield_codes Either a single string with all the desired subfield codes,
    #   e.g., "abcek", or a range, e.g., "a".."m". Optional.
    # @example
    #   bme = BasicMARCExtractor.new; bme << BasicMARCSingleExtractor.new("245", "ab")
    #   bme = BasicMARCExtractor.new("245", "ab")
    #   bme = BasicMARCExtractor.new("600".."699", "a".."z")
    def initialize(tags = nil, subfield_codes = nil)
      @single_extractors = []
      # Start with empty lookups so #interesting_tag? and #values are safe to call
      # before any single extractor has been added.
      @interesting_single_tags = ::Set.new
      @interesting_ranges = ::Set.new
      self << BasicMARCSingleExtractor.new(tags, subfield_codes) if tags
    end

    # Given an array of duples (as from config), build up an extractor using `#<<`
    # @param [Array<Array>] tag_code_pairs Array of arrays of the form [ [tags, subfield_codes], ...]
    # @return [OAISolr::BasicMARCExtractor] a new extractor holding one single extractor per pair
    # @raise [RuntimeError] if the first element of tag_code_pairs is not an Array
    #   (which includes the case of an empty tag_code_pairs)
    # @example
    #   bme = BasicMARCExtractor.from_pairs([["245", "ab"], ["100".."111", "abd"]])
    # @see OAISolr::BasicMARCSingleExtractor#initialize for supported syntax
    def self.from_pairs(tag_code_pairs)
      unless tag_code_pairs.first.is_a?(Array)
        # `self` is the class here; `self.class` would just print "Class"
        raise "#{self}.from_pairs takes an array of arrays"
      end

      tag_code_pairs.each_with_object(new) do |(tags, codes), extractor|
        extractor << BasicMARCSingleExtractor.new(tags, codes)
      end
    end

    # Add a previously constructed single extractor, and re-compute the set of interesting tags
    # @param [OAISolr::BasicMARCSingleExtractor] basic_marc_single_extractor
    # @return [OAISolr::BasicMARCExtractor] self, so calls can be chained
    def <<(basic_marc_single_extractor)
      @single_extractors << basic_marc_single_extractor
      set_interesting_tags!
      self
    end

    # For efficiency, keep track of which field tags are "interesting" to this specific extractor,
    # so we don't have to check the whole list of field tags for every BasicMARCSingleExtractor
    # @see set_interesting_tags!
    # @param [String] tag The field tag
    # @return [Boolean]
    def interesting_tag?(tag)
      # Set membership is the cheap test, so try it before walking the ranges.
      @interesting_single_tags.include?(tag) || @interesting_ranges.any? { |rng| rng.cover?(tag) }
    end

    # Get a list of the "interesting" fields (by tag), and run each single extractor in turn
    # on them. Flatten, compact, and uniq the resulting strings and return
    # @param [MARC::Record] rec The record from which to extract data; it is only
    #   used as an enumerable of fields that respond to #tag
    # @return [Array<String>] array of extracts
    def values(rec)
      rec.select { |field| interesting_tag?(field.tag) }
        .flat_map { |field| @single_extractors.flat_map { |extractor| extractor.value(field) } }
        .compact
        .uniq
    end

    private

    # We want to efficiently determine if the tag is one that we're interested in.
    # We support single tags, arrays of (single) tags, and tag ranges. The first two
    # merge into one set; the ranges we handle separately for efficiency (no sense in
    # turning '600'..'699' into an array)
    def set_interesting_tags!
      ranges, singles = @single_extractors.map(&:computed_tags).partition { |tags| tags.is_a?(Range) }
      @interesting_ranges = ::Set.new(ranges)
      # Array() turns a lone tag into a one-element list and leaves arrays as they are
      @interesting_single_tags = ::Set.new(singles.flat_map { |tags| Array(tags) })
    end
  end
end
161 changes: 161 additions & 0 deletions lib/oai_solr/basic_marc_single_extractor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
module OAISolr
  # Build up a simple object to quasi-efficiently extract values from MARC tag/subfield-codes
  # based on a simplistic query specification.
  #
  # A single BasicMARCSingleExtractor will extract a specific set of subfields from the
  # given tag specification.
  #
  # The set (or single) of tags_to_match you want can be passed in as:
  #   * A single string. `"245"`
  #   * A three digit integer, which will be coerced into a string. `245`
  #     * Note that if you want a zero-led field (e.g., "050") you can't use the integer option
  #   * An array of tags_to_match. ["245", "100", "111"]
  #   * A range of Strings that encompass all the tags you want. "600".."699"
  #
  # Subfield codes can be expressed as:
  #   * A string containing all the subfields you want. "abdek"
  #   * A range of one-character strings. "a".."n"
  #   * Omitted (or nil), in which case all the alphabetic subfields (ALPHA) are used
  #
  # Control field: for "codes", pass a range of characters to fetch
  #   * When dealing with a control field, the "codes" passed should actually be a range of integers
  #     corresponding to the indexes (zero-based) of the characters you want from that value.
  class BasicMARCSingleExtractor
    # Generally, MARC fields have the data in alphabetical subfields, and metadata (e.g., links to
    # other fields) in numbered subfields. We'll use all the "letter" subfields as the
    # default for which subfields to use.
    ALPHA = "a".."z"

    attr_reader :tags, :codes, :computed_tags

    # Create a new extractor for the given tag(s) and subfield code(s)
    # Note that this code just creates a method to determine if a field matches the desired tags_to_match,
    # and another to actually extract data from the subfields of those matched fields.
    #
    # Everything else in this class is just support to create the #matches_tag? and
    # #extract methods.
    #
    # @param [String, Integer, Array<String>, Range<String>] tags
    # @param [String, Range<String>, Range<Integer>, nil] codes The subfield codes to pull out, or an
    #   integer range of character positions for a control field. Defaults to ALPHA when omitted or nil.
    # @raise [RuntimeError] if tags or codes are of an unsupported type
    # @example One field tag, two subfield codes
    #   extractor = BasicMARCSingleExtractor.new("245", "ab")
    # @example An array of tags_to_match
    #   extractor = BasicMARCSingleExtractor.new(["100", "110", "111"], "abd")
    # @example A range of tags_to_match, and the default (all alphabetic) subfield codes
    #   extractor = BasicMARCSingleExtractor.new("600".."699") # subfield codes defaults to ALPHA
    # @example A single tag, with a range of subfields
    #   extractor = BasicMARCSingleExtractor.new("245", "a".."e")
    # @example Get the "date1" characters from the 008 field
    #   extractor = BasicMARCSingleExtractor.new("008", 7..10)
    def initialize(tags, codes = nil)
      @tags = tags
      @codes = codes || ALPHA
      define_singleton_method(:matches_tag?, tag_matcher(@tags))
      define_singleton_method(:extract, value_extractor(@codes))
    end

    # @!method matches_tag?(tag)
    #   Determines if the passed field tag (e.g., "245") is one that this extractor
    #   cares about.
    #   @param [String] tag
    #   @return [Boolean]

    # @!method extract(field)
    #   Takes a MARC::DataField or MARC::ControlField and:
    #     * get the values of the subfields with the wanted codes and
    #       return them as a single, space-delimited string
    #     * Get a range of characters from a control field, when the "codes" specified was
    #       actually an integer range.
    #   @param [MARC::DataField, MARC::ControlField] field
    #   @return [String, nil] the desired value(s), with subfield values joined with a space

    # If the "codes" that were passed was actually an integer range, we assume that we're dealing
    # with a control field.
    # @return [Boolean]
    def control_field?
      codes.is_a?(Range) && codes.begin.is_a?(Integer)
    end

    # Try to extract strings from the desired subfield values. If the field's tag
    # doesn't match, nothing is found, or we end up with an empty string, return nil
    # @param [MARC::DataField, MARC::ControlField] field
    # @return [String, nil] Space-delimited values of the wanted subfields
    def value(field)
      return nil unless matches_tag?(field.tag)

      val = extract(field) # defined dynamically in the constructor
      # A control-field slice is nil when the value is shorter than the requested
      # range, so guard against nil as well as the empty string.
      val.nil? || val.empty? ? nil : val
    end

    # To decide what values to extract, we first need to decide if a given field's tag
    # is one of the ones we care about for this extractor.
    #
    # Use the tag specification passed in the constructor and figure out
    # the best way to test if a field tag string (e.g., "245") matches the tags
    # covered by this extractor. Then build a lambda that will do that test.
    # As a side effect, sets @computed_tags to the normalized tag specification.
    #
    # The returned lambda is used in the constructor to create the #matches_tag? method
    # @param [String, Integer, Array<String>, Range<String>] tags_to_match
    # @return [Proc] a lambda that takes a single tag and sees if it matches this extractor
    # @raise [RuntimeError] if tags_to_match is none of the supported types
    def tag_matcher(tags_to_match)
      case tags_to_match
      when Integer, String
        @computed_tags = tags_to_match.to_s
        ->(t) { t.to_s == @computed_tags }
      when Array
        @computed_tags = tags_to_match.map(&:to_s).uniq
        # Coerce the incoming tag the same way the single-tag matcher does
        ->(t) { @computed_tags.include?(t.to_s) }
      when Range
        @computed_tags = tags_to_match
        ->(t) { @computed_tags.cover?(t) }
      else
        raise "Illegal argument '#{tags_to_match.inspect}'"
      end
    end

    # Given a subfield codes specification from the constructor, build an efficient
    # lambda to pull out the data from the given code(s) as a string. Used in the
    # constructor to make the #extract method.
    # @param [String, Range<String>, Range<Integer>] codes_or_control_field_range
    # @return [Proc] lambda that take a MARC::ControlField or MARC::DataField and pulls
    #   out the requested data.
    def value_extractor(codes_or_control_field_range)
      if control_field?
        control_field_extractor(codes_or_control_field_range)
      else
        datafield_extractor(codes_or_control_field_range)
      end
    end

    private

    # A control field extractor just gets the characters in the given range
    # @param [Range] integer_range Integer range (zero-based) of the chars you want
    # @return [Proc] lambda that will take a control field and extract the right characters;
    #   the lambda returns nil when the field has no value or the range starts past its end
    def control_field_extractor(integer_range)
      ->(control_field) { control_field.value&.slice(integer_range) }
    end

    # Subfield extraction for when the codes are specified as a single char, a bunch of chars,
    # or a char range. Each is treated separately to get the best performance for
    # each situation, because these things can add up when doing lots and lots of records.
    # @param [String, Range<String>] codes Which subfield codes to extract
    # @return [Proc] lambda that will correctly do the extraction and joining of values on the passed field.
    # @raise [RuntimeError] if codes is neither a String nor a Range
    def datafield_extractor(codes)
      case codes
      when String
        if codes.size == 1
          ->(data_field) { data_field.select { |sf| sf.code == codes }.map(&:value).join(" ").strip }
        else
          codesarray = codes.chars
          ->(data_field) { data_field.select { |sf| codesarray.include?(sf.code) }.map(&:value).join(" ").strip }
        end
      when Range
        ->(data_field) { data_field.select { |sf| codes.cover?(sf.code) }.map(&:value).join(" ").strip }
      else
        raise "Subfield codes must be either a string of chars, a range of chars, or a range of ints for control field extraction"
      end
    end
  end
end
81 changes: 49 additions & 32 deletions lib/oai_solr/dublin_core.rb
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
require "oai"
require "rights_database"
require "oai_solr/dublin_core_crosswalk"

module OAISolr
class DublinCore < OAI::Provider::Metadata::DublinCore
# A Dublin Core crosswalk object for translating MARC records into
# Dublin Core fields.
CROSSWALK = OAISolr::DublinCoreCrosswalk.new

def encode _, record
dc_hash = dublin_core_hash(record)

Expand Down Expand Up @@ -33,41 +38,53 @@ def self.rights_statement(record, statements = access_statements(record))

private

# @param [OAISolr::Record] record
def dublin_core_hash(record)
# TODO: to_dublin_core doesn't do much useful in the current release of
# ruby-marc - the only things we're keeping from it are "source" and
# "relation"
record.marc_record.to_dublin_core.compact.tap do |dc|
dc.default_proc = proc { |hash, key| hash[key] = [] }

dc["type"] = "text"
dc["date"] = record.solr_document["display_date"]
dc["description"] = description(record)
dc["rights"] = self.class.rights_statement(record)

%w[publisher language format subject_display authorStr]
.reject { |k| record.solr_document[k].nil? }
.each { |k| dc[k] = [record.solr_document[k]].flatten }

dc["subject"] = dc.delete("subject_display")
dc["creator"] = dc.delete("authorStr")

# the old OAI provider doesn't include dc:coverage, and what rubymarc
# gives is as badly-formatted as the authors & subjects
dc.delete("coverage")

record.solr_document["oclc"]&.each { |o| dc["identifier"] << "(OCoLC)#{o}" }
record.solr_document["ht_id"].each { |htid| dc["identifier"] << "#{Settings.handle}#{htid}" }
record.solr_document["isbn"]&.each { |isbn| dc["identifier"] << isbn }
end.reject { |_k, v| v.nil? || v.empty? }
end
dc = {}

# Set stuff that's constant for HT items
dc["type"] = "text"
dc["rights"] = self.class.rights_statement(record)

# Get stuff out of the solr document
dc["date"] = record.first_solr_value("display_date")
dc["language"] = record.first_solr_value("language")
dc["publisher"] = record.first_solr_value("publisher")
dc["subject"] = record.solr_value("subject_display")
dc["format"] = record.first_solr_value("format")

# Current implementation appears to use 300
# ruby-marc's next release will likely use 500
def description(record)
return unless record.marc_record["300"]
marc = record.marc_record

# The LoC spec says to NOT use creator, and instead use contributor, but our users
# have asked that we keep this the same as before, using creator.
dc["creator"] = CROSSWALK.contributor(marc)

# Pull the rest from the record according to the Library of Congress crosswalk
dc["publisher"] ||= CROSSWALK.publisher(marc)
dc["coverage"] = CROSSWALK.coverage(marc)
dc["description"] = CROSSWALK.description(marc)
dc["format"] ||= CROSSWALK.format(marc)
dc["relation"] = CROSSWALK.relation(marc)
dc["source"] = CROSSWALK.source(marc)
dc["title"] = CROSSWALK.title(marc)

# Collect the identifiers, each labelled with its scheme. ISSNs get their own
# "ISSN" label rather than being reported as ISBNs.
dc["identifier"] = record.solr_array("oclc").map { |id| "(OCoLC)#{id}" }
  .concat(record.solr_array("ht_id").map { |htid| "#{Settings.handle}#{htid}" })
  .concat(record.solr_array("isbn").map { |isbn| "ISBN #{isbn}" })
  .concat(record.solr_array("issn").map { |issn| "ISSN #{issn}" })
  .concat(record.solr_array("lccn").map { |lccn| "LCCN #{lccn}" })
# Flatten it all out and get rid of nils and duplicates
dc.select { |k, v| v.is_a?(Array) }.each_pair do |_field, values|
values.flatten!
values.compact!
values.uniq!
values.reject! { |x| x == "".freeze }
end

record.marc_record["300"].subfields.select { |sub| %w[a b c].include? sub.code }.map { |sub| sub.value }.join(" ")
# Ditch everything that's empty or nil
dc.reject! { |_k, v| v.nil? || v.empty? }
dc
end

# Returns an array of unique access statements for each HTID on record
Expand Down
Loading

0 comments on commit 39d5aa0

Please sign in to comment.