From 41bf1c7c9f3c69ca3d34a687712cfcdf6cac10ec Mon Sep 17 00:00:00 2001 From: Greg Wiedeman Date: Fri, 15 Dec 2023 13:09:23 -0500 Subject: [PATCH 1/3] started storing unitdates in solr differently to preserve order --- lib/arclight/traject/ead2_component_config.rb | 14 ++++++++------ lib/arclight/traject/ead2_config.rb | 15 ++++++++------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/lib/arclight/traject/ead2_component_config.rb b/lib/arclight/traject/ead2_component_config.rb index ed0f4a9ac..b74df8ddf 100644 --- a/lib/arclight/traject/ead2_component_config.rb +++ b/lib/arclight/traject/ead2_component_config.rb @@ -107,15 +107,17 @@ to_field 'title_ssm', extract_xpath('./did/unittitle') to_field 'title_tesim', extract_xpath('./did/unittitle') -to_field 'unitdate_bulk_ssim', extract_xpath('./did/unitdate[@type="bulk"]') -to_field 'unitdate_inclusive_ssm', extract_xpath('./did/unitdate[@type="inclusive"]') -to_field 'unitdate_other_ssim', extract_xpath('./did/unitdate[not(@type)]') +to_field 'unitdates_ssm', extract_xpath('/ead/archdesc/did/unitdate') +to_field 'unitdates_labels_ssm' do |record, accumulator| + record.xpath('/ead/archdesc/did/unitdate').each do |unitdate| + accumulator << unitdate.attribute('type')&.value + end +end to_field 'normalized_date_ssm' do |_record, accumulator, context| accumulator << settings['date_normalizer'].constantize.new( - context.output_hash['unitdate_inclusive_ssm'], - context.output_hash['unitdate_bulk_ssim'], - context.output_hash['unitdate_other_ssim'] + context.output_hash['unitdates_ssm'], + context.output_hash['unitdates_labels_ssm'] ).to_s end diff --git a/lib/arclight/traject/ead2_config.rb b/lib/arclight/traject/ead2_config.rb index 0fcb55128..a2bb4fb00 100644 --- a/lib/arclight/traject/ead2_config.rb +++ b/lib/arclight/traject/ead2_config.rb @@ -80,10 +80,12 @@ to_field 'title_tesim', extract_xpath('/ead/archdesc/did/unittitle') to_field 'ead_ssi', extract_xpath('/ead/eadheader/eadid') -to_field 'unitdate_ssm', extract_xpath('/ead/archdesc/did/unitdate') -to_field 'unitdate_bulk_ssim', extract_xpath('/ead/archdesc/did/unitdate[@type="bulk"]') -to_field 'unitdate_inclusive_ssm', extract_xpath('/ead/archdesc/did/unitdate[@type="inclusive"]') -to_field 'unitdate_other_ssim', extract_xpath('/ead/archdesc/did/unitdate[not(@type)]') +to_field 'unitdates_ssm', extract_xpath('/ead/archdesc/did/unitdate') +to_field 'unitdates_labels_ssm' do |record, accumulator| + record.xpath('/ead/archdesc/did/unitdate').each do |unitdate| + accumulator << unitdate.attribute('type')&.value + end +end # All top-level docs treated as 'collection' for routing / display purposes to_field 'level_ssm' do |_record, accumulator| @@ -104,9 +106,8 @@ to_field 'normalized_date_ssm' do |_record, accumulator, context| accumulator << settings['date_normalizer'].constantize.new( - context.output_hash['unitdate_inclusive_ssm'], - context.output_hash['unitdate_bulk_ssim'], - context.output_hash['unitdate_other_ssim'] + context.output_hash['unitdates_ssm'], + context.output_hash['unitdates_labels_ssm'] ).to_s end From 5b4656347779f4fa177f288f7fab6200c5b9d883 Mon Sep 17 00:00:00 2001 From: Greg Wiedeman Date: Fri, 15 Dec 2023 14:43:25 -0500 Subject: [PATCH 2/3] restructured dates so that order is preserved --- lib/arclight/normalized_date.rb | 43 +++++------------ lib/arclight/traject/ead2_component_config.rb | 10 ++-- lib/arclight/traject/ead2_config.rb | 6 ++- spec/fixtures/ead/nlm/alphaomegaalpha.xml | 3 ++ spec/lib/arclight/normalized_date_spec.rb | 47 ++++++++++--------- 5 files changed, 52 insertions(+), 57 deletions(-) diff --git a/lib/arclight/normalized_date.rb b/lib/arclight/normalized_date.rb index f1ce3f174..1daf153ad 100644 --- a/lib/arclight/normalized_date.rb +++ b/lib/arclight/normalized_date.rb @@ -6,27 +6,24 @@ module Arclight # e.g., "1990-2000, bulk 1990-1999" # @see http://www2.archivists.org/standards/DACS/part_I/chapter_2/4_date class NormalizedDate - # @param [String | Array] `inclusive` from the `unitdate` - # @param [Array] `bulk` from the `unitdate` - # @param [Array] `other` from the `unitdate` when type is not specified - def initialize(inclusive, bulk = [], other = []) - @inclusive = (inclusive || []).map do |inclusive_text| - if inclusive_text.is_a? Array # of YYYY-YYYY for ranges - # NOTE: This code is not routable AFAICT in actual indexing. - # We pass arrays of strings (or xml nodes) here, and never a multidimensional array - year_range(inclusive_text) - elsif inclusive_text.present? - inclusive_text.strip + # @param [Array] an array of unitdate strings in order + # @param [Array] an array of corresponding type labels for dates or nil + def initialize(unitdates, unitdate_labels) + @date_accumulator = [] + if unitdates.present? + unitdates.each_with_index do |unitdate, i| + if unitdate_labels[i].downcase.match?('bulk') + @date_accumulator << "#{unitdate_labels[i]} #{unitdate}" + else + @date_accumulator << unitdate + end end - end&.join(', ') - - @bulk = Array.wrap(bulk).compact.map(&:strip).join(', ') - @other = Array.wrap(other).compact.map(&:strip).join(', ') + end end # @return [String] the normalized title/date def to_s - normalize + @date_accumulator.join(', ') end private @@ -36,19 +33,5 @@ def to_s def year_range(date_array) YearRange.new(date_array.include?('/') ? date_array : date_array.map { |v| v.tr('-', '/') }).to_s end - - # @see http://www2.archivists.org/standards/DACS/part_I/chapter_2/4_date for rules - def normalize - if inclusive.present? - result = inclusive.to_s - result << ", bulk #{bulk}" if bulk.present? - elsif other.present? - result = other.to_s - else - result = nil - end - - result&.strip - end end end diff --git a/lib/arclight/traject/ead2_component_config.rb b/lib/arclight/traject/ead2_component_config.rb index b74df8ddf..0ec568dd9 100644 --- a/lib/arclight/traject/ead2_component_config.rb +++ b/lib/arclight/traject/ead2_component_config.rb @@ -107,10 +107,14 @@ to_field 'title_ssm', extract_xpath('./did/unittitle') to_field 'title_tesim', extract_xpath('./did/unittitle') -to_field 'unitdates_ssm', extract_xpath('/ead/archdesc/did/unitdate') +to_field 'unitdates_ssm', extract_xpath('./did/unitdate') to_field 'unitdates_labels_ssm' do |record, accumulator| - record.xpath('/ead/archdesc/did/unitdate').each do |unitdate| - accumulator << unitdate.attribute('type')&.value + record.xpath('.//did/unitdate').each do |unitdate| + if unitdate.attribute('type') + accumulator << unitdate.attribute('type')&.value + else + accumulator << "" + end end end diff --git a/lib/arclight/traject/ead2_config.rb b/lib/arclight/traject/ead2_config.rb index a2bb4fb00..85dcfcb19 100644 --- a/lib/arclight/traject/ead2_config.rb +++ b/lib/arclight/traject/ead2_config.rb @@ -83,7 +83,11 @@ to_field 'unitdates_ssm', extract_xpath('/ead/archdesc/did/unitdate') to_field 'unitdates_labels_ssm' do |record, accumulator| record.xpath('/ead/archdesc/did/unitdate').each do |unitdate| - accumulator << unitdate.attribute('type')&.value + if unitdate.attribute('type') + accumulator << unitdate.attribute('type')&.value + else + accumulator << "" + end end end diff --git a/spec/fixtures/ead/nlm/alphaomegaalpha.xml b/spec/fixtures/ead/nlm/alphaomegaalpha.xml index 6e8c60d72..4c36ed24c 100644 --- a/spec/fixtures/ead/nlm/alphaomegaalpha.xml +++ b/spec/fixtures/ead/nlm/alphaomegaalpha.xml @@ -56,7 +56,9 @@ Compact digital disc 3 CDs + 1888 1894-1992 + 1903-1962 Collection materials primarily in English. @@ -392,6 +394,7 @@ MS C 271.I 1902-1976 1975-1976 + 1988

Administrative records include details materials directly related to the history and diff --git a/spec/lib/arclight/normalized_date_spec.rb b/spec/lib/arclight/normalized_date_spec.rb index b73f40334..197c8a9a0 100644 --- a/spec/lib/arclight/normalized_date_spec.rb +++ b/spec/lib/arclight/normalized_date_spec.rb @@ -3,22 +3,22 @@ require 'spec_helper' RSpec.describe Arclight::NormalizedDate do - subject(:normalized_date) { described_class.new(date_inclusive, date_bulk, date_other).to_s } + subject(:normalized_date) { described_class.new(unitdates, unitdate_labels).to_s } - let(:date_inclusive) { ['1990-2000'] } - let(:date_bulk) { '1999-2005' } - let(:date_other) { nil } + let(:unitdates) { ['1905', '1927-2000', '1982-1995'] } + let(:unitdate_labels) { '', 'inclusive', 'bulk' } context 'under normal conditions' do it 'joins dates' do - expect(normalized_date).to eq '1990-2000, bulk 1999-2005' + expect(normalized_date).to eq '1905, 1927-2000, bulk 1982-1995' end context 'multiple normalized dates' do - let(:date_inclusive) { %w[1990 1992] } + let(:unitdates) { %w[1990 1992] } + let(:unitdate_labels) { %w[inclusive inclusive] } it 'are joined w/ a comma' do - expect(normalized_date).to eq '1990, 1992, bulk 1999-2005' + expect(normalized_date).to eq '1990, 1992' end end end @@ -27,24 +27,25 @@ # NOTE: This test is the only place where the code that exercises this is routable # This has to be a multidimensional array, and the resulting XML nodes sent in are always flat context 'multiples' do - let(:date_inclusive) { [%w[1990-2000 2001-2002 2004]] } - let(:date_bulk) { '1990-2004' } + let(:unitdates) { [%w[1990-2000 2001-2002 2004 1990-2004]] } + let(:unitdate_labels) { [%w[inclusive inclusive INCLUSIVE bulk] } it 'uses compressed joined years' do - expect(normalized_date).to eq '1990-2002, 2004, bulk 1990-2004' + expect(normalized_date).to eq '1990-2000, 2001-2002, 2004, bulk 1990-2004' end end context 'undated' do - let(:date_bulk) { 'n.d.' } + let(:unitdates) { ['1905', '1927-2000', 'n.d.'] } it 'do not normalized term "undated"' do - expect(normalized_date).to eq '1990-2000, bulk n.d.' + expect(normalized_date).to eq '1905, 1927-2000, bulk n.d.' end end context 'circa' do - let(:date_bulk) { 'c.1995' } + let(:unitdates) { ['1990-2000', 'c.1995'] } + let(:unitdate_labels) { ['', 'bulk'] } it 'do not normalized term "circa"' do expect(normalized_date).to eq '1990-2000, bulk c.1995' @@ -52,34 +53,34 @@ end context 'no bulk' do - let(:date_bulk) { nil } + let(:unitdate_labels) { ['', 'inclusive', ''] } it 'uses inclusive date only' do - expect(normalized_date).to eq '1990-2000' + expect(normalized_date).to eq '1905, 1927-2000, 1982-1995' end end context 'no inclusive or bulk but other' do - let(:date_inclusive) { nil } - let(:date_bulk) { nil } - let(:date_other) { 'n.d.' } + let(:unitdates) { %w[1963 1954] } + let(:unitdate_labels) { ['', ''] } it 'uses other' do - expect(normalized_date).to eq 'n.d.' + expect(normalized_date).to eq '1963, 1954' end end context 'no inclusive but bulk' do - let(:date_inclusive) { nil } + let(:unitdates) { %w[1963 1954-1990] } + let(:unitdate_labels) { ['bulk', ''] } it 'does not know what to do' do - expect(normalized_date).to be_nil + expect(normalized_date).to eq 'bulk 1963, 1954-1990' end end context 'no information' do - let(:date_inclusive) { nil } - let(:date_bulk) { nil } + let(:unitdates) { nil } + let(:unitdate_labels) { nil } it 'does not know what to do' do expect(normalized_date).to be_nil From 43a819361b4c6a7acc974d5ceffde46e5b6ed76a Mon Sep 17 00:00:00 2001 From: Greg Wiedeman Date: Fri, 15 Dec 2023 14:57:01 -0500 Subject: [PATCH 3/3] tests should work now --- spec/lib/arclight/normalized_date_spec.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spec/lib/arclight/normalized_date_spec.rb b/spec/lib/arclight/normalized_date_spec.rb index 197c8a9a0..b424936d8 100644 --- a/spec/lib/arclight/normalized_date_spec.rb +++ b/spec/lib/arclight/normalized_date_spec.rb @@ -6,7 +6,7 @@ subject(:normalized_date) { described_class.new(unitdates, unitdate_labels).to_s } let(:unitdates) { ['1905', '1927-2000', '1982-1995'] } - let(:unitdate_labels) { '', 'inclusive', 'bulk' } + let(:unitdate_labels) { ['', 'inclusive', 'bulk'] } context 'under normal conditions' do it 'joins dates' do @@ -27,8 +27,8 @@ # NOTE: This test is the only place where the code that exercises this is routable # This has to be a multidimensional array, and the resulting XML nodes sent in are always flat context 'multiples' do - let(:unitdates) { [%w[1990-2000 2001-2002 2004 1990-2004]] } - let(:unitdate_labels) { [%w[inclusive inclusive INCLUSIVE bulk] } + let(:unitdates) { ['1990-2000', '2001-2002', '2004', '1990-2004'] } + let(:unitdate_labels) { ['inclusive', 'inclusive', 'INCLUSIVE', 'bulk'] } it 'uses compressed joined years' do expect(normalized_date).to eq '1990-2000, 2001-2002, 2004, bulk 1990-2004' @@ -43,12 +43,12 @@ end end - context 'circa' do + context 'circa and mixed case' do let(:unitdates) { ['1990-2000', 'c.1995'] } - let(:unitdate_labels) { ['', 'bulk'] } + let(:unitdate_labels) { ['', 'BuLk'] } it 'do not normalized term "circa"' do - expect(normalized_date).to eq '1990-2000, bulk c.1995' + expect(normalized_date).to eq '1990-2000, BuLk c.1995' end end @@ -83,7 +83,7 @@ let(:unitdate_labels) { nil } it 'does not know what to do' do - expect(normalized_date).to be_nil + expect(normalized_date).to eq '' end end end