From 91b207a1a50613676b39599799107e3e35b758b1 Mon Sep 17 00:00:00 2001 From: Dan Schmidt Date: Thu, 21 Dec 2023 22:59:02 -0800 Subject: [PATCH] Refactor SolrIndexer (#11) --- lib/gingr/cli.rb | 9 +-- lib/gingr/import_util.rb | 26 ++------- lib/gingr/solr_indexer.rb | 51 ++++++++--------- lib/gingr/watcher.rb | 4 +- spec/solr_indexer_spec.rb | 114 ++++++++++++++++++++++---------------- 5 files changed, 99 insertions(+), 105 deletions(-) diff --git a/lib/gingr/cli.rb b/lib/gingr/cli.rb index d046389..5ff147e 100644 --- a/lib/gingr/cli.rb +++ b/lib/gingr/cli.rb @@ -26,7 +26,6 @@ class Cli < Thor option :geoserver_url option :geoserver_secure_url def watch(root_dir = nil) - root_dir ||= ENV['GINGR_WATCH_DIRECTORY'] || '/opt/app/data/gingr' watcher = Gingr::Watcher.new(root_dir, options) watcher.start! end @@ -45,12 +44,10 @@ def watch(root_dir = nil) option :geoserver_secure_url option :update_reference_field, type: :boolean, default: false option :solr_url - def solr(dir_path) + def solr(directory) reference_urls = ImportUtil.get_reference_urls(options) - solr_url = options[:solr_url] || ENV.fetch('SOLR_URL', Config.default_options[:solr_url]) - ImportUtil.index_solr_from_dir(dir_path, solr_url, reference_urls) - txt = "all json files under '#{dir_path}' and subdirectories have been indexed to solr #{solr_url} successfully" - logger.info(txt) + solr = Gingr::SolrIndexer.new(options[:solr_url], reference_urls) + solr.index_directory(directory) end desc 'geoserver', 'publish a giving shapefile or GeoTIFF file to a geoserver' diff --git a/lib/gingr/import_util.rb b/lib/gingr/import_util.rb index b433aa3..980fe74 100644 --- a/lib/gingr/import_util.rb +++ b/lib/gingr/import_util.rb @@ -18,28 +18,14 @@ def publish_geoservers(geofile_names, options) publish_geoserver_files(geofile_names[:ucb], options[:geoserver_secure_url], false) end - def index_solr_from_dir(directory_path, url, reference_urls) - indexer = SolrIndexer.new(url, reference_urls) - Find.find(directory_path) do |path| - next unless File.extname(path).downcase == '.json' - - indexer.update(path) - rescue RSolr::Error::Http => e - logger.error("Solr index error: #{e.response}") - raise - end - indexer.commit - end - def get_reference_urls(options) - update_reference_field = options[:update_reference_field] - return {} unless update_reference_field - - hash = {} - Config.reference_urls.each_key do |key| - hash[key] = reference_url(key, options) + {}.tap do |refs| + if options[:update_reference_field] + Config.reference_urls.each_key do |key| + refs[key] = reference_url(key, options) + end + end end - hash end def root_path diff --git a/lib/gingr/solr_indexer.rb b/lib/gingr/solr_indexer.rb index 3a36fd7..9d9a97c 100644 --- a/lib/gingr/solr_indexer.rb +++ b/lib/gingr/solr_indexer.rb @@ -1,5 +1,6 @@ # frozen_string_literal: true require 'faraday/net_http_persistent' +require 'find' require 'rsolr' require_relative 'config' @@ -7,43 +8,37 @@ module Gingr class SolrIndexer include Config - attr_reader :reference_urls - attr_reader :solr + attr_accessor :reference_urls + attr_accessor :solr - def initialize(url, reference_urls = {}) - @solr = RSolr.connect url:, adapter: :net_http_persistent - @reference_urls = reference_urls + def initialize(solr = nil, reference_urls = nil) + solr ||= ENV['SOLR_URL'] || Gingr::Config.default_options[:solr_url] + solr = RSolr.connect url: solr, adapter: :net_http_persistent if solr.kind_of? String + @solr = solr + @reference_urls = reference_urls || {} end - def update_reference_urls? - !@reference_urls.empty? + def add(doc) + doc = JSON.load_file(doc) if doc.kind_of? String + update_reference_urls!(doc) + @solr.add doc end - def update(file_path) - commit_within = ENV.fetch('SOLR_COMMIT_WITHIN', 5000).to_i - doc = JSON.parse(File.read(file_path)) - [doc].flatten.each do |record| - update_reference_urls!(record) if update_reference_urls? - @solr.update params: { commitWithin: commit_within, overwrite: true }, - data: [record].to_json, - headers: { 'Content-Type' => 'application/json' } - end - end - - def commit - @solr.commit + def index_directory(directory) + Find.find(directory) + .select(&method(:json_file?)) + .each(&method(:add)) end - private - - def update_reference_urls!(record) - references = record['dct_references_s'] - - Config.reference_urls.each do |name, from_url| + def update_reference_urls!(doc) + Gingr::Config.reference_urls.each do |name, from_url| to_url = @reference_urls[name] - references = references.gsub(from_url, to_url) if to_url + doc['dct_references_s'].gsub!(from_url, to_url) if to_url end - record['dct_references_s'] = references + end + + def json_file?(filepath) + File.extname(filepath).casecmp?('.json') end end end diff --git a/lib/gingr/watcher.rb b/lib/gingr/watcher.rb index 5255ad4..18d9534 100644 --- a/lib/gingr/watcher.rb +++ b/lib/gingr/watcher.rb @@ -24,10 +24,10 @@ class Watcher attr_reader :options attr_reader :root_dir - def initialize(root_dir, options = {}) + def initialize(root_dir = nil, options = {}) # This is the Gingr root directory, not the directory to be watched. # Watcher watches the ./ready directory under this one. - @root_dir = root_dir + @root_dir = root_dir || ENV['GINGR_WATCH_DIRECTORY'] || '/opt/app/data/gingr' # Options are passed as-is to `gingr all`, so they should match the # arguments you'd otherwise pass to that command diff --git a/spec/solr_indexer_spec.rb b/spec/solr_indexer_spec.rb index 7946a6b..e9a97fa 100644 --- a/spec/solr_indexer_spec.rb +++ b/spec/solr_indexer_spec.rb @@ -1,74 +1,90 @@ # frozen_string_literal: true require 'spec_helper' +require 'find' +require 'gingr/solr_indexer' RSpec.describe Gingr::SolrIndexer do - let(:url) { 'http://solr:8983/solr/geodata-test' } - let(:solr) { instance_double(RSolr::Client) } + around(:each) do |test| + original_solr_url = ENV['SOLR_URL'] + test.run + ensure + ENV['SOLR_URL'] = original_solr_url + end describe '#initialize' do - before do - allow(RSolr).to receive(:connect).and_return(solr) + it 'initializes a solr client with the given url' do + indexer = Gingr::SolrIndexer.new('http://solr-from-init/') + expect(indexer.solr.uri.to_s).to eq 'http://solr-from-init/' + end + + it 'falls back to ENV["SOLR_URL"] if it is set' do + ENV['SOLR_URL'] = 'http://solr-from-env/' + indexer = Gingr::SolrIndexer.new + expect(indexer.solr.uri.to_s).to eq 'http://solr-from-env/' end - it 'should initialize' do - described_class.new(url) - expect(RSolr).to have_received(:connect).with( - url:, - adapter: :net_http_persistent - ) + it 'falls back to the config' do + ENV.delete 'SOLR_URL' + indexer = Gingr::SolrIndexer.new + expect(indexer.solr.uri.to_s).to eq 'http://solr:8983/solr/geodata-test/' end end - describe '#update' do - let(:file_path) { 'spec/fixture/jsonfile/berkeley_public_pdf.json' } - let(:doc) { JSON.parse(File.read(file_path)) } + describe '#update_reference_urls!' do + let(:document) { JSON.load_file('spec/fixture/jsonfile/berkeley_public_pdf.json') } - before do - allow(solr).to receive(:update) - allow(RSolr).to receive(:connect).and_return(solr) - solr_indexer.update(file_path) + it 'does nothing if reference_urls are nil' do + indexer = Gingr::SolrIndexer.new + expect { indexer.update_reference_urls! document }.not_to change { document } end - context 'update reference urls' do - let(:solr_indexer) { described_class.new(url, reference_urls) } - let(:reference_urls) do - { 'geoserver_secure' => 'http://fake_geoserver_secure:8081', - 'geoserver' => 'http://fake_geoserver:8080', - 'download' => 'https://fake_spatial.lib.berkeley.edu' } - end + it 'updates references if configured to do so' do + refs = { geoserver_url: 'http://geoserver-at-init/' } + indexer = Gingr::SolrIndexer.new(nil, refs) + expect { indexer.update_reference_urls! document }.to change { document } + expect(document['dct_references_s']).to match 'http://geoserver-at-init/' + end + end - it 'should call solr' do - expect(solr).to have_received(:update).with( - params: { commitWithin: 5000, overwrite: true }, - data: [[doc].flatten[0]].to_json, - headers: { 'Content-Type' => 'application/json' } - ) - end + describe '#index_directory' do + it 'adds all .json files to solr' do + files = ['foo.xml', 'bar.json', 'baz.json'].shuffle + expect(Find).to receive(:find).with('directory').and_return files + Gingr::SolrIndexer.any_instance.stub(:add) - it 'should call the update reference field method' do - expect(solr_indexer.update_reference_urls?).to eq(true) - end + indexer = Gingr::SolrIndexer.new + indexer.index_directory('directory') + expect(indexer).to have_received(:add).with('bar.json') + expect(indexer).to have_received(:add).with('baz.json') + expect(indexer).not_to have_received(:add).with('foo.xml') end + end - context 'not update reference urls' do - let(:solr_indexer) { described_class.new(url) } - it 'should not call the update reference field method' do - solr_indexer.update(file_path) - expect(solr_indexer.update_reference_urls?).to eq(false) - end + describe '#add' do + let(:document) { JSON.load_file document_path } + let(:document_path) { 'spec/fixture/jsonfile/berkeley_public_pdf.json' } + + it 'passes documents to the rsolr client' do + solr = spy(RSolr::Client) + indexer = Gingr::SolrIndexer.new(solr) + indexer.add(document) + expect(solr).to have_received(:add).with(document) end - end - describe '#commit' do - before do - allow(RSolr).to receive(:connect).and_return(solr) - allow(solr).to receive(:commit) + it 'automatically loads filepaths as JSON' do + solr = spy(RSolr::Client) + indexer = Gingr::SolrIndexer.new(solr) + indexer.add(document_path) + expect(solr).to have_received(:add).with(document) end - it 'should initialize' do - solr_indexer = described_class.new(url) - solr_indexer.commit - expect(solr_indexer.solr).to have_received(:commit) + it 'modifies reference urls' do + solr = spy(RSolr::Client) + refs = { geoserver_url: 'http://geoserver-from-init/' } + indexer = Gingr::SolrIndexer.new(solr, refs) + expect { indexer.add(document) }.to change { document } + expect(document['dct_references_s']).to match('http://geoserver-from-init/') + expect(solr).to have_received(:add).with(document) end end end