Skip to content

Commit

Permalink
Refactor SolrIndexer (#11)
Browse files Browse the repository at this point in the history
  • Loading branch information
danschmidt5189 authored Dec 22, 2023
1 parent d0e0808 commit 91b207a
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 105 deletions.
9 changes: 3 additions & 6 deletions lib/gingr/cli.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class Cli < Thor
option :geoserver_url
option :geoserver_secure_url
def watch(root_dir = nil)
root_dir ||= ENV['GINGR_WATCH_DIRECTORY'] || '/opt/app/data/gingr'
watcher = Gingr::Watcher.new(root_dir, options)
watcher.start!
end
Expand All @@ -45,12 +44,10 @@ def watch(root_dir = nil)
option :geoserver_secure_url
option :update_reference_field, type: :boolean, default: false
option :solr_url
def solr(dir_path)
def solr(directory)
reference_urls = ImportUtil.get_reference_urls(options)
solr_url = options[:solr_url] || ENV.fetch('SOLR_URL', Config.default_options[:solr_url])
ImportUtil.index_solr_from_dir(dir_path, solr_url, reference_urls)
txt = "all json files under '#{dir_path}' and subdirectories have been indexed to solr #{solr_url} successfully"
logger.info(txt)
solr = Gingr::SolrIndexer.new(options[:solr_url], reference_urls)
solr.index_directory(directory)
end

desc 'geoserver', 'publish a giving shapefile or GeoTIFF file to a geoserver'
Expand Down
26 changes: 6 additions & 20 deletions lib/gingr/import_util.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,28 +18,14 @@ def publish_geoservers(geofile_names, options)
publish_geoserver_files(geofile_names[:ucb], options[:geoserver_secure_url], false)
end

def index_solr_from_dir(directory_path, url, reference_urls)
indexer = SolrIndexer.new(url, reference_urls)
Find.find(directory_path) do |path|
next unless File.extname(path).downcase == '.json'

indexer.update(path)
rescue RSolr::Error::Http => e
logger.error("Solr index error: #{e.response}")
raise
end
indexer.commit
end

def get_reference_urls(options)
update_reference_field = options[:update_reference_field]
return {} unless update_reference_field

hash = {}
Config.reference_urls.each_key do |key|
hash[key] = reference_url(key, options)
{}.tap do |refs|
if options[:update_reference_field]
Config.reference_urls.each_key do |key|
refs[key] = reference_url(key, options)
end
end
end
hash
end

def root_path
Expand Down
51 changes: 23 additions & 28 deletions lib/gingr/solr_indexer.rb
Original file line number Diff line number Diff line change
@@ -1,49 +1,44 @@
# frozen_string_literal: true
require 'faraday/net_http_persistent'
require 'find'
require 'rsolr'
require_relative 'config'

module Gingr
class SolrIndexer
include Config

attr_reader :reference_urls
attr_reader :solr
attr_accessor :reference_urls
attr_accessor :solr

def initialize(url, reference_urls = {})
@solr = RSolr.connect url:, adapter: :net_http_persistent
@reference_urls = reference_urls
# Builds an indexer around a Solr client.
#
# @param solr [Object, String, nil] an already-built client, or a Solr URL.
#   When nil, the URL is taken from ENV['SOLR_URL'], falling back to
#   Gingr::Config.default_options[:solr_url].
# @param reference_urls [Hash, nil] replacement reference URLs; nil is
#   stored as an empty Hash.
def initialize(solr = nil, reference_urls = nil)
  client = solr || ENV['SOLR_URL'] || Gingr::Config.default_options[:solr_url]
  # Only a String is treated as a URL to connect to; any other object is
  # used as the client as-is.
  client = RSolr.connect(url: client, adapter: :net_http_persistent) if client.is_a?(String)
  @solr = client
  @reference_urls = reference_urls || {}
end

def update_reference_urls?
!@reference_urls.empty?
# Adds one document, or a list of documents, to Solr after rewriting the
# reference URLs of each record.
#
# @param doc [String, Hash, Array<Hash>] a path to a JSON file, a single
#   document, or a list of documents
# @return [Object] whatever the Solr client's #add returns
def add(doc)
  # A String is treated as a path to a JSON file, not as raw JSON text.
  doc = JSON.load_file(doc) if doc.is_a?(String)
  # A JSON file may hold a single record or a list of records. Rewrite each
  # record on its own, because the URL rewrite indexes the record with a
  # String key and would raise TypeError if handed an Array.
  [doc].flatten.each { |record| update_reference_urls!(record) }
  @solr.add doc
end

def update(file_path)
commit_within = ENV.fetch('SOLR_COMMIT_WITHIN', 5000).to_i
doc = JSON.parse(File.read(file_path))
[doc].flatten.each do |record|
update_reference_urls!(record) if update_reference_urls?
@solr.update params: { commitWithin: commit_within, overwrite: true },
data: [record].to_json,
headers: { 'Content-Type' => 'application/json' }
end
end

def commit
@solr.commit
# Walks +directory+ recursively and adds every path accepted by
# #json_file? to Solr via #add.
#
# NOTE(review): no explicit commit is issued here; document visibility
# presumably relies on Solr-side commit settings -- confirm.
#
# @param directory [String] root directory to search
# @return [Array<String>] the paths that were passed to #add
def index_directory(directory)
  json_paths = Find.find(directory).select { |path| json_file?(path) }
  json_paths.each { |path| add(path) }
end

private

def update_reference_urls!(record)
references = record['dct_references_s']

Config.reference_urls.each do |name, from_url|
def update_reference_urls!(doc)
Gingr::Config.reference_urls.each do |name, from_url|
to_url = @reference_urls[name]
references = references.gsub(from_url, to_url) if to_url
doc['dct_references_s'].gsub!(from_url, to_url) if to_url
end
record['dct_references_s'] = references
end

# Whether +filepath+ has a ".json" extension, compared case-insensitively.
# Only the name is inspected; the path is not checked on disk.
def json_file?(filepath)
  extension = File.extname(filepath)
  extension.casecmp?('.json')
end
end
end
4 changes: 2 additions & 2 deletions lib/gingr/watcher.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ class Watcher
attr_reader :options
attr_reader :root_dir

def initialize(root_dir, options = {})
def initialize(root_dir = nil, options = {})
# This is the Gingr root directory, not the directory to be watched.
# Watcher watches the ./ready directory under this one.
@root_dir = root_dir
@root_dir = root_dir || ENV['GINGR_WATCH_DIRECTORY'] || '/opt/app/data/gingr'

# Options are passed as-is to `gingr all`, so they should match the
# arguments you'd otherwise pass to that command
Expand Down
114 changes: 65 additions & 49 deletions spec/solr_indexer_spec.rb
Original file line number Diff line number Diff line change
@@ -1,74 +1,90 @@
# frozen_string_literal: true
require 'spec_helper'
require 'find'
require 'gingr/solr_indexer'

RSpec.describe Gingr::SolrIndexer do
let(:url) { 'http://solr:8983/solr/geodata-test' }
let(:solr) { instance_double(RSolr::Client) }
around(:each) do |test|
original_solr_url = ENV['SOLR_URL']
test.run
ensure
ENV['SOLR_URL'] = original_solr_url
end

describe '#initialize' do
before do
allow(RSolr).to receive(:connect).and_return(solr)
it 'initializes a solr client with the given url' do
indexer = Gingr::SolrIndexer.new('http://solr-from-init/')
expect(indexer.solr.uri.to_s).to eq 'http://solr-from-init/'
end

it 'falls back to ENV["SOLR_URL"] if it is set' do
ENV['SOLR_URL'] = 'http://solr-from-env/'
indexer = Gingr::SolrIndexer.new
expect(indexer.solr.uri.to_s).to eq 'http://solr-from-env/'
end

it 'should initialize' do
described_class.new(url)
expect(RSolr).to have_received(:connect).with(
url:,
adapter: :net_http_persistent
)
it 'falls back to the config' do
ENV.delete 'SOLR_URL'
indexer = Gingr::SolrIndexer.new
expect(indexer.solr.uri.to_s).to eq 'http://solr:8983/solr/geodata-test/'
end
end

describe '#update' do
let(:file_path) { 'spec/fixture/jsonfile/berkeley_public_pdf.json' }
let(:doc) { JSON.parse(File.read(file_path)) }
describe '#update_reference_urls!' do
let(:document) { JSON.load_file('spec/fixture/jsonfile/berkeley_public_pdf.json') }

before do
allow(solr).to receive(:update)
allow(RSolr).to receive(:connect).and_return(solr)
solr_indexer.update(file_path)
it 'does nothing if reference_urls are nil' do
indexer = Gingr::SolrIndexer.new
expect { indexer.update_reference_urls! document }.not_to change { document }
end

context 'update reference urls' do
let(:solr_indexer) { described_class.new(url, reference_urls) }
let(:reference_urls) do
{ 'geoserver_secure' => 'http://fake_geoserver_secure:8081',
'geoserver' => 'http://fake_geoserver:8080',
'download' => 'https://fake_spatial.lib.berkeley.edu' }
end
it 'updates references if configured to do so' do
refs = { geoserver_url: 'http://geoserver-at-init/' }
indexer = Gingr::SolrIndexer.new(nil, refs)
expect { indexer.update_reference_urls! document }.to change { document }
expect(document['dct_references_s']).to match 'http://geoserver-at-init/'
end
end

it 'should call solr' do
expect(solr).to have_received(:update).with(
params: { commitWithin: 5000, overwrite: true },
data: [[doc].flatten[0]].to_json,
headers: { 'Content-Type' => 'application/json' }
)
end
describe '#index_directory' do
it 'adds all .json files to solr' do
files = ['foo.xml', 'bar.json', 'baz.json'].shuffle
expect(Find).to receive(:find).with('directory').and_return files
Gingr::SolrIndexer.any_instance.stub(:add)

it 'should call the update reference field method' do
expect(solr_indexer.update_reference_urls?).to eq(true)
end
indexer = Gingr::SolrIndexer.new
indexer.index_directory('directory')
expect(indexer).to have_received(:add).with('bar.json')
expect(indexer).to have_received(:add).with('baz.json')
expect(indexer).not_to have_received(:add).with('foo.xml')
end
end

context 'not update reference urls' do
let(:solr_indexer) { described_class.new(url) }
it 'should not call the update reference field method' do
solr_indexer.update(file_path)
expect(solr_indexer.update_reference_urls?).to eq(false)
end
describe '#add' do
let(:document) { JSON.load_file document_path }
let(:document_path) { 'spec/fixture/jsonfile/berkeley_public_pdf.json' }

it 'passes documents to the rsolr client' do
solr = spy(RSolr::Client)
indexer = Gingr::SolrIndexer.new(solr)
indexer.add(document)
expect(solr).to have_received(:add).with(document)
end
end

describe '#commit' do
before do
allow(RSolr).to receive(:connect).and_return(solr)
allow(solr).to receive(:commit)
it 'automatically loads filepaths as JSON' do
solr = spy(RSolr::Client)
indexer = Gingr::SolrIndexer.new(solr)
indexer.add(document_path)
expect(solr).to have_received(:add).with(document)
end

it 'should initialize' do
solr_indexer = described_class.new(url)
solr_indexer.commit
expect(solr_indexer.solr).to have_received(:commit)
it 'modifies reference urls' do
solr = spy(RSolr::Client)
refs = { geoserver_url: 'http://geoserver-from-init/' }
indexer = Gingr::SolrIndexer.new(solr, refs)
expect { indexer.add(document) }.to change { document }
expect(document['dct_references_s']).to match('http://geoserver-from-init/')
expect(solr).to have_received(:add).with(document)
end
end
end

0 comments on commit 91b207a

Please sign in to comment.