Feature: Enhance SOLR integration and add a Schema API (#54)
* add an abstraction to the SOLR integration and add a Schema API

* add SOLR Schema API tests

* update SOLR backend configuration and init

* use the new Solr connector in the model search interface (see the usage sketch after this list)

* update search tests to cover the new automatic indexing and unindexing

* handle Solr container initialization when running Docker for tests

* add an omit_norms option to SolrSchemaGenerator

* fix the initial dynamic field declarations in the Solr schema and replace the usage of mapping-ISOLatin1Accent

* delay schema generation until after model declarations, or run it on demand

* add Solr edismax filter tests

* fix indexBatch and unindexBatch tests

* add security checks to the index and unindex functions

* change dynamic field names to reduce code migration

* update clear_all_schema to remove all copy and normal fields

* add an option to force Solr initialization when needed

* handle indexing embedded objects of a model

* add an index update option

* fix clear_all_schema to just remove all the fields and recreate them

* add an index_enabled? helper for models

* perform a status check when initializing the Solr connector

* extract an init_search_connection function from init_search_connections

* fix a typo in the indexOptimize call

* add Solr search using HTTP POST instead of GET for large queries
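For orientation, here is a minimal usage sketch of the model-level API this commit introduces. The Person model, its attribute, and the collection name are hypothetical; enable_indexing, init_search_connections, search, and the automatic after_save/after_destroy hooks come from the diff below.

    class Person < Goo::Base::Resource
      model :person, name_with: :name
      attribute :name, enforce: [:string]

      # registers the collection and wires after_save :index / after_destroy :unindex
      enable_indexing(:person_index)
    end

    Goo.init_search_connections   # create or verify the Solr collections

    person = Person.new
    person.name = 'Alice'
    person.save                   # indexed automatically via after_save
    Person.search('Alice')        # field naming depends on the generated dynamic fields
    person.delete                 # removed from the index via after_destroy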
syphax-bouazzouni authored Mar 2, 2024
1 parent 1be1c83 commit 6c51346
Showing 14 changed files with 1,252 additions and 81 deletions.
1 change: 1 addition & 0 deletions .ruby-version
@@ -0,0 +1 @@
2.7.8
9 changes: 2 additions & 7 deletions docker-compose.yml
@@ -10,15 +10,10 @@ services:
retries: 30

solr-ut:
image: ontoportal/solr-ut:0.0.2
image: solr:8.11.2
ports:
- 8983:8983
healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:8983/solr/term_search_core1/admin/ping?wt=json | grep -iq '\"status\":\"OK\"}' || exit 1"]
start_period: 10s
interval: 10s
timeout: 5s
retries: 5
command: bin/solr start -cloud -f

agraph-ut:
image: franzinc/agraph:v8.1.0
49 changes: 42 additions & 7 deletions lib/goo.rb
@@ -42,6 +42,7 @@ module Goo
@@model_by_name = {}
@@search_backends = {}
@@search_connection = {}
@@search_collections = {}
@@default_namespace = nil
@@id_prefix = nil
@@redis_client = nil
@@ -101,7 +102,7 @@ def self.language_includes(lang)
end

def self.add_namespace(shortcut, namespace,default=false)
if !(namespace.instance_of? RDF::Vocabulary)
unless namespace.instance_of? RDF::Vocabulary
raise ArgumentError, "Namespace must be a RDF::Vocabulary object"
end
@@namespaces[shortcut.to_sym] = namespace
@@ -252,11 +253,9 @@ def self.configure
raise ArgumentError, "Configuration needs to receive a code block"
end
yield self
configure_sanity_check()
configure_sanity_check

if @@search_backends.length > 0
@@search_backends.each { |name, val| @@search_connection[name] = RSolr.connect(url: search_conf(name), timeout: 1800, open_timeout: 1800) }
end
init_search_connections

@@namespaces.freeze
@@sparql_backends.freeze
@@ -280,8 +279,44 @@ def self.search_conf(name=:main)
return @@search_backends[name][:service]
end

def self.search_connection(name=:main)
return @@search_connection[name]
def self.search_connection(collection_name)
return search_client(collection_name).solr
end

def self.search_client(collection_name)
@@search_connection[collection_name]
end

def self.add_search_connection(collection_name, search_backend = :main, &block)
@@search_collections[collection_name] = {
search_backend: search_backend,
block: block_given? ? block : nil
}
end

def self.search_connections
@@search_connection
end

def self.init_search_connection(collection_name, search_backend = :main, block = nil, force: false)
return @@search_connection[collection_name] if @@search_connection[collection_name] && !force

@@search_connection[collection_name] = SOLR::SolrConnector.new(search_conf(search_backend), collection_name)
if block
block.call(@@search_connection[collection_name].schema_generator)
@@search_connection[collection_name].enable_custom_schema
end
@@search_connection[collection_name].init(force)
@@search_connection[collection_name]
end


def self.init_search_connections(force=false)
@@search_collections.each do |collection_name, backend|
search_backend = backend[:search_backend]
block = backend[:block]
init_search_connection(collection_name, search_backend, block, force: force)
end
end

def self.sparql_query_client(name=:main)
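A quick sketch of the new connection-management API in lib/goo.rb. The :article collection name is hypothetical; all methods shown are taken from the hunk above.

    Goo.add_search_connection(:article)      # register a collection on the :main backend
    Goo.init_search_connections              # instantiate a SolrConnector per registered collection

    client = Goo.search_client(:article)     # the SOLR::SolrConnector wrapper
    raw    = Goo.search_connection(:article) # the underlying RSolr client (connector.solr)

    # re-create the collection, e.g. after a schema change
    Goo.init_search_connection(:article, force: true)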
16 changes: 11 additions & 5 deletions lib/goo/base/settings/attribute.rb
@@ -158,11 +158,6 @@ def list?(attr)
attribute_settings(attr)[:enforce].include?(:list)
end

def index_attribute?(attr)
return false if attribute_settings(attr).nil?
attribute_settings(attr)[:index]
end

def transitive?(attr)
return false unless @model_settings[:attributes].include?(attr)
attribute_settings(attr)[:transitive] == true
@@ -212,6 +207,17 @@ def attribute_uri(attr, *args)
Goo.vocabulary(nil)[attr]
end


def indexable?(attr)
setting = attribute_settings(attr.to_sym)
setting && (setting[:index].nil? || setting[:index] == true)
end

def fuzzy_searchable?(attr)
attribute_settings(attr)[:fuzzy_search] == true
end


private

def set_no_list_by_default(options)
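The two new predicates read per-attribute options: an attribute is indexable unless it was declared with index: false, and fuzzy search is opt-in via fuzzy_search: true. A sketch, assuming a hypothetical model and option values (the predicates themselves are from the hunk above):

    class Article < Goo::Base::Resource
      model :article, name_with: :title
      attribute :title,    enforce: [:string], fuzzy_search: true
      attribute :internal, enforce: [:string], index: false
    end

    Article.indexable?(:title)        # => true  (no :index option defaults to indexed)
    Article.indexable?(:internal)     # => false
    Article.fuzzy_searchable?(:title) # => true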
2 changes: 1 addition & 1 deletion lib/goo/config/config.rb
@@ -20,7 +20,7 @@ def config(&block)
@settings.goo_path_query ||= ENV['GOO_PATH_QUERY'] || '/sparql/'
@settings.goo_path_data ||= ENV['GOO_PATH_DATA'] || '/data/'
@settings.goo_path_update ||= ENV['GOO_PATH_UPDATE'] || '/update/'
@settings.search_server_url ||= ENV['SEARCH_SERVER_URL'] || 'http://localhost:8983/solr/term_search_core1'
@settings.search_server_url ||= ENV['SEARCH_SERVER_URL'] || 'http://localhost:8983/solr'
@settings.redis_host ||= ENV['REDIS_HOST'] || 'localhost'
@settings.redis_port ||= ENV['REDIS_PORT'] || 6379
@settings.bioportal_namespace ||= ENV['BIOPORTAL_NAMESPACE'] || 'http://data.bioontology.org/'
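Note the default URL now points at the Solr root rather than a single core, since collections are created per connection. Per the diff above, the default can still be overridden through the environment before Goo's config runs (the host below is an example):

    ENV['SEARCH_SERVER_URL'] = 'http://solr.example.org:8983/solr'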
194 changes: 138 additions & 56 deletions lib/goo/search/search.rb
@@ -1,4 +1,5 @@
require 'rsolr'
require_relative 'solr/solr_connector'

module Goo

@@ -8,102 +9,183 @@ def self.included(base)
base.extend(ClassMethods)
end

def index(connection_name=:main)
def index(connection_name = nil, to_set = nil)
raise ArgumentError, "ID must be set to be able to index" if @id.nil?
doc = indexable_object
Goo.search_connection(connection_name).add(doc)
document = indexable_object(to_set)

return if document.blank? || document[:id].blank?

connection_name ||= self.class.search_collection_name
unindex(connection_name)
self.class.search_client(connection_name).index_document(document)
end

def index_update(to_set, connection_name=:main)
def index_update(attributes_to_update, connection_name = nil, to_set = nil)
raise ArgumentError, "ID must be set to be able to index" if @id.nil?
raise ArgumentError, "Field names to be updated in index must be provided" if to_set.nil?
raise ArgumentError, "Field names to be updated in index must be provided" if attributes_to_update.blank?

old_doc = self.class.search("id:\"#{index_id}\"").dig("response", "docs")&.first

raise ArgumentError, "ID must be set to be able to index" if old_doc.blank?

doc = indexable_object(to_set)

doc.each { |key, val|
next if key === :id
doc[key] = {set: val}
}
doc.each do |key, val|
next unless attributes_to_update.any? { |attr| key.to_s.eql?(attr.to_s) || key.to_s.include?("#{attr}_") }
old_doc[key] = val
end

connection_name ||= self.class.search_collection_name
unindex(connection_name)

Goo.search_connection(connection_name).update(
data: "[#{doc.to_json}]",
headers: { 'Content-Type' => 'application/json' }
)
old_doc.reject! { |k, v| k.to_s.end_with?('_sort') || k.to_s.end_with?('_sorts') }
old_doc.delete("_version_")
self.class.search_client(connection_name).index_document(old_doc)
end

def unindex(connection_name=:main)
id = index_id
Goo.search_connection(connection_name).delete_by_id(id)
def unindex(connection_name = nil)
connection_name ||= self.class.search_collection_name
self.class.search_client(connection_name).delete_by_id(index_id)
end

# default implementation, should be overridden by child class
def index_id()
def index_id
raise ArgumentError, "ID must be set to be able to index" if @id.nil?
@id.to_s
end

# default implementation, should be overridden by child class
def index_doc(to_set=nil)
def index_doc(to_set = nil)
raise NoMethodError, "You must define method index_doc in your class for it to be indexable"
end

def indexable_object(to_set=nil)
doc = index_doc(to_set)
# use resource_id for the actual term id because :id is a Solr reserved field
doc[:resource_id] = doc[:id].to_s
doc[:id] = index_id.to_s
doc
def embedded_doc
raise NoMethodError, "You must define method embedded_doc in your class for it to be indexable"
end

def indexable_object(to_set = nil)
begin
document = index_doc(to_set)
rescue NoMethodError
document = self.to_hash.reject { |k, _| !self.class.indexable?(k) }
document.transform_values! do |v|
is_array = v.is_a?(Array)
v = Array(v).map do |x|
if x.is_a?(Goo::Base::Resource)
x.embedded_doc rescue x.id.to_s
else
if x.is_a?(RDF::URI)
x.to_s
else
x.respond_to?(:object) ? x.object : x
end
end
end
is_array ? v : v.first
end
end

document = document.reduce({}) do |h, (k, v)|
if v.is_a?(Hash)
v.each { |k2, v2| h["#{k}_#{k2}".to_sym] = v2 }
else
h[k] = v
end
h
end

model_name = self.class.model_name.to_s.downcase
document.delete(:id)
document.delete("id")

document.transform_keys! do |k|
self.class.index_document_attr(k)
end

document[:resource_id] = self.id.to_s
document[:resource_model] = model_name
document[:id] = index_id.to_s
document
end

module ClassMethods

def search(q, params={}, connection_name=:main)
params["q"] = q
Goo.search_connection(connection_name).post('select', :data => params)
def index_enabled?
!@model_settings[:search_collection].nil?
end

def indexBatch(collection, connection_name=:main)
docs = Array.new
collection.each do |c|
docs << c.indexable_object
def enable_indexing(collection_name, search_backend = :main, &block)
@model_settings[:search_collection] = collection_name

if block_given?
# optional block to generate custom schema
Goo.add_search_connection(collection_name, search_backend, &block)
else
Goo.add_search_connection(collection_name, search_backend)
end
Goo.search_connection(connection_name).add(docs)

after_save :index
after_destroy :unindex
end

def unindexBatch(collection, connection_name=:main)
docs = Array.new
collection.each do |c|
docs << c.index_id
end
Goo.search_connection(connection_name).delete_by_id(docs)
def search_collection_name
@model_settings[:search_collection]
end

def search_client(connection_name = search_collection_name)
Goo.search_client(connection_name)
end

def custom_schema?(connection_name = search_collection_name)
search_client(connection_name).custom_schema?
end

def schema_generator
Goo.search_client(search_collection_name).schema_generator
end

def index_document_attr(key)
return key.to_s if custom_schema? || self.attribute_settings(key).nil?

type = self.datatype(key)
is_list = self.list?(key)
fuzzy = self.fuzzy_searchable?(key)
search_client.index_document_attr(key, type, is_list, fuzzy)
end

def search(q, params = {}, connection_name = search_collection_name)
search_client(connection_name).search(q, params)
end

def submit_search_query(query, params = {}, connection_name = search_collection_name)
search_client(connection_name).submit_search_query(query, params)
end

def indexBatch(collection, connection_name = search_collection_name)
docs = collection.map(&:indexable_object)
search_client(connection_name).index_document(docs)
end

def unindexByQuery(query, connection_name=:main)
Goo.search_connection(connection_name).delete_by_query(query)
def unindexBatch(collection, connection_name = search_collection_name)
docs = collection.map(&:index_id)
search_client(connection_name).delete_by_id(docs)
end

# Get the doc that will be indexed in solr
def get_indexable_object()
# To make the code less readable the guys that wrote it managed to hide the real function called by this line
# It is "get_index_doc" in ontologies_linked_data Class.rb
doc = self.class.model_settings[:search_options][:document].call(self)
doc[:resource_id] = doc[:id].to_s
doc[:id] = get_index_id.to_s
# id: clsUri_ONTO-ACRO_submissionNumber. i.e.: http://lod.nal.usda.gov/nalt/5260_NALT_4
doc
def unindexByQuery(query, connection_name = search_collection_name)
search_client(connection_name).delete_by_query(query)
end

def indexCommit(attrs=nil, connection_name=:main)
Goo.search_connection(connection_name).commit(:commit_attributes => attrs || {})
def indexCommit(attrs = nil, connection_name = search_collection_name)
search_client(connection_name).index_commit(attrs)
end

def indexOptimize(attrs=nil, connection_name=:main)
Goo.search_connection(connection_name).optimize(:optimize_attributes => attrs || {})
def indexOptimize(attrs = nil, connection_name = search_collection_name)
search_client(connection_name).index_optimize(attrs)
end

def indexClear(connection_name=:main)
# WARNING: this deletes ALL data from the index
unindexByQuery("*:*", connection_name)
# WARNING: this deletes ALL data from the index
def indexClear(connection_name = search_collection_name)
search_client(connection_name).clear_all_data
end
end
end
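Putting it together, a sketch of the reworked instance and batch API. Article is the hypothetical model from above; the query syntax is abbreviated and the unindexByQuery field is an assumption.

    article = Article.new
    article.title = 'On Solr Schemas'
    article.save                           # after_save triggers #index

    article.index_update([:title])         # re-index only the title-derived fields
    article.unindex                        # remove this document from the index

    Article.indexBatch(Article.where.all)  # bulk add
    Article.indexCommit                    # commit pending changes
    Article.unindexByQuery('status:draft') # delete matching documents (hypothetical field)
    Article.indexClear                     # WARNING: deletes ALL data from the index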