Feature: Enhance SOLR integration and add a Schema API (#54)
* add an abstraction to the SOLR integration and add a Schema API

* add SOLR Schema API tests

* update SOLR backend configuration and init

* use the new Solr connector in the model search interface (see the usage sketch after this list)

* update search tests to cover the new automatic indexing and unindexing

* handle Solr container initialization when running Docker for tests

* add an omit_norms option to SolrSchemaGenerator

* fix the initial dynamic field declarations in the Solr schema and replace the usage of mapping-ISOLatin1Accent

* delay schema generation until after model declarations, or run it on demand

* add Solr edismax filter tests

* fix indexBatch and unindexBatch tests

* add security checks to the index and unindex functions

* change dynamic field names to reduce code migration

* update clear_all_schema to remove all copy and normal fields

* add an option to force Solr initialization when needed

* handle indexing embedded objects of a model

* add an index update option

* fix clear_all_schema to just remove all the fields and recreate them

* add an index_enabled? helper for models

* perform a status check when initializing the Solr connector

* extract an init_search_connection function from init_search_connections

* fix a typo in the indexOptimize call

* add Solr search using HTTP POST instead of GET for large queries
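For orientation, here is a minimal usage sketch of the model-level API this commit introduces. The Person model, its attribute, and the collection name are hypothetical; enable_indexing, init_search_connections, search, and the automatic after_save/after_destroy hooks come from the diff below.

    class Person < Goo::Base::Resource
      model :person, name_with: :name
      attribute :name, enforce: [:string]

      # registers the collection and wires after_save :index / after_destroy :unindex
      enable_indexing(:person_index)
    end

    Goo.init_search_connections   # create or verify the Solr collections

    person = Person.new
    person.name = 'Alice'
    person.save                   # indexed automatically via after_save
    Person.search('Alice')        # field naming depends on the generated dynamic fields
    person.delete                 # removed from the index via after_destroy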
syphax-bouazzouni authored Mar 2, 2024
1 parent 1be1c83 commit 6c51346
Showing 14 changed files with 1,252 additions and 81 deletions.
1 change: 1 addition & 0 deletions .ruby-version
@@ -0,0 +1 @@
2.7.8
9 changes: 2 additions & 7 deletions docker-compose.yml
@@ -10,15 +10,10 @@ services:
retries: 30

solr-ut:
image: ontoportal/solr-ut:0.0.2
image: solr:8.11.2
ports:
- 8983:8983
healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:8983/solr/term_search_core1/admin/ping?wt=json | grep -iq '\"status\":\"OK\"}' || exit 1"]
start_period: 10s
interval: 10s
timeout: 5s
retries: 5
command: bin/solr start -cloud -f

agraph-ut:
image: franzinc/agraph:v8.1.0
49 changes: 42 additions & 7 deletions lib/goo.rb
@@ -42,6 +42,7 @@ module Goo
@@model_by_name = {}
@@search_backends = {}
@@search_connection = {}
@@search_collections = {}
@@default_namespace = nil
@@id_prefix = nil
@@redis_client = nil
@@ -101,7 +102,7 @@ def self.language_includes(lang)
end

def self.add_namespace(shortcut, namespace,default=false)
if !(namespace.instance_of? RDF::Vocabulary)
unless namespace.instance_of? RDF::Vocabulary
raise ArgumentError, "Namespace must be a RDF::Vocabulary object"
end
@@namespaces[shortcut.to_sym] = namespace
@@ -252,11 +253,9 @@ def self.configure
raise ArgumentError, "Configuration needs to receive a code block"
end
yield self
configure_sanity_check()
configure_sanity_check

if @@search_backends.length > 0
@@search_backends.each { |name, val| @@search_connection[name] = RSolr.connect(url: search_conf(name), timeout: 1800, open_timeout: 1800) }
end
init_search_connections

@@namespaces.freeze
@@sparql_backends.freeze
@@ -280,8 +279,44 @@ def self.search_conf(name=:main)
return @@search_backends[name][:service]
end

def self.search_connection(name=:main)
return @@search_connection[name]
def self.search_connection(collection_name)
return search_client(collection_name).solr
end

def self.search_client(collection_name)
@@search_connection[collection_name]
end

def self.add_search_connection(collection_name, search_backend = :main, &block)
@@search_collections[collection_name] = {
search_backend: search_backend,
block: block_given? ? block : nil
}
end

def self.search_connections
@@search_connection
end

def self.init_search_connection(collection_name, search_backend = :main, block = nil, force: false)
return @@search_connection[collection_name] if @@search_connection[collection_name] && !force

@@search_connection[collection_name] = SOLR::SolrConnector.new(search_conf(search_backend), collection_name)
if block
block.call(@@search_connection[collection_name].schema_generator)
@@search_connection[collection_name].enable_custom_schema
end
@@search_connection[collection_name].init(force)
@@search_connection[collection_name]
end


def self.init_search_connections(force=false)
@@search_collections.each do |collection_name, backend|
search_backend = backend[:search_backend]
block = backend[:block]
init_search_connection(collection_name, search_backend, block, force: force)
end
end

def self.sparql_query_client(name=:main)
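A quick sketch of the new connection-management API in lib/goo.rb. The :article collection name is hypothetical; all methods shown are taken from the hunk above.

    Goo.add_search_connection(:article)      # register a collection on the :main backend
    Goo.init_search_connections              # instantiate a SolrConnector per registered collection

    client = Goo.search_client(:article)     # the SOLR::SolrConnector wrapper
    raw    = Goo.search_connection(:article) # the underlying RSolr client (connector.solr)

    # re-create the collection, e.g. after a schema change
    Goo.init_search_connection(:article, force: true)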
16 changes: 11 additions & 5 deletions lib/goo/base/settings/attribute.rb
@@ -158,11 +158,6 @@ def list?(attr)
attribute_settings(attr)[:enforce].include?(:list)
end

def index_attribute?(attr)
return false if attribute_settings(attr).nil?
attribute_settings(attr)[:index]
end

def transitive?(attr)
return false unless @model_settings[:attributes].include?(attr)
attribute_settings(attr)[:transitive] == true
@@ -212,6 +207,17 @@ def attribute_uri(attr, *args)
Goo.vocabulary(nil)[attr]
end


def indexable?(attr)
setting = attribute_settings(attr.to_sym)
setting && (setting[:index].nil? || setting[:index] == true)
end

def fuzzy_searchable?(attr)
attribute_settings(attr)[:fuzzy_search] == true
end


private

def set_no_list_by_default(options)
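The two new predicates read per-attribute options: an attribute is indexable unless it was declared with index: false, and fuzzy search is opt-in via fuzzy_search: true. A sketch, assuming a hypothetical model and option values (the predicates themselves are from the hunk above):

    class Article < Goo::Base::Resource
      model :article, name_with: :title
      attribute :title,    enforce: [:string], fuzzy_search: true
      attribute :internal, enforce: [:string], index: false
    end

    Article.indexable?(:title)        # => true  (no :index option defaults to indexed)
    Article.indexable?(:internal)     # => false
    Article.fuzzy_searchable?(:title) # => true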
2 changes: 1 addition & 1 deletion lib/goo/config/config.rb
@@ -20,7 +20,7 @@ def config(&block)
@settings.goo_path_query ||= ENV['GOO_PATH_QUERY'] || '/sparql/'
@settings.goo_path_data ||= ENV['GOO_PATH_DATA'] || '/data/'
@settings.goo_path_update ||= ENV['GOO_PATH_UPDATE'] || '/update/'
@settings.search_server_url ||= ENV['SEARCH_SERVER_URL'] || 'http://localhost:8983/solr/term_search_core1'
@settings.search_server_url ||= ENV['SEARCH_SERVER_URL'] || 'http://localhost:8983/solr'
@settings.redis_host ||= ENV['REDIS_HOST'] || 'localhost'
@settings.redis_port ||= ENV['REDIS_PORT'] || 6379
@settings.bioportal_namespace ||= ENV['BIOPORTAL_NAMESPACE'] || 'http://data.bioontology.org/'
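Note the default URL now points at the Solr root rather than a single core, since collections are created per connection. Per the diff above, the default can still be overridden through the environment before Goo's config runs (the host below is an example):

    ENV['SEARCH_SERVER_URL'] = 'http://solr.example.org:8983/solr'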
194 changes: 138 additions & 56 deletions lib/goo/search/search.rb
@@ -1,4 +1,5 @@
require 'rsolr'
require_relative 'solr/solr_connector'

module Goo

@@ -8,102 +9,183 @@ def self.included(base)
base.extend(ClassMethods)
end

def index(connection_name=:main)
def index(connection_name = nil, to_set = nil)
raise ArgumentError, "ID must be set to be able to index" if @id.nil?
doc = indexable_object
Goo.search_connection(connection_name).add(doc)
document = indexable_object(to_set)

return if document.blank? || document[:id].blank?

connection_name ||= self.class.search_collection_name
unindex(connection_name)
self.class.search_client(connection_name).index_document(document)
end

def index_update(to_set, connection_name=:main)
def index_update(attributes_to_update, connection_name = nil, to_set = nil)
raise ArgumentError, "ID must be set to be able to index" if @id.nil?
raise ArgumentError, "Field names to be updated in index must be provided" if to_set.nil?
raise ArgumentError, "Field names to be updated in index must be provided" if attributes_to_update.blank?

old_doc = self.class.search("id:\"#{index_id}\"").dig("response", "docs")&.first

raise ArgumentError, "ID must be set to be able to index" if old_doc.blank?

doc = indexable_object(to_set)

doc.each { |key, val|
next if key === :id
doc[key] = {set: val}
}
doc.each do |key, val|
next unless attributes_to_update.any? { |attr| key.to_s.eql?(attr.to_s) || key.to_s.include?("#{attr}_") }
old_doc[key] = val
end

connection_name ||= self.class.search_collection_name
unindex(connection_name)

Goo.search_connection(connection_name).update(
data: "[#{doc.to_json}]",
headers: { 'Content-Type' => 'application/json' }
)
old_doc.reject! { |k, v| k.to_s.end_with?('_sort') || k.to_s.end_with?('_sorts') }
old_doc.delete("_version_")
self.class.search_client(connection_name).index_document(old_doc)
end

def unindex(connection_name=:main)
id = index_id
Goo.search_connection(connection_name).delete_by_id(id)
def unindex(connection_name = nil)
connection_name ||= self.class.search_collection_name
self.class.search_client(connection_name).delete_by_id(index_id)
end

# default implementation, should be overridden by child class
def index_id()
def index_id
raise ArgumentError, "ID must be set to be able to index" if @id.nil?
@id.to_s
end

# default implementation, should be overridden by child class
def index_doc(to_set=nil)
def index_doc(to_set = nil)
raise NoMethodError, "You must define method index_doc in your class for it to be indexable"
end

def indexable_object(to_set=nil)
doc = index_doc(to_set)
# use resource_id for the actual term id because :id is a Solr reserved field
doc[:resource_id] = doc[:id].to_s
doc[:id] = index_id.to_s
doc
def embedded_doc
raise NoMethodError, "You must define method embedded_doc in your class for it to be indexable"
end

def indexable_object(to_set = nil)
begin
document = index_doc(to_set)
rescue NoMethodError
document = self.to_hash.reject { |k, _| !self.class.indexable?(k) }
document.transform_values! do |v|
is_array = v.is_a?(Array)
v = Array(v).map do |x|
if x.is_a?(Goo::Base::Resource)
x.embedded_doc rescue x.id.to_s
else
if x.is_a?(RDF::URI)
x.to_s
else
x.respond_to?(:object) ? x.object : x
end
end
end
is_array ? v : v.first
end
end

document = document.reduce({}) do |h, (k, v)|
if v.is_a?(Hash)
v.each { |k2, v2| h["#{k}_#{k2}".to_sym] = v2 }
else
h[k] = v
end
h
end

model_name = self.class.model_name.to_s.downcase
document.delete(:id)
document.delete("id")

document.transform_keys! do |k|
self.class.index_document_attr(k)
end

document[:resource_id] = self.id.to_s
document[:resource_model] = model_name
document[:id] = index_id.to_s
document
end

module ClassMethods

def search(q, params={}, connection_name=:main)
params["q"] = q
Goo.search_connection(connection_name).post('select', :data => params)
def index_enabled?
!@model_settings[:search_collection].nil?
end

def indexBatch(collection, connection_name=:main)
docs = Array.new
collection.each do |c|
docs << c.indexable_object
def enable_indexing(collection_name, search_backend = :main, &block)
@model_settings[:search_collection] = collection_name

if block_given?
# optional block to generate custom schema
Goo.add_search_connection(collection_name, search_backend, &block)
else
Goo.add_search_connection(collection_name, search_backend)
end
Goo.search_connection(connection_name).add(docs)

after_save :index
after_destroy :unindex
end

def unindexBatch(collection, connection_name=:main)
docs = Array.new
collection.each do |c|
docs << c.index_id
end
Goo.search_connection(connection_name).delete_by_id(docs)
def search_collection_name
@model_settings[:search_collection]
end

def search_client(connection_name = search_collection_name)
Goo.search_client(connection_name)
end

def custom_schema?(connection_name = search_collection_name)
search_client(connection_name).custom_schema?
end

def schema_generator
Goo.search_client(search_collection_name).schema_generator
end

def index_document_attr(key)
return key.to_s if custom_schema? || self.attribute_settings(key).nil?

type = self.datatype(key)
is_list = self.list?(key)
fuzzy = self.fuzzy_searchable?(key)
search_client.index_document_attr(key, type, is_list, fuzzy)
end

def search(q, params = {}, connection_name = search_collection_name)
search_client(connection_name).search(q, params)
end

def submit_search_query(query, params = {}, connection_name = search_collection_name)
search_client(connection_name).submit_search_query(query, params)
end

def indexBatch(collection, connection_name = search_collection_name)
docs = collection.map(&:indexable_object)
search_client(connection_name).index_document(docs)
end

def unindexByQuery(query, connection_name=:main)
Goo.search_connection(connection_name).delete_by_query(query)
def unindexBatch(collection, connection_name = search_collection_name)
docs = collection.map(&:index_id)
search_client(connection_name).delete_by_id(docs)
end

# Get the doc that will be indexed in solr
def get_indexable_object()
# To make the code less readable the guys that wrote it managed to hide the real function called by this line
# It is "get_index_doc" in ontologies_linked_data Class.rb
doc = self.class.model_settings[:search_options][:document].call(self)
doc[:resource_id] = doc[:id].to_s
doc[:id] = get_index_id.to_s
# id: clsUri_ONTO-ACRO_submissionNumber. i.e.: http://lod.nal.usda.gov/nalt/5260_NALT_4
doc
def unindexByQuery(query, connection_name = search_collection_name)
search_client(connection_name).delete_by_query(query)
end

def indexCommit(attrs=nil, connection_name=:main)
Goo.search_connection(connection_name).commit(:commit_attributes => attrs || {})
def indexCommit(attrs = nil, connection_name = search_collection_name)
search_client(connection_name).index_commit(attrs)
end

def indexOptimize(attrs=nil, connection_name=:main)
Goo.search_connection(connection_name).optimize(:optimize_attributes => attrs || {})
def indexOptimize(attrs = nil, connection_name = search_collection_name)
search_client(connection_name).index_optimize(attrs)
end

def indexClear(connection_name=:main)
# WARNING: this deletes ALL data from the index
unindexByQuery("*:*", connection_name)
# WARNING: this deletes ALL data from the index
def indexClear(connection_name = search_collection_name)
search_client(connection_name).clear_all_data
end
end
end
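Putting it together, a sketch of the reworked instance and batch API. Article is the hypothetical model from above; the query syntax is abbreviated and the unindexByQuery field is an assumption.

    article = Article.new
    article.title = 'On Solr Schemas'
    article.save                           # after_save triggers #index

    article.index_update([:title])         # re-index only the title-derived fields
    article.unindex                        # remove this document from the index

    Article.indexBatch(Article.where.all)  # bulk add
    Article.indexCommit                    # commit pending changes
    Article.unindexByQuery('status:draft') # delete matching documents (hypothetical field)
    Article.indexClear                     # WARNING: deletes ALL data from the index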