Merge pull request #65 from ncats/aw_a005_greektome

WIP: Need to treat these characters as “real” indexed ones.
ncats · Apr 5, 2022 · e6def8c · e6def8c
2 parents 3629d10 + 41d1ea7
commit e6def8c
Show file tree

Hide file tree

Showing 5 changed files with 103 additions and 30 deletions.
diff --git a/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/IndexedTextEncoder.java b/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/IndexedTextEncoder.java
@@ -0,0 +1,12 @@
+package ix.core.search.text;
+
+/**
+ * A string-to-string encoding step. Implementations supply {@link #encode(String)};
+ * encoders can be chained with {@link #combine(IndexedTextEncoder)}.
+ */
+public interface IndexedTextEncoder{
+    /**
+     * Produces the encoded form of the given text.
+     *
+     * @param s the text to encode
+     * @return the encoded text
+     */
+    String encode(String s);
+
+    /**
+     * Chains this encoder with another one.
+     *
+     * @param enc the encoder that receives the output of this encoder
+     * @return an encoder that applies this encoder first and then {@code enc}
+     */
+    default IndexedTextEncoder combine(IndexedTextEncoder enc){
+        // Inside a lambda "this" still denotes the enclosing encoder instance.
+        return input -> enc.encode(this.encode(input));
+    }
+}
diff --git a/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/StandardEncoding.java b/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/StandardEncoding.java
@@ -0,0 +1,30 @@
+package ix.core.search.text;
+
+import lombok.Data;
+
+import java.util.regex.Pattern;
+
+/**
+ * An {@link IndexedTextEncoder} that replaces every literal occurrence of one
+ * string with another.
+ *
+ * <p>Despite its name, {@code regex} is matched literally: it is passed through
+ * {@link Pattern#quote(String)} before being compiled, so regular-expression
+ * metacharacters in it have no special meaning. The replacement text is
+ * likewise inserted literally.
+ */
+@Data
+public class StandardEncoding implements IndexedTextEncoder{
+
+    private String regex;
+    private String replaceWith;
+    // Compiled form of regex. Built lazily in encode() so that instances created
+    // with the no-argument constructor and populated through setters still work.
+    private Pattern _pattern;
+
+    public StandardEncoding() {}
+
+    /**
+     * @param regex       the literal text to search for
+     * @param replaceWith the literal text inserted in place of each occurrence
+     */
+    public StandardEncoding(String regex, String replaceWith) {
+        this.regex = regex;
+        this.replaceWith = replaceWith;
+        _pattern=Pattern.compile(Pattern.quote(regex));
+    }
+
+    /**
+     * Sets the literal text to search for. Declared explicitly so that the
+     * cached pattern is discarded; otherwise a later {@link #encode(String)}
+     * would keep matching the previous value.
+     *
+     * @param regex the literal text to search for
+     */
+    public void setRegex(String regex) {
+        this.regex = regex;
+        this._pattern = null;
+    }
+
+    /**
+     * Replaces every occurrence of {@code regex} in {@code s} with {@code replaceWith}.
+     *
+     * @param s the text to encode
+     * @return {@code s} with all occurrences replaced
+     */
+    @Override
+    public String encode(String s){
+        if(_pattern==null){
+            _pattern=Pattern.compile(Pattern.quote(regex));
+        }
+        // The search text is literal, so the replacement must be literal as well:
+        // without quoting, '$' and '\' in replaceWith would be read as group
+        // references / escapes by Matcher.replaceAll.
+        return _pattern.matcher(s).replaceAll(java.util.regex.Matcher.quoteReplacement(replaceWith));
+    }
+}
diff --git a/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/StandardEncodings.java b/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/StandardEncodings.java
@@ -0,0 +1,43 @@
+package ix.core.search.text;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Registry of the {@link StandardEncoding} replacements applied to text:
+ * the optical-rotation markers {@code (-)}, {@code (+)}, {@code (+/-)} and
+ * {@code ±}, followed by the lowercase and uppercase Greek letters.
+ *
+ * <p>Encodings are stored, and meant to be applied, in insertion order.
+ */
+public class StandardEncodings {
+
+   // Despite the *_REGEX names these are literal strings; StandardEncoding quotes them.
+   private static final String LEVO_REGEX = "(-)";
+   private static final String DEXTRO_REGEX = "(+)";
+   private static final String RACEMIC_REGEX = "(+/-)";
+   private static final String RACEMIC_COMBO_REGEX = "±"; // Associated with RACEMIC_WORD
+
+   public static final String LEVO_WORD = "LEVOROTATION";
+   public static final String RACEMIC_WORD = "RACEMICROTATION";
+   public static final String DEXTRO_WORD = "DEXTROROTATION";
+
+   // Semicolon-separated list of alternating (character, replacement) tokens.
+   private static final String REPLACEMENT_SOURCE_GREEK = "\u03B1;.ALPHA.;\u03B2;.BETA.;\u03B3;.GAMMA.;\u03B4;.DELTA.;\u03B5;.EPSILON.;\u03B6;.ZETA.;\u03B7;.ETA.;\u03B8;.THETA.;\u03B9;.IOTA.;\u03BA;.KAPPA.;\u03BB;.LAMBDA.;\u03BC;.MU.;\u03BD;.NU.;\u03BE;.XI.;\u03BF;.OMICRON.;\u03C0;.PI.;\u03C1;.RHO.;\u03C2;.SIGMA.;\u03C3;.SIGMA.;\u03C4;.TAU.;\u03C5;.UPSILON.;\u03C6;.PHI.;\u03C7;.CHI.;\u03C8;.PSI.;\u03C9;.OMEGA.;\u0391;.ALPHA.;\u0392;.BETA.;\u0393;.GAMMA.;\u0394;.DELTA.;\u0395;.EPSILON.;\u0396;.ZETA.;\u0397;.ETA.;\u0398;.THETA.;\u0399;.IOTA.;\u039A;.KAPPA.;\u039B;.LAMBDA.;\u039C;.MU.;\u039D;.NU.;\u039E;.XI.;\u039F;.OMICRON.;\u03A0;.PI.;\u03A1;.RHO.;\u03A3;.SIGMA.;\u03A4;.TAU.;\u03A5;.UPSILON.;\u03A6;.PHI.;\u03A7;.CHI.;\u03A8;.PSI.;\u03A9;.OMEGA.";
+   // The uppercase and lowercase forms of the 24 letters are: Α α, Β β, Γ γ, Δ δ, Ε ε, Ζ ζ, Η η, Θ θ, Ι ι, Κ κ, Λ λ, Μ μ, Ν ν, Ξ ξ, Ο ο, Π π, Ρ ρ, Σ σ/ς, Τ τ, Υ υ, Φ φ, Χ χ, Ψ ψ, and Ω ω.
+
+   // Created during class initialization rather than lazily, so no unsynchronized
+   // check-then-create is needed in getInstance(). Must stay below the constants above.
+   private static final StandardEncodings _INSTANCE = new StandardEncodings();
+
+   private final List<StandardEncoding> encodings = new ArrayList<>();
+
+   /**
+    * @return the encodings in the order they should be applied; this is the
+    *         internal list, not a copy
+    */
+   public List<StandardEncoding> getEncodings(){ return encodings; }
+
+   /**
+    * @return the shared instance
+    */
+   public static StandardEncodings getInstance(){
+      return _INSTANCE;
+   }
+
+   StandardEncodings() {
+      encodings.add(new StandardEncoding(LEVO_REGEX, LEVO_WORD));
+      encodings.add(new StandardEncoding(DEXTRO_REGEX, DEXTRO_WORD));
+      encodings.add(new StandardEncoding(RACEMIC_REGEX, RACEMIC_WORD));
+      // The plus-minus sign maps to the same word as "(+/-)".
+      encodings.add(new StandardEncoding(RACEMIC_COMBO_REGEX, RACEMIC_WORD));
+
+      String[] replacementTokensGreek = REPLACEMENT_SOURCE_GREEK.split(";");
+      // Tokens come in (character, replacement) pairs; the bound keeps i + 1 in range.
+      for (int i = 0; i + 1 < replacementTokensGreek.length; i += 2) {
+         encodings.add(new StandardEncoding(replacementTokensGreek[i], replacementTokensGreek[i + 1]));
+      }
+   }
+}
+
diff --git a/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/TextIndexer.java b/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/TextIndexer.java
@@ -70,7 +70,6 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Version;
 import org.springframework.beans.factory.annotation.Autowired;
-
 import java.io.*;
 import java.nio.file.Files;
 import java.text.DateFormat;
@@ -104,6 +103,7 @@ public class TextIndexer implements Closeable, ProcessListener {
 
     private TextIndexerConfig textIndexerConfig;
 
+
 //	public static final boolean INDEXING_ENABLED = ConfigHelper.getBoolean("ix.textindex.enabled",true);
 //	private static final boolean USE_ANALYSIS =    ConfigHelper.getBoolean("ix.textindex.fieldsuggest",true);
 //    private static final CachedSupplier<Boolean> SHOULD_LOG_INDEXING =    CachedSupplier.of(new Supplier<Boolean>() {
@@ -1083,7 +1083,7 @@ public DirectoryTaxonomyWriter getTaxonWriter() {
     }
 
     private TextIndexer(IndexerServiceFactory indexerServiceFactory, IndexerService indexerService, TextIndexerConfig textIndexerConfig, IndexValueMakerFactory indexValueMakerFactory, Function<EntityWrapper, Boolean> deepKindFunction) {
-        
+
         // empty instance should only be used for
 		// facet subsearching so we only need to have
 		// a single thread...
@@ -1120,6 +1120,7 @@ public TextIndexer(File dir, IndexerServiceFactory indexerServiceFactory, Indexe
     }
 
     private void initialSetup() throws IOException {
+
         searchManager = this.indexerService.createSearchManager();
         facetFileDir = new File(baseDir, "facet");
         Files.createDirectories(facetFileDir.toPath());
@@ -2562,6 +2563,7 @@ private void add(EntityWrapper ew, boolean force) throws IOException {
                             if(textIndexerConfig.isShouldLog()){
                                 log.debug("[LOG_INDEX] .." + f.name() + ":" + text + " [" + f.getClass().getName() + "]");
                             }
+// This is where you can see how things get indexed.
 //						    System.out.println(".." + f.name() + ":" + text + " [" + f.getClass().getName() + "]");
 //							if (DEBUG(2)){
 //								log.debug(".." + f.name() + ":" + text + " [" + f.getClass().getName() + "]");
@@ -3350,18 +3352,19 @@ public void instrumentIndexableValue(Consumer<IndexableField> fields, IndexableV
 	public static String toExactMatchString(String in){
 		return TextIndexer.START_WORD + replaceSpecialCharsForExactMatch(in) + TextIndexer.STOP_WORD;
 	}
-
+
+
+
 	public static String toExactMatchQueryString(String in){
         return toExactMatchString(in).replace("*", "").replace("?", ""); //remove wildcards
     }
 
 	private static String replaceSpecialCharsForExactMatch(String in) {
-
-		String tmp = LEVO_PATTERN.matcher(in).replaceAll(LEVO_WORD);
-		tmp = DEXTRO_PATTERN.matcher(tmp).replaceAll(DEXTRO_WORD);
-        tmp = RACEMIC_PATTERN.matcher(tmp).replaceAll(RACEMIC_WORD);
-		return tmp;
-
+        String tmp = in;
+        for(StandardEncoding se: StandardEncodings.getInstance().getEncodings()) {
+            tmp=se.encode(tmp);
+        }
+        return tmp;
 	}
 
 	/*
@@ -3372,30 +3375,18 @@ private static String replaceSpecialCharsForExactMatch(String in) {
 	//TODO: this is a fairly hacky way to try to recreate simple character sequence-level
 	//functionality within lucene, and there needs to be a better way
 	private static String transformQueryForExactMatch(String in){
-
+        // This is called when doing searches and maybe other cases
 		String tmp =  START_PATTERN.matcher(in).replaceAll(TextIndexer.START_WORD);
 		tmp =  STOP_PATTERN.matcher(tmp).replaceAll(TextIndexer.STOP_WORD);
-
-
-		tmp =  LEVO_PATTERN.matcher(tmp).replaceAll(TextIndexer.LEVO_WORD);
-		tmp =  DEXTRO_PATTERN.matcher(tmp).replaceAll(TextIndexer.DEXTRO_WORD);
-        tmp =  RACEMIC_PATTERN.matcher(tmp).replaceAll(TextIndexer.RACEMIC_WORD);
-
-		return tmp;
+        for(StandardEncoding se: StandardEncodings.getInstance().getEncodings()) {
+            tmp=se.encode(tmp);
+        }
+        return tmp;
 	}
 
 	private static final Pattern START_PATTERN = Pattern.compile(TextIndexer.GIVEN_START_WORD,Pattern.LITERAL );
 	private static final Pattern STOP_PATTERN = Pattern.compile(TextIndexer.GIVEN_STOP_WORD,Pattern.LITERAL );
 
-	private static final Pattern LEVO_PATTERN = Pattern.compile(Pattern.quote("(-)"));
-	private static final Pattern DEXTRO_PATTERN = Pattern.compile(Pattern.quote("(+)"));
-    private static final Pattern RACEMIC_PATTERN = Pattern.compile(Pattern.quote("(+/-)"));
-
-	private static final String LEVO_WORD = "LEVOROTATION";
-    private static final String RACEMIC_WORD = "RACEMICROTATION";
-	private static final String DEXTRO_WORD = "DEXTROROTATION";
-
-
 	/**
 	 * Add the specified field and value pair to the suggests
 	 * which are used for type-ahead queries.

diff --git a/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/TextIndexerConfig.java b/gsrs-spring-legacy-indexer/src/main/java/ix/core/search/text/TextIndexerConfig.java
@@ -21,10 +21,7 @@ public class TextIndexerConfig {
     private boolean fieldsuggest;
     @Value("#{new Boolean('${ix.textindex.shouldLog:false}')}")
     private boolean shouldLog;
-
-
-
-
+
 //    private static final boolean USE_ANALYSIS =    ConfigHelper.getBoolean("ix.textindex.fieldsuggest",true);
 
     @Value("#{new Integer('${ix.fetchWorkerCount:4}')}")