Skip to content

Commit

Permalink
Merge pull request #65 from ncats/aw_a005_greektome
Browse files Browse the repository at this point in the history
WIP: Need to treat these characters as “real” indexed ones.
  • Loading branch information
tylerperyea authored Apr 5, 2022
2 parents 3629d10 + 41d1ea7 commit e6def8c
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package ix.core.search.text;

/**
 * A text transformation applied to strings before they are indexed or searched.
 *
 * <p>The interface has a single abstract method, so it can be implemented with a lambda.
 */
public interface IndexedTextEncoder{

    /**
     * Transforms the given text.
     *
     * @param s the text to encode
     * @return the encoded text
     */
    String encode(String s);

    /**
     * Chains this encoder with another one.
     *
     * @param enc the encoder applied to the output of this encoder
     * @return an encoder that first applies this encoder and then {@code enc}
     */
    default IndexedTextEncoder combine(IndexedTextEncoder enc){
        // Inside the lambda, "this" is the enclosing encoder, so no alias is needed.
        return input -> enc.encode(this.encode(input));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package ix.core.search.text;

import lombok.Data;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * An {@link IndexedTextEncoder} that replaces every literal occurrence of one
 * string with another.
 *
 * <p>Despite its name, {@code regex} is matched literally: it is always passed
 * through {@link Pattern#quote(String)} before compilation. The replacement is
 * inserted literally as well.
 */
@Data
public class StandardEncoding implements IndexedTextEncoder{

    /** Text to search for; matched literally, not as a regular expression. */
    private String regex;
    /** Text inserted in place of every match. */
    private String replaceWith;
    /** Compiled form of {@link #regex}; built lazily and reset whenever the regex changes. */
    private Pattern _pattern;

    /** Creates an unconfigured encoder; {@code regex} and {@code replaceWith} must be set before use. */
    public StandardEncoding() {}

    /**
     * Creates an encoder that replaces {@code regex} with {@code replaceWith}.
     *
     * @param regex literal text to search for
     * @param replaceWith literal text to substitute for every match
     */
    public StandardEncoding(String regex, String replaceWith) {
        this.regex = regex;
        this.replaceWith = replaceWith;
        _pattern = compileLiteral(regex);
    }

    /**
     * Sets the text to search for and discards the cached pattern so the next
     * {@link #encode(String)} call matches the new value instead of the old one.
     * Declared explicitly so that it is used in place of the Lombok-generated setter.
     *
     * @param regex literal text to search for
     */
    public void setRegex(String regex) {
        this.regex = regex;
        this._pattern = null;
    }

    /**
     * Replaces every occurrence of {@code regex} in {@code s} with {@code replaceWith}.
     *
     * @param s the text to encode
     * @return {@code s} with all matches replaced
     */
    @Override
    public String encode(String s){
        if (_pattern == null) {
            _pattern = compileLiteral(regex);
        }
        // quoteReplacement keeps '$' and '\' in replaceWith from being read as group references/escapes.
        return _pattern.matcher(s).replaceAll(Matcher.quoteReplacement(replaceWith));
    }

    /** Compiles {@code literal} into a pattern that matches exactly that character sequence. */
    private static Pattern compileLiteral(String literal) {
        return Pattern.compile(Pattern.quote(literal));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package ix.core.search.text;

import java.util.ArrayList;
import java.util.List;

/**
 * Holds the ordered list of {@link StandardEncoding}s built from the constants
 * below: optical-rotation markers ("(-)", "(+)", "(+/-)", "±") and the Greek
 * letters, each mapped to a plain-text word.
 *
 * <p>{@link #getEncodings()} returns the internal list itself, not a copy.
 */
public class StandardEncodings {

    private static final String LEVO_REGEX = "(-)";
    private static final String DEXTRO_REGEX = "(+)";
    private static final String RACEMIC_REGEX = "(+/-)";
    // U+00B1 PLUS-MINUS SIGN, written as an escape so it does not depend on the source file encoding.
    private static final String RACEMIC_COMBO_REGEX = "\u00B1"; // Associated with RACEMIC_WORD

    public static final String LEVO_WORD = "LEVOROTATION";
    public static final String RACEMIC_WORD = "RACEMICROTATION";
    public static final String DEXTRO_WORD = "DEXTROROTATION";

    // Semicolon-separated pairs: a Greek letter followed by its replacement word.
    private static final String REPLACEMENT_SOURCE_GREEK = "\u03B1;.ALPHA.;\u03B2;.BETA.;\u03B3;.GAMMA.;\u03B4;.DELTA.;\u03B5;.EPSILON.;\u03B6;.ZETA.;\u03B7;.ETA.;\u03B8;.THETA.;\u03B9;.IOTA.;\u03BA;.KAPPA.;\u03BB;.LAMBDA.;\u03BC;.MU.;\u03BD;.NU.;\u03BE;.XI.;\u03BF;.OMICRON.;\u03C0;.PI.;\u03C1;.RHO.;\u03C2;.SIGMA.;\u03C3;.SIGMA.;\u03C4;.TAU.;\u03C5;.UPSILON.;\u03C6;.PHI.;\u03C7;.CHI.;\u03C8;.PSI.;\u03C9;.OMEGA.;\u0391;.ALPHA.;\u0392;.BETA.;\u0393;.GAMMA.;\u0394;.DELTA.;\u0395;.EPSILON.;\u0396;.ZETA.;\u0397;.ETA.;\u0398;.THETA.;\u0399;.IOTA.;\u039A;.KAPPA.;\u039B;.LAMBDA.;\u039C;.MU.;\u039D;.NU.;\u039E;.XI.;\u039F;.OMICRON.;\u03A0;.PI.;\u03A1;.RHO.;\u03A3;.SIGMA.;\u03A4;.TAU.;\u03A5;.UPSILON.;\u03A6;.PHI.;\u03A7;.CHI.;\u03A8;.PSI.;\u03A9;.OMEGA.";
    // The uppercase and lowercase forms of the 24 letters are: Α α, Β β, Γ γ, Δ δ, Ε ε, Ζ ζ, Η η, Θ θ, Ι ι, Κ κ, Λ λ, Μ μ, Ν ν, Ξ ξ, Ο ο, Π π, Ρ ρ, Σ σ/ς, Τ τ, Υ υ, Φ φ, Χ χ, Ψ ψ, and Ω ω.

    // Created during class initialization, so getInstance() always returns the same object.
    private static final StandardEncodings _INSTANCE = new StandardEncodings();

    private List<StandardEncoding> encodings = new ArrayList<StandardEncoding>();

    /**
     * @return the encodings in the order they should be applied
     */
    public List<StandardEncoding> getEncodings(){ return encodings; }

    /**
     * @return the shared instance
     */
    public static StandardEncodings getInstance(){
        return _INSTANCE;
    }

    /** Builds the list: the four rotation markers first, then one entry per Greek letter. */
    StandardEncodings() {
        encodings.add(new StandardEncoding(LEVO_REGEX, LEVO_WORD));
        encodings.add(new StandardEncoding(DEXTRO_REGEX, DEXTRO_WORD));
        encodings.add(new StandardEncoding(RACEMIC_REGEX, RACEMIC_WORD));
        // "±" is the second spelling of the racemic marker and maps to the same word.
        encodings.add(new StandardEncoding(RACEMIC_COMBO_REGEX, RACEMIC_WORD));

        String[] replacementTokensGreek = REPLACEMENT_SOURCE_GREEK.split(";");
        // Tokens come in (letter, word) pairs; the bound skips a trailing unpaired token.
        for (int i = 0; i + 1 < replacementTokensGreek.length; i += 2) {
            encodings.add(new StandardEncoding(replacementTokensGreek[i], replacementTokensGreek[i + 1]));
        }
    }
}

Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Autowired;

import java.io.*;
import java.nio.file.Files;
import java.text.DateFormat;
Expand Down Expand Up @@ -104,6 +103,7 @@ public class TextIndexer implements Closeable, ProcessListener {

// Settings for this indexer; add(...) reads isShouldLog() from it before logging indexed fields.
private TextIndexerConfig textIndexerConfig;


// public static final boolean INDEXING_ENABLED = ConfigHelper.getBoolean("ix.textindex.enabled",true);
// private static final boolean USE_ANALYSIS = ConfigHelper.getBoolean("ix.textindex.fieldsuggest",true);
// private static final CachedSupplier<Boolean> SHOULD_LOG_INDEXING = CachedSupplier.of(new Supplier<Boolean>() {
Expand Down Expand Up @@ -1083,7 +1083,7 @@ public DirectoryTaxonomyWriter getTaxonWriter() {
}

private TextIndexer(IndexerServiceFactory indexerServiceFactory, IndexerService indexerService, TextIndexerConfig textIndexerConfig, IndexValueMakerFactory indexValueMakerFactory, Function<EntityWrapper, Boolean> deepKindFunction) {

// empty instance should only be used for
// facet subsearching so we only need to have
// a single thread...
Expand Down Expand Up @@ -1120,6 +1120,7 @@ public TextIndexer(File dir, IndexerServiceFactory indexerServiceFactory, Indexe
}

private void initialSetup() throws IOException {

searchManager = this.indexerService.createSearchManager();
facetFileDir = new File(baseDir, "facet");
Files.createDirectories(facetFileDir.toPath());
Expand Down Expand Up @@ -2562,6 +2563,7 @@ private void add(EntityWrapper ew, boolean force) throws IOException {
if(textIndexerConfig.isShouldLog()){
log.debug("[LOG_INDEX] .." + f.name() + ":" + text + " [" + f.getClass().getName() + "]");
}
// This is where you can see how things get indexed.
// System.out.println(".." + f.name() + ":" + text + " [" + f.getClass().getName() + "]");
// if (DEBUG(2)){
// log.debug(".." + f.name() + ":" + text + " [" + f.getClass().getName() + "]");
Expand Down Expand Up @@ -3350,18 +3352,19 @@ public void instrumentIndexableValue(Consumer<IndexableField> fields, IndexableV
/**
 * Builds the exact-match form of a value: the special characters are encoded
 * and the result is wrapped between the start and stop words.
 *
 * @param in the raw value
 * @return {@code START_WORD + encoded value + STOP_WORD}
 */
public static String toExactMatchString(String in){
    String encoded = replaceSpecialCharsForExactMatch(in);
    return TextIndexer.START_WORD + encoded + TextIndexer.STOP_WORD;
}




/**
 * Builds the exact-match form of a query value and strips the wildcard
 * characters {@code *} and {@code ?} from it.
 *
 * @param in the raw query value
 * @return the exact-match string without wildcards
 */
public static String toExactMatchQueryString(String in){
    String exact = toExactMatchString(in);
    String withoutStars = exact.replace("*", "");
    return withoutStars.replace("?", ""); //remove wildcards
}

/**
 * Runs the input through every encoder returned by
 * {@code StandardEncodings.getInstance().getEncodings()}, in list order.
 *
 * @param in the raw value
 * @return the value after all encoders have been applied
 */
private static String replaceSpecialCharsForExactMatch(String in) {
    String tmp = in;
    for (StandardEncoding se : StandardEncodings.getInstance().getEncodings()) {
        tmp = se.encode(tmp);
    }
    return tmp;
}

/*
Expand All @@ -3372,30 +3375,18 @@ private static String replaceSpecialCharsForExactMatch(String in) {
//TODO: this is a fairly hacky way to try to recreate simple character sequence-level
//functionality within lucene, and there needs to be a better way
/**
 * Rewrites a query for exact matching: the given start/stop markers are
 * replaced with the internal start/stop words, then every encoder from
 * {@code StandardEncodings.getInstance().getEncodings()} is applied in list order.
 *
 * @param in the query text
 * @return the transformed query text
 */
private static String transformQueryForExactMatch(String in){
    // This is called when doing searches and maybe other cases
    String tmp = START_PATTERN.matcher(in).replaceAll(TextIndexer.START_WORD);
    tmp = STOP_PATTERN.matcher(tmp).replaceAll(TextIndexer.STOP_WORD);

    for (StandardEncoding se : StandardEncodings.getInstance().getEncodings()) {
        tmp = se.encode(tmp);
    }
    return tmp;
}

// Literal (non-regex) matchers for the given start/stop markers; transformQueryForExactMatch
// uses them to substitute START_WORD and STOP_WORD.
private static final Pattern START_PATTERN = Pattern.compile(TextIndexer.GIVEN_START_WORD,Pattern.LITERAL );
private static final Pattern STOP_PATTERN = Pattern.compile(TextIndexer.GIVEN_STOP_WORD,Pattern.LITERAL );

// Literal matchers for the optical-rotation markers "(-)", "(+)" and "(+/-)".
private static final Pattern LEVO_PATTERN = Pattern.compile(Pattern.quote("(-)"));
private static final Pattern DEXTRO_PATTERN = Pattern.compile(Pattern.quote("(+)"));
private static final Pattern RACEMIC_PATTERN = Pattern.compile(Pattern.quote("(+/-)"));

// Words substituted for the rotation markers; same values as the public constants in StandardEncodings.
private static final String LEVO_WORD = "LEVOROTATION";
private static final String RACEMIC_WORD = "RACEMICROTATION";
private static final String DEXTRO_WORD = "DEXTROROTATION";


/**
* Add the specified field and value pair to the suggests
* which are used for type-ahead queries.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@ public class TextIndexerConfig {
private boolean fieldsuggest;
// Bound to the property ix.textindex.shouldLog; the placeholder supplies "false" when the property is unset.
@Value("#{new Boolean('${ix.textindex.shouldLog:false}')}")
private boolean shouldLog;





// private static final boolean USE_ANALYSIS = ConfigHelper.getBoolean("ix.textindex.fieldsuggest",true);

@Value("#{new Integer('${ix.fetchWorkerCount:4}')}")
Expand Down

0 comments on commit e6def8c

Please sign in to comment.