Merge branch 'ncats:master' into master

ncats · Jul 5, 2024 · 3914dc3 · 3914dc3
2 parents b2e6f2a + 8c59bfc
commit 3914dc3
Show file tree

Hide file tree

Showing 15 changed files with 210 additions and 38 deletions.
diff --git a/gsrs-module-substance-example/pom.xml b/gsrs-module-substance-example/pom.xml
@@ -252,7 +252,7 @@
         <dependency>
             <groupId>gov.nih.ncats</groupId>
             <artifactId>molwitch-cdk</artifactId>
-            <version>1.0.18</version>
+            <version>1.0.19</version>
         </dependency>
         <dependency>
             <groupId>org.springframework.boot</groupId>

diff --git a/gsrs-module-substance-example/src/test/java/example/chem/VocabFragmentCleanupTest.java b/gsrs-module-substance-example/src/test/java/example/chem/VocabFragmentCleanupTest.java
@@ -3,25 +3,45 @@
 import gov.nih.ncats.molwitch.Chemical;
 import ix.core.chem.Chem;
 import ix.ginas.models.v1.FragmentVocabularyTerm;
+import ix.ginas.utils.validation.validators.CVFragmentStructureValidator;
 import lombok.extern.slf4j.Slf4j;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
 
 import java.io.IOException;
+import java.util.Optional;
+import java.util.stream.Stream;
 
 @Slf4j
 public class VocabFragmentCleanupTest {
 
     @Test
     void testQueryFeatures() throws IOException {
+        // Simulate a fragment vocabulary term; only the SMILES text before the first space is parsed below
         FragmentVocabularyTerm fragmentVocabularyTerm = new FragmentVocabularyTerm();
-        fragmentVocabularyTerm.setFragmentStructure("[*]N[C@@H](CS[*])C([*])=O |$_R1;;;;;_R3;;_R2;$|");
+        fragmentVocabularyTerm.setFragmentStructure("[*]N[C@@H](CS[*])C([*])=O |$_R1;;;;;_R3;;_R2;$|"); //_R...  - atom alias
         String inputStructure = fragmentVocabularyTerm.getFragmentStructure().split(" ")[0];
+        //todo: include optional square brackets around *...
+        //  2 replacements
+        // Reference input: the same SMILES text with every '*' replaced by the literal 6He
+        String lexicallyCleaned = inputStructure.replace("*", "6He");
+        log.trace("lexicallyCleaned: {}", lexicallyCleaned);
+
         Chemical chem = Chemical.parse(inputStructure);
         chem = Chem.RemoveQueryFeaturesForPseudoInChI(chem);
-        String inchiKey = chem.toInchi().getKey();
-        log.debug("Created InChIKey: {}", inchiKey);
-        Assertions.assertTrue(inchiKey.length()>0);
+        String processedSmiles = chem.toSmiles();
+        log.trace("processedSmiles:{}", processedSmiles);
+        String inChIKey = chem.toInchi().getKey();
+
+        String expectedInChIKey =getInChiKey(lexicallyCleaned);// "OTIPWSTYBNONSP-CQOJXGFHSA-N";
+        log.debug("Created InChIKey: {}", inChIKey);
+        Assertions.assertEquals(expectedInChIKey, inChIKey);
+        String molfile = chem.toMol();
+        log.trace("molfile:\n {}", molfile);
+        Assertions.assertFalse(molfile.contains("*"));
     }
 
     @Test
@@ -36,4 +56,109 @@ void testOutput() throws IOException {
         Assertions.assertTrue(smiles.length()>0);
     }
 
+
+    @Test
+    void testSmilesParse() throws IOException {
+        // Simulate a fragment vocabulary term; only the SMILES text before the first space is parsed below
+        FragmentVocabularyTerm fragmentVocabularyTerm = new FragmentVocabularyTerm();
+        fragmentVocabularyTerm.setFragmentStructure("CCCCCCCC\\C=C/CCCCCCCCCCCCOCC(CO[*])OCCCCCCCCCCCC\\C=C/CCCCCCCC |$;;;;;;;;;;;;;;;;;;;;;;;;;;;_R92;;;;;;;;;;;;;;;;;;;;;;;$|");
+        String inputStructure = fragmentVocabularyTerm.getFragmentStructure().split(" ")[0];
+        String lexicallyCleaned = inputStructure.replace("*", "6He");
+        log.trace("lexicallyCleaned: {}", lexicallyCleaned);
+        // The expected InChIKey is computed from lexicallyCleaned; the actual one from the cleaned Chemical
+        Chemical chem = Chemical.parse(inputStructure);
+        chem = Chem.RemoveQueryFeaturesForPseudoInChI(chem);
+        String processedSmiles = chem.toSmiles();
+        log.trace("processedSmiles: {}", processedSmiles);
+        String inChIKey = chem.toInchi().getKey();
+
+        String expectedInChIKey =getInChiKey(lexicallyCleaned);
+        log.debug("Created InChIKey: {}", inChIKey);
+        Assertions.assertEquals(expectedInChIKey, inChIKey);
+        String molfile = chem.toMol();
+        log.trace("molfile:\n {}", molfile);
+        Assertions.assertFalse(molfile.contains("*"));
+    }
+
+    private static Stream<Arguments> fragmentSmiles() { // argument source for testSmilesParsing: one raw fragment structure string per case
+        return Stream.of(
+                Arguments.of("C[C@H](N[*])C([*])=O |$;;;_R1;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CS[*])C([*])=O |$_R1;;;;;_R3;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CC([*])=O)C([*])=O |$_R1;;;;;_R3;;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CCC([*])=O)C([*])=O |$_R1;;;;;;_R3;;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CC1=CC=CC=C1)C([*])=O |$_R1;;;;;;;;;;;_R2;$,c:6,8,t:4|"),
+                Arguments.of("[*]NCC([*])=O |$_R1;;;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CC1=CNC=N1)C([*])=O |$_R1;;;;;;;;;;_R2;$,c:7,t:4|"),
+                Arguments.of("CC[C@H](C)[C@H](N[*])C([*])=O |$;;;;;;_R1;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CCCCN[*])C([*])=O |$_R1;;;;;;;;_R3;;_R2;$|"),
+                Arguments.of("CC(C)C[C@H](N[*])C([*])=O |$;;;;;;_R1;;_R2;$|"),
+                Arguments.of("CSCC[C@H](N[*])C([*])=O |$;;;;;;_R1;;_R2;$|"),
+                Arguments.of("NC(=O)C[C@H](N[*])C([*])=O |$;;;;;;_R1;;_R2;$|"),
+                Arguments.of("[*]N1CCC[C@H]1C([*])=O |$_R1;;;;;;;_R2;$|"),
+                Arguments.of("NC(=O)CC[C@H](N[*])C([*])=O |$;;;;;;;_R1;;_R2;$|"),
+                Arguments.of("NC(=N)NCCC[C@H](N[*])C([*])=O |$;;;;;;;;;_R1;;_R2;$|"),
+                Arguments.of("OC[C@H](N[*])C([*])=O |$;;;;_R1;;_R2;$|"),
+                Arguments.of("C[C@@H](O)[C@H](N[*])C([*])=O |$;;;;;_R1;;_R2;$|"),
+                Arguments.of("CC(C)[C@H](N[*])C([*])=O |$;;;;;_R1;;_R2;$|"),
+                Arguments.of("[*]N[C@@H](CC1=CNC2=C1C=CC=C2)C([*])=O |$_R1;;;;;;;;;;;;;;_R2;$,c:7,10,12,t:4|"),
+                Arguments.of("OC1=CC=C(C[C@H](N[*])C([*])=O)C=C1 |$;;;;;;;;_R1;;_R2;;;$,c:12,t:1,3|"),
+                Arguments.of("CCCCCCCC\\C=C/CCCCCCCCCCCCOCC(CO[*])OCCCCCCCCCCCC\\C=C/CCCCCCCC |$;;;;;;;;;;;;;;;;;;;;;;;;;;;_R92;;;;;;;;;;;;;;;;;;;;;;;$|"),
+                Arguments.of("[H]C(=O)c1ccc(cc1)C(=O)NCCCCCCO[*] |$;;;;;;;;;;;;;;;;;;;_R92$|"),
+                Arguments.of("[*]OC[C@]12CO[C@H]([C@H]([*])O1)[C@H]2O[*] |$_R91;;;;;;;;_R90;;;;_R92$|"),
+                Arguments.of("OC[C@H]([*])O[C@H](CO[*])CO[*] |$;;;_R90;;;;;_R91;;;_R92$|"),
+                Arguments.of("FC(F)(F)C(OCCO[C@H]1[C@H]([*])O[C@H](CO[*])[C@H]1O[*])(C(F)(F)F)C(F)(F)F |$;;;;;;;;;;;_R90;;;;;_R91;;;_R92;;;;;;;;$|"),
+                Arguments.of("O[C@@H]1[C@@H](CO[*])O[C@@H]([*])[C@@H]1O[*] |$;;;;;_R91;;;_R90;;;_R92$|"),
+                Arguments.of("CO[C@H]1[C@H]([*])O[C@H](CO[*])[C@H]1O[*] |$;;;;_R90;;;;;_R91;;;_R92$|"),
+                Arguments.of("O[C@H]1[C@H]([*])O[C@H](CO[*])[C@H]1N[*] |$;;;_R90;;;;;_R91;;;_R92$|"),
+                Arguments.of("[*]OC[C@@]12CO[C@@H]([C@H]([*])O1)[C@@H]2O[*] |$_R91;;;;;;;;_R90;;;;_R92$|"),
+                Arguments.of("O[C@H]1[C@H]([*])O[C@H](CO[*])[C@H]1O[*] |$;;;_R90;;;;;_R91;;;_R92$|"),
+                Arguments.of("[*]OC[C@H]1O[C@@H]([*])C[C@@H]1O[*] |$_R91;;;;;;_R90;;;;_R92$|"),
+                Arguments.of("COCCO[C@H]1[C@H]([*])O[C@H](CO[*])[C@H]1O[*] |$;;;;;;;_R90;;;;;_R91;;;_R92$|"),
+                Arguments.of("[*]OC[C@@H]1CN([*])C[C@H]([*])O1 |$_R91;;;;;;_R92;;;_R90;$|"),
+                Arguments.of("Oc1ccc2c(Oc3cc(O)ccc3C22OC(=O)c3ccc(cc23)C(=O)NCCCCC(CO[*])O[*])c1 |$;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;_R91;;_R92;$|"),
+                Arguments.of("[*]N([*])CCCCCCO[*] |$_R91;;_R90;;;;;;;;_R92$|")
+        );
+    }
+
+    @ParameterizedTest
+    @MethodSource("fragmentSmiles")
+    void testSmilesParsing(String rawInput) throws IOException {
+        String inputSmiles = rawInput.split(" ")[0];
+        String lexicallyCleaned = inputSmiles.replace("*", "6He");
+        log.debug("lexicallyCleaned: {}", lexicallyCleaned);
+        // The cleaned chemical must yield the same InChIKey as the 6He-substituted SMILES text
+        Chemical parsedChemical = Chemical.parse(inputSmiles);
+        Chemical cleanedChemical = Chem.RemoveQueryFeaturesForPseudoInChI(parsedChemical);
+        String processedSmiles = "unknown";
+        try {
+            processedSmiles= cleanedChemical.toSmiles();
+        }catch (NullPointerException npe) {
+            log.error("Error creating output SMILES from input {}", inputSmiles, npe);
+        }
+        log.debug("processedSmiles: {}", processedSmiles);
+        String inChIKey = cleanedChemical.toInchi().getKey();
+
+        String expectedInChIKey =getInChiKey(lexicallyCleaned);
+        log.debug("Created InChIKey: {}", inChIKey);
+        Assertions.assertEquals(expectedInChIKey, inChIKey);
+        String molfile = cleanedChemical.toMol();
+        log.trace("molfile: {}", molfile);
+        Assertions.assertFalse(molfile.contains("*"));
+    }
+
+    @Test
+    void testSmilesParse2() throws IOException {
+        // Build a fragment vocabulary term and check that CVFragmentStructureValidator.getHash returns a value for it
+        FragmentVocabularyTerm fragmentVocabularyTerm = new FragmentVocabularyTerm();
+        fragmentVocabularyTerm.setFragmentStructure("CCCCCCCC\\C=C/CCCCCCCCCCCCOCC(CO[*])OCCCCCCCCCCCC\\C=C/CCCCCCCC |$;;;;;;;;;;;;;;;;;;;;;;;;;;;_R92;;;;;;;;;;;;;;;;;;;;;;;$|");
+        Optional<String> hash= CVFragmentStructureValidator.getHash(fragmentVocabularyTerm);
+        Assertions.assertTrue(hash.isPresent());
+        log.trace("hash: {}", hash.get());
+    }
+
+    private String getInChiKey(String smiles) throws IOException {
+        // Parse the SMILES exactly as given and return the key of its InChI result;
+        // the tests in this class use it to compute the expected InChIKey.
+        return Chemical.parse(smiles).toInchi().getKey();
+    }
 }
diff --git a/gsrs-module-substance-example/src/test/resources/application-test.conf b/gsrs-module-substance-example/src/test/resources/application-test.conf
@@ -44,7 +44,7 @@ logging.level.ix.core.EntityFetcher=OFF
 #logging.level.gsrs.module.substance.scrubbers=trace
 #logging.level.example.exports.scrubbers=trace
 
-logging.level.ix.core.chem=TRACE;
+logging.level.ix.core.chem=TRACE
 logging.level.example.chem=TRACE
 
 spring.jpa.database-platform=org.hibernate.dialect.H2Dialect

diff --git a/gsrs-module-substances-api/pom.xml b/gsrs-module-substances-api/pom.xml
@@ -32,12 +32,12 @@
         <dependency>
             <groupId>gov.nih.ncats</groupId>
             <artifactId>molwitch</artifactId>
-            <version>0.6.7</version>
+            <version>0.6.8</version>
         </dependency>
         <dependency>
             <groupId>gov.nih.ncats</groupId>
             <artifactId>molwitch-cdk</artifactId>
-            <version>1.0.18</version>
+            <version>1.0.19</version>
         </dependency>
         <dependency>
             <groupId>gov.nih.ncats</groupId>

diff --git a/gsrs-module-substances-core/pom.xml b/gsrs-module-substances-core/pom.xml
@@ -264,7 +264,7 @@
         <dependency>
             <groupId>gov.nih.ncats</groupId>
             <artifactId>molwitch</artifactId>
-            <version>0.6.7</version>
+            <version>0.6.8</version>
         </dependency>
 
         <dependency>

diff --git a/...tances-core/src/main/java/gsrs/module/substance/controllers/LegacyGinasAppController.java b/...tances-core/src/main/java/gsrs/module/substance/controllers/LegacyGinasAppController.java
@@ -34,6 +34,8 @@
 import javax.servlet.http.HttpServletRequest;
 import java.io.IOException;
 import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 /**
@@ -96,6 +98,16 @@ public ResponseEntity<Object> uploadPayload(
         return payloadController.handleFileUpload(file, queryParameters);
 
     }
+
+    // Id format accepted in export URLs; the hyphen is escaped so it stays a literal inside the character class.
+    public final Pattern ID_PATTERN = Pattern.compile("[a-f0-9\\-]+");
+    // True only when the WHOLE id matches ID_PATTERN; find() accepted any id containing a single valid character.
+    private boolean checkId(String id)  {
+        if (id == null) return false;
+        Matcher matcher = ID_PATTERN.matcher(id);
+        return matcher.matches();
+    }
+
     //GET         /export/$id<[a-f0-9\-]+>.$format<(mol|sdf|smi|smiles|fas)>
     // ix.ginas.controllers.GinasApp.structureExport(id: String, format: String, context: String ?= null)
     @GetMapping({"export/{id:[a-f0-9\\-]+}.{format}","/ginas/app/export/{id:[a-f0-9\\-]+}.{format}"})
@@ -106,6 +118,10 @@ public Object exportStructure(@PathVariable String id, @PathVariable String form
                               @RequestParam(value = "stereo", required = false, defaultValue = "") Boolean stereo,
                               HttpServletRequest httpRequest, RedirectAttributes attributes,
                                   @RequestParam Map<String, String> queryParameters){
+        if (!checkId(id)) {
+            // This is to satisfy Snyk security analysis, probably never gets here if annotation works.
+            return gsrsControllerConfiguration.handleBadRequest(400, "Badly formatted id in url placeholder", null);
+        }
         if("mol".equalsIgnoreCase(format) || "sdf".equalsIgnoreCase(format) ||
                 "smi".equalsIgnoreCase(format) ||  "smiles".equalsIgnoreCase(format) ) {
             //TODO: use cache where possible here

diff --git a/...nces-core/src/main/java/gsrs/module/substance/tasks/DatabaseIndexSyncTaskInitializer.java b/...nces-core/src/main/java/gsrs/module/substance/tasks/DatabaseIndexSyncTaskInitializer.java
@@ -6,7 +6,6 @@
 import java.util.stream.Stream;
 
 import com.fasterxml.jackson.annotation.JsonProperty;
-import com.fasterxml.jackson.core.JsonProcessingException;
 
 import gsrs.controller.AbstractLegacyTextSearchGsrsEntityController;
 import gsrs.controller.hateoas.GsrsEntityToControllerMapper;
@@ -42,11 +41,13 @@ public void run(SchedulerPlugin.JobStats stats, SchedulerPlugin.TaskListener l)
 			log.info("DatabaseIndexSyncTask: No allowed entities defined for the database and index sync scheduler.");
 			return;
 		}
-
+		l.message("Getting entities synchronized");		
 		for(String entity: syncEntities) {
-
-			String entityClassName = generateEntityClassName(entity);
-			log.info("DatabaseIndexSyncTask: Entity class: " + entityClassName);
+			//To improve: need a generalized way of dealing with this, there are several different versions of this			
+			//need to do this in a utility class
+
+			l.message("Start indexing " + entity);
+			String entityClassName = generateEntityClassName(entity);			
 
 			if(entityClassName.isEmpty()) {
 				log.error("Illegal entity class: " + entity + " in database and index sync scheduler.");
@@ -69,25 +70,38 @@ public void run(SchedulerPlugin.JobStats stats, SchedulerPlugin.TaskListener l)
 	        if(!controllerOpt.isPresent()) {
 	        	continue;
 	        }
-	      		
+
 			AbstractLegacyTextSearchGsrsEntityController searchController = (AbstractLegacyTextSearchGsrsEntityController) StaticContextAccessor.getBean(controllerOpt.get());
 			try {
-				searchController.syncIndexesWithDatabase();
-			} catch (JsonProcessingException e) {
-				log.error("Error in database and index sync scheduler: " + entityClassName);
+
+				log.info("Starting DatabaseIndexSync job for Entity class: " + entity);
+
+				AbstractLegacyTextSearchGsrsEntityController.ReindexJobStatus stat = searchController.syncIndexesWithDatabaseWithStatus();
+
+				while(!stat.isDone()) {					
+					l.message(entity + ": " + stat.getStatus());					
+					stat = searchController.getJobStatus(stat.getStatusID());					
+					Thread.sleep(2000);
+				}							
+				l.message("Indexing " + entity + " is done");		
+
+				log.info("Ending DatabaseIndexSync job for Entity class: " + entity);
+
+			} catch (Exception e) {
+				log.error("Error in database and index sync scheduler: " + entity);
 				e.printStackTrace();
 				continue;
 			}
 		}
 	}
-	
+
 	private String generateEntityClassName(String entity) {
 		return "ix.ginas.models.v1." + entity;		
 	}
 
 	@Override
 	public String getDescription() {
-		return "Reindex entities in backup tables that are not in indexes";
+		return "Database and index sync: reindex entities in backup tables that are not in indexes";
 	}
 
 }
diff --git a/gsrs-module-substances-core/src/main/java/ix/core/chem/Chem.java b/gsrs-module-substances-core/src/main/java/ix/core/chem/Chem.java
@@ -7,6 +7,7 @@
 import ix.core.util.LogUtil;
 import lombok.extern.slf4j.Slf4j;
 
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -31,10 +32,16 @@ public static void setFormula (Structure struc) {
 
     public static Chemical RemoveQueryFeaturesForPseudoInChI(Chemical c) {
         Chemical chemicalToUse = c;
-        if(c.hasQueryAtoms() || c.atoms().filter(at->"A".equals(at.getSymbol())).count()>0){
+        /*try {
+            log.trace("RemoveQueryFeaturesForPseudoInChI processing molfile c {}", c.toMol());
+        } catch (IOException e) {
+            log.error("Error generating mol from Chemical");
+        }*/
+        if(c.hasQueryAtoms() || c.atoms().filter(at->("A".equals(at.getSymbol()) || "*".equals(at.getSymbol()) || "R".equals(at.getSymbol()))).count()>0){
             chemicalToUse = c.copy();
             chemicalToUse.atoms()
-                    .filter(at->at.isQueryAtom() || "A".equals(at.getSymbol()))
+                    .filter(at-> at.getSymbol() == null || "A".equals(at.getSymbol()) || "*".equals(at.getSymbol())
+                            || "R".equals(at.getSymbol()))//isQueryAtom returns true
                     .forEach(a->{
                         a.setAtomicNumber(2);
                         //verify that this is setting a symbol as well
@@ -43,16 +50,12 @@ public static Chemical RemoveQueryFeaturesForPseudoInChI(Chemical c) {
                     });
         }
         Chemical processBonds = chemicalToUse.copy();
-        //temporary diagnostics
-        /*System.out.println("total bonds: " + processBonds.getBondCount());
-        processBonds.bonds().forEach(b->{
-            System.out.printf("bond: %s; atom 1: %s; atom 2: %s\n", b.getBondType(), b.getAtom1().getSymbol(),  b.getAtom2().getSymbol() );});*/
         try{
             Chemical finalChem= Chemical.parse(ChemCleaner.removeSGroupsAndLegacyAtomLists(processBonds.toMol()));
-            finalChem.bonds().filter(b->b.getBondType() == null || b.getBondType().equals(Bond.BondType.SINGLE_OR_DOUBLE) || b.getBondType().equals(b))
+            finalChem.bonds().filter(b->b.getBondType() == null || b.getBondType().equals(Bond.BondType.SINGLE_OR_DOUBLE) || b.isQueryBond())
                     .forEach(b->{
+                        log.trace("about to replace bond {}", b);
                         b.setBondType(Bond.BondType.SINGLE);
-
                     });
             return finalChem;
         }catch(Exception e){

diff --git a/gsrs-module-substances-core/src/main/java/ix/core/chem/InchiStructureHasher.java b/gsrs-module-substances-core/src/main/java/ix/core/chem/InchiStructureHasher.java
@@ -3,6 +3,7 @@
 import gov.nih.ncats.molwitch.Chemical;
 import gov.nih.ncats.molwitch.inchi.InChiResult;
 import ix.core.models.Structure;
+import ix.ginas.utils.ChemUtils;
 
 import java.io.IOException;
 import java.util.function.BiConsumer;
@@ -17,7 +18,9 @@ public void hash(Chemical chem, String mol, BiConsumer<String, String> keyValueC
         try{
             String key=null;
             if(chem.getAtomCount()>0) {
-                InChiResult result = chem.toInchi();
+                Chemical cleaned = chem.copy();
+                cleaned =Chem.RemoveQueryFeaturesForPseudoInChI(cleaned);
+                InChiResult result = cleaned.toInchi();
                 key = result.getKey();    
             }else {
                 key = "MOSFIJXAXDLOML-UHFFFAOYSA-N";

diff --git a/gsrs-module-substances-core/src/main/java/ix/core/chem/StructureProcessor.java b/gsrs-module-substances-core/src/main/java/ix/core/chem/StructureProcessor.java
@@ -353,7 +353,13 @@ public void accept(String key, String value){
 
 
         Chem.setFormula(struc);
-        struc.setMwt(mol.getMass());
+        try {
+            struc.setMwt(mol.getMass());
+        } catch (Exception ignore){
+            //todo: deal with getMass for query atoms
+            //exception swallowed based on discussion between T. Peryea and M. Miller 27 June 2024
+        }
+
 
         if(!query){
             try {