Skip to content

Commit

Permalink
- Google NLP improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
alegauss committed Feb 27, 2024
1 parent 7bdfc0d commit df8bc3b
Show file tree
Hide file tree
Showing 9 changed files with 292 additions and 290 deletions.
Binary file removed turing-app/lib/gradle-js-plugin-2.14.2-SNAPSHOT.jar
Binary file not shown.
5 changes: 0 additions & 5 deletions turing-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -231,11 +231,6 @@
<artifactId>schema-org-client</artifactId>
<version>1.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,10 @@
import java.io.*;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
import java.util.Map.Entry;
import java.util.Scanner;
import java.util.UUID;
import java.util.regex.Pattern;

@Slf4j
@RestController
@RequestMapping("/api/nlp")
Expand Down Expand Up @@ -216,11 +214,9 @@ public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata m
TurNLPTextValidate textValidate = new TurNLPTextValidate();
contentFile.append(TurCommonsUtils.cleanTextContent(handler.toString()));
textValidate.setText(contentFile.toString());

return this.turNLPInstanceRepository.findById(id).map(turNLPInstance -> {
TurNLPResponse turNLPResponse = turNLPProcess.processTextByNLP(turNLPInstance, textValidate.getText());
return createRedactionScript(turNLPResponse);
}).orElse(new RedactionScript());
return this.turNLPInstanceRepository.findById(id).map(turNLPInstance ->
createRedactionScript(turNLPProcess.processTextByNLP(turNLPInstance, textValidate.getText())))
.orElse(new RedactionScript());

} catch (IOException | SAXException | TikaException e) {
log.error(e.getMessage(), e);
Expand All @@ -233,20 +229,20 @@ private RedactionScript createRedactionScript(TurNLPResponse turNLPResponse) {
List<RedactionCommand> redactionCommands = new ArrayList<>();
RedactionScript redactionScript = new RedactionScript();
redactionScript.setVersion("1");
if (turNLPResponse != null) {
turNLPResponse.getEntityMapWithProcessedValues().forEach((key, value) -> {
if (value != null) {
value.forEach(term -> {
RedactionCommand redactionCommand = new RedactionCommand();
SearchString searchString = new SearchString();
searchString.setMatchWholeWord(true);
searchString.setString(String.format("%s", term));
redactionCommand.setSearchString(searchString);
redactionCommands.add(redactionCommand);
});
}
});
}
Optional.ofNullable(turNLPResponse)
.map(TurNLPResponse::getEntityMapWithProcessedValues)
.ifPresent(entityMap -> {
entityMap.forEach((key, value) ->
Optional.ofNullable(value).ifPresent(v ->
v.forEach(term -> {
RedactionCommand redactionCommand = new RedactionCommand();
SearchString searchString = new SearchString();
searchString.setMatchWholeWord(true);
searchString.setString(String.format("%s", term));
redactionCommand.setSearchString(searchString);
redactionCommands.add(redactionCommand);
})));
});
redactionScript.setRedactionCommands(redactionCommands);
return redactionScript;
}
Expand Down Expand Up @@ -369,7 +365,7 @@ private List<String> getNLPTerms(TurNLPResponse turNLPResponse) {
}

public boolean isPDF(File file) {
try(Scanner input = new Scanner(new FileReader(file))) {
try (Scanner input = new Scanner(new FileReader(file))) {
while (input.hasNextLine()) {
if (input.nextLine().contains("%PDF-")) {
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,16 @@ public TurNLPResponse processTextByNLP(TurNLPInstance turNLPInstance, String tex
private List<TurNLPValidateEntity> getEntitiesFromNLPVendor(TurNLPInstance turNLPInstance) {
List<TurNLPValidateEntity> entities = new ArrayList<>();
turNLPVendorEntityRepository.findByTurNLPVendor(turNLPInstance.getTurNLPVendor())
.forEach(entity -> entities.add(new TurNLPValidateEntity(entity.getName())));
.forEach(entity -> {
entities.add(new TurNLPValidateEntity(entity.getName()));
});

return entities;
}

public TurNLPResponse processTextByNLP(TurNLPInstance turNLPInstance, String text,
List<TurNLPValidateEntity> turNLPValidateEntities) {
System.out.println(text);
Optional<TurNLPRequest> turNLPRequest = this.init(turNLPInstance, createDataWithTextAttrib(text),
turNLPValidateEntities);
return getNLPResponse(turNLPRequest);
Expand Down
15 changes: 2 additions & 13 deletions turing-app/src/main/java/com/viglet/turing/nlp/TurNLPRequest.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

import com.viglet.turing.persistence.model.nlp.TurNLPInstance;
import lombok.Getter;
import lombok.Setter;

/**
*
Expand All @@ -33,22 +34,10 @@
* @since 0.3.6
*
*/
@Setter
@Getter
public class TurNLPRequest {
    // Accessors for every field are generated by Lombok (@Getter/@Setter on the class),
    // so no hand-written setTurNLPInstance/setData/setEntities methods are declared here;
    // the generated setters have the same names and signatures.

    // NLP instance attached to this request.
    private TurNLPInstance turNLPInstance;

    // Payload of the request, keyed by name.
    // NOTE(review): presumably attribute name -> attribute value; confirm against the callers.
    private Map<String, Object> data;

    // Entity requests carried by this request.
    private List<TurNLPEntityRequest> entities;
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
package com.viglet.turing.onstartup.nlp;

import com.google.inject.Inject;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

Expand All @@ -41,40 +40,40 @@ public TurNLPEntityOnStartup(TurNLPEntityRepository turNLPEntityRepository) {
public void createDefaultRows() {

if (turNLPEntityRepository.findAll().isEmpty()) {
saveTurNLPEntity("PN", "People", "People Entity.","persons", 0, 1);
saveTurNLPEntity("GL", "Places", "Place Entity.","locations", 0, 1);
saveTurNLPEntity("FR", "Fraud", "Fraud Entity.","frauds", 0, 1);
saveTurNLPEntity("ON", "Organization", "Organization Entity.","organizations", 0, 1);
saveTurNLPEntity("DURATION", "Duration", "Duration Entity.","durations", 0, 1);
saveTurNLPEntity("ORDINAL", "Ordinal", "Ordinal Entity.","ordinals", 0, 1);
saveTurNLPEntity("MISC", "Misc", "Misc Entity.","miscs", 0, 1);
saveTurNLPEntity("DATE", "Date", "Date Entity.","dates", 0, 1);
saveTurNLPEntity("TIME", "Time", "Time Entity.","times", 0, 1);
saveTurNLPEntity("MONEY", "Money", "Money Entity.","moneys", 0, 1);
saveTurNLPEntity("PERCENTAGE", "Percentage", "Percentage Entity.","percentages", 0, 1);
saveTurNLPEntity("NORP", "NORP", "Nationalities or religious or political groups.","norps", 0, 1);
saveTurNLPEntity("FAC", "FAC", "Buildings, airports, highways, bridges, etc.","norps", 0, 1);
saveTurNLPEntity("GPE", "GPE", "Countries, cities, states.","gpe", 0, 1);
saveTurNLPEntity("LOC", "LOC", "Non-GPE locations, mountain ranges, bodies of water.","locs", 0, 1);
saveTurNLPEntity("PRODUCT", "Product", "Objects, vehicles, foods, etc. (Not services.)","products", 0, 1);
saveTurNLPEntity("EVENT", "Event", "Named hurricanes, battles, wars, sports events, etc.","dates", 0, 1);
saveTurNLPEntity("WORK_OF_ART", "Work of Art", "Titles of books, songs, etc.","worksOfArt", 0, 1);
saveTurNLPEntity("LAW", "Law", "Named documents made into laws.","laws", 0, 1);
saveTurNLPEntity("LANGUAGE", "Language", "Any named language.","languages", 0, 1);
saveTurNLPEntity("QUANTITY", "Quantity", "Measurements, as of weight or distance.","quantities", 0, 1);
saveTurNLPEntity("CARDINAL", "Cardinal", "Numerals that do not fall under another type.","cardinals", 0, 1);
saveTurNLPEntity("DNI", "DNI", "National Identity Document.","dnis", 0, 1);
saveTurNLPEntity("CIF", "CIF", "Certificate of Fiscal Identification.","cifs", 0, 1);
saveTurNLPEntity("NIE", "NIE", "Extranjero Identification Number.","nies", 0, 1);
saveTurNLPEntity("PASSAPORT", "Passaport", "Passport ID.","passports", 0, 1);
saveTurNLPEntity("EMAIL", "Email", "Emails.","emails", 0, 1);
saveTurNLPEntity("FIRST_NAME", "First Name", "First Name.","firstnames", 0, 1);
saveTurNLPEntity("LAST_NAME", "Last Name", "Last Name.","lastnames", 0, 1);
saveTurNLPEntity("PN", "People", "People Entity.","persons");
saveTurNLPEntity("GL", "Places", "Place Entity.","locations");
saveTurNLPEntity("FR", "Fraud", "Fraud Entity.","frauds");
saveTurNLPEntity("ON", "Organization", "Organization Entity.","organizations");
saveTurNLPEntity("DURATION", "Duration", "Duration Entity.","durations");
saveTurNLPEntity("ORDINAL", "Ordinal", "Ordinal Entity.","ordinals");
saveTurNLPEntity("MISC", "Misc", "Misc Entity.","miscs");
saveTurNLPEntity("DATE", "Date", "Date Entity.","dates");
saveTurNLPEntity("TIME", "Time", "Time Entity.","times");
saveTurNLPEntity("MONEY", "Money", "Money Entity.","moneys");
saveTurNLPEntity("PERCENTAGE", "Percentage", "Percentage Entity.","percentages");
saveTurNLPEntity("NORP", "NORP", "Nationalities or religious or political groups.","norps");
saveTurNLPEntity("FAC", "FAC", "Buildings, airports, highways, bridges, etc.","norps");
saveTurNLPEntity("GPE", "GPE", "Countries, cities, states.","gpe");
saveTurNLPEntity("LOC", "LOC", "Non-GPE locations, mountain ranges, bodies of water.","locs");
saveTurNLPEntity("PRODUCT", "Product", "Objects, vehicles, foods, etc. (Not services.)","products");
saveTurNLPEntity("EVENT", "Event", "Named hurricanes, battles, wars, sports events, etc.","dates");
saveTurNLPEntity("WORK_OF_ART", "Work of Art", "Titles of books, songs, etc.","worksOfArt");
saveTurNLPEntity("LAW", "Law", "Named documents made into laws.","laws");
saveTurNLPEntity("LANGUAGE", "Language", "Any named language.","languages");
saveTurNLPEntity("QUANTITY", "Quantity", "Measurements, as of weight or distance.","quantities");
saveTurNLPEntity("CARDINAL", "Cardinal", "Numerals that do not fall under another type.","cardinals");
saveTurNLPEntity("DNI", "DNI", "National Identity Document.","dnis");
saveTurNLPEntity("CIF", "CIF", "Certificate of Fiscal Identification.","cifs");
saveTurNLPEntity("NIE", "NIE", "Extranjero Identification Number.","nies");
saveTurNLPEntity("PASSPORT", "Passport", "Passport ID.","passports");
saveTurNLPEntity("EMAIL", "Email", "Emails.","emails");
saveTurNLPEntity("FIRST_NAME", "First Name", "First Name.","firstnames");
saveTurNLPEntity("LAST_NAME", "Last Name", "Last Name.","lastnames");
}
}

private void saveTurNLPEntity(String internalName, String name, String description, String collectionName, int local, int enabled) {
TurNLPEntity turNLPEntity = new TurNLPEntity(internalName,name,description, collectionName, local, enabled);
private void saveTurNLPEntity(String internalName, String name, String description, String collectionName) {
TurNLPEntity turNLPEntity = new TurNLPEntity(internalName,name,description, collectionName, 0, 1);
turNLPEntityRepository.save(turNLPEntity);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public void createDefaultRows() {
saveNLPFeature("Dependency Parsing");
saveNLPFeature("Sentiment Analysis");
saveNLPFeature("Mention Detection");
saveNLPFeature("Coreference");
saveNLPFeature("Conference");
saveNLPFeature("Open IE");
}
}
Expand Down
Loading

0 comments on commit df8bc3b

Please sign in to comment.