diff --git a/Dockerfile.delft b/Dockerfile.delft index cf5942e0eb..d33e4ffa80 100644 --- a/Dockerfile.delft +++ b/Dockerfile.delft @@ -142,7 +142,7 @@ RUN python3 preload_embeddings.py --registry ./resources-registry.json && \ RUN mkdir delft && \ cp ./resources-registry.json delft/ -ENV GROBID_SERVICE_OPTS "--add-opens java.base/java.lang=ALL-UNNAMED" +ENV GROBID_SERVICE_OPTS "--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED" CMD ["./grobid-service/bin/grobid-service"] diff --git a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java index b24ad47d07..1b017c8a1f 100755 --- a/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java +++ b/grobid-core/src/main/java/org/grobid/core/lexicon/Lexicon.java @@ -16,6 +16,7 @@ import java.util.Set; import java.util.StringTokenizer; import java.util.regex.*; +import java.util.stream.Collectors; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; @@ -36,6 +37,7 @@ import org.grobid.core.utilities.Utilities; import org.grobid.core.utilities.TextUtilities; import org.grobid.core.analyzers.GrobidAnalyzer; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,10 +45,9 @@ /** * Class for managing all the lexical resources. - * */ public class Lexicon { - private static final Logger LOGGER = LoggerFactory.getLogger(Lexicon.class); + private static final Logger LOGGER = LoggerFactory.getLogger(Lexicon.class); // private static volatile Boolean instanceController = false; private static volatile Lexicon instance; @@ -59,7 +60,7 @@ public class Lexicon { private Set countries = null; // retrieve basic naming information about a research infrastructure (key must be lower case!) - private Map > researchOrganizations = null; + private Map> researchOrganizations = null; // fast matchers for efficient and flexible pattern matching in layout token sequence or strings private FastMatcher abbrevJournalPattern = null; @@ -67,21 +68,21 @@ public class Lexicon { private FastMatcher publisherPattern = null; private FastMatcher journalPattern = null; private FastMatcher cityPattern = null; - private FastMatcher organisationPattern = null; + private FastMatcher organisationPattern = null; private FastMatcher researchInfrastructurePattern = null; - private FastMatcher locationPattern = null; + private FastMatcher locationPattern = null; private FastMatcher countryPattern = null; - private FastMatcher orgFormPattern = null; + private FastMatcher orgFormPattern = null; private FastMatcher collaborationPattern = null; private FastMatcher funderPattern = null; private FastMatcher personTitlePattern = null; - private FastMatcher personSuffixPattern = null; + private FastMatcher personSuffixPattern = null; public static Lexicon getInstance() { if (instance == null) { synchronized (Lexicon.class) { if (instance == null) { - getNewInstance(); + getNewInstance(); } } } @@ -91,11 +92,11 @@ public static Lexicon getInstance() { /** * Creates a new instance. 
*/ - private static synchronized void getNewInstance() { - LOGGER.debug("Get new instance of Lexicon"); - GrobidProperties.getInstance(); - instance = new Lexicon(); - } + private static synchronized void getNewInstance() { + LOGGER.debug("Get new instance of Lexicon"); + GrobidProperties.getInstance(); + instance = new Lexicon(); + } /** * Hidden constructor @@ -103,24 +104,24 @@ private static synchronized void getNewInstance() { private Lexicon() { initDictionary(); initNames(); - // the loading of the journal and conference names is lazy + // the loading of the journal and conference names is lazy addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"wordforms"+File.separator+"english.wf", Language.EN); + "lexicon" + File.separator + "wordforms" + File.separator + "english.wf", Language.EN); addDictionary(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"wordforms"+File.separator+"german.wf", Language.EN); + "lexicon" + File.separator + "wordforms" + File.separator + "german.wf", Language.EN); + addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + + "lexicon" + File.separator + "names" + File.separator + "names.family"); addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"names.family"); - addLastNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"lastname.5k"); + "lexicon" + File.separator + "names" + File.separator + "lastname.5k"); + addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + + "lexicon" + File.separator + "names" + File.separator + "names.female"); addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"names.female"); + "lexicon" + File.separator + "names" + File.separator + "names.male"); addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"names.male"); - addFirstNames(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"names"+File.separator+"firstname.5k"); + "lexicon" + File.separator + "names" + File.separator + "firstname.5k"); initCountryCodes(); addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"countries"+File.separator+"CountryCodes.xml"); + "lexicon" + File.separator + "countries" + File.separator + "CountryCodes.xml"); } /** @@ -139,7 +140,7 @@ public OrganizationRecord(String name, String fullName, String lang) { } private void initDictionary() { - LOGGER.info("Initiating dictionary"); + LOGGER.info("Initiating dictionary"); dictionary_en = new HashSet<>(); dictionary_de = new HashSet<>(); LOGGER.info("End of Initialization of dictionary"); @@ -149,11 +150,11 @@ public final void addDictionary(String path, String lang) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add entries to dictionary (language '" + lang + - "'), because file '" + file.getAbsolutePath() + "' does not exists."); + "'), because file '" + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add entries to dictionary (language '" + lang + - "'), because cannot read file '" + file.getAbsolutePath() + "'."); + "'), because cannot read file '" + file.getAbsolutePath() + "'."); } InputStream ist = null; InputStreamReader isr = null; @@ 
-202,14 +203,14 @@ public boolean isCountry(String tok) { } private void initNames() { - LOGGER.info("Initiating names"); + LOGGER.info("Initiating names"); firstNames = new HashSet(); lastNames = new HashSet(); LOGGER.info("End of initialization of names"); } private void initCountryCodes() { - LOGGER.info("Initiating country codes"); + LOGGER.info("Initiating country codes"); countryCodes = new HashMap(); countries = new HashSet(); countryPattern = new FastMatcher(); @@ -220,11 +221,11 @@ private void addCountryCodes(String path) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add country codes to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add country codes to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; //InputStreamReader isr = null; @@ -262,7 +263,7 @@ public void initCountryPatterns() { if (countries == null || countries.size() == 0) { // it should never be the case addCountryCodes(GrobidProperties.getGrobidHomePath() + File.separator + - "lexicon"+File.separator+"countries"+File.separator+"CountryCodes.xml"); + "lexicon" + File.separator + "countries" + File.separator + "CountryCodes.xml"); } for (String country : countries) { @@ -274,11 +275,11 @@ public final void addFirstNames(String path) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add first names to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add first names to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; BufferedReader dis = null; @@ -318,11 +319,11 @@ public final void addLastNames(String path) { File file = new File(path); if (!file.exists()) { throw new GrobidResourceException("Cannot add last names to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add last names to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; BufferedReader dis = null; @@ -360,6 +361,7 @@ public final void addLastNames(String path) { /** * Lexical look-up, default is English + * * @param s a string to test * @return true if in the dictionary */ @@ -415,13 +417,13 @@ public boolean inDictionary(String s, String lang) { public void initJournals() { try { abbrevJournalPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/abbrev_journals.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/abbrev_journals.txt")); journalPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/journals.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/journals.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException( - "Error when compiling lexicon matcher for abbreviated journal names.", e); + "Error when compiling lexicon matcher for abbreviated journal names.", e); } } @@ -429,7 +431,7 @@ public void initConferences() { // 
ArrayList conferences = new ArrayList(); try { conferencePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/proceedings.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/journals/proceedings.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for conference names.", e); } @@ -438,7 +440,7 @@ public void initConferences() { public void initPublishers() { try { publisherPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/publishers/publishers.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/publishers/publishers.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for conference names.", e); } @@ -447,7 +449,7 @@ public void initPublishers() { public void initCities() { try { cityPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/cities15000.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/cities15000.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for cities.", e); } @@ -458,56 +460,56 @@ public void initCollaborations() { //collaborationPattern = new FastMatcher(new // File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/collaborations.txt")); collaborationPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/inspire_collaborations.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/inspire_collaborations.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for collaborations.", e); } } - public void initOrganisations() { + public void initOrganisations() { try { organisationPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/WikiOrganizations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + - "/lexicon/organisations/government.government_agency")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + - "/lexicon/organisations/known_corporations.lst")); - organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + - "/lexicon/organisations/venture_capital.venture_funded_company")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/WikiOrganizations.lst")); + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + "/lexicon/organisations/government.government_agency")); + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + "/lexicon/organisations/known_corporations.lst")); + organisationPattern.loadTerms(new File(GrobidProperties.getGrobidHomePath() + + "/lexicon/organisations/venture_capital.venture_funded_company")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (IOException e) { throw new GrobidResourceException("Cannot add term to matcher, because the lexicon resource file " + - "does not exist or cannot be read.", e); + "does not exist or cannot be read.", e); } catch (Exception e) { - throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); - } + throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); + } } - public 
void initOrgForms() { + public void initOrgForms() { try { - orgFormPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); + orgFormPattern = new FastMatcher(new + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/orgClosings.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for organisations.", e); } catch (Exception e) { - throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); - } + throw new GrobidException("An exception occured while running Grobid Lexicon init.", e); + } } - public void initLocations() { + public void initLocations() { try { locationPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/location.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/places/location.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for locations.", e); } } - public void initPersonTitles() { + public void initPersonTitles() { try { personTitlePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/VincentNgPeopleTitles.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/VincentNgPeopleTitles.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for person titles.", e); } @@ -516,7 +518,7 @@ public void initPersonTitles() { public void initPersonSuffix() { try { personSuffixPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/suffix.txt")); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/names/suffix.txt")); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for person name suffix.", e); } @@ -525,8 +527,8 @@ public void initPersonSuffix() { public void initFunders() { try { funderPattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/funders.txt"), + GrobidAnalyzer.getInstance(), true); } catch (PatternSyntaxException e) { throw new GrobidResourceException("Error when compiling lexicon matcher for funders.", e); } catch (Exception e) { @@ -537,19 +539,19 @@ public void initFunders() { public void initResearchInfrastructures() { try { researchInfrastructurePattern = new FastMatcher(new - File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), - GrobidAnalyzer.getInstance(), true); + File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures.txt"), + GrobidAnalyzer.getInstance(), true); // store some name mapping researchOrganizations = new TreeMap<>(); File file = new File(GrobidProperties.getGrobidHomePath() + "/lexicon/organisations/research_infrastructures_map.txt"); if (!file.exists()) { throw new GrobidResourceException("Cannot add research infrastructure names to dictionary, because file '" + - file.getAbsolutePath() + "' does not exists."); + file.getAbsolutePath() + "' does not exists."); } if (!file.canRead()) { throw new GrobidResourceException("Cannot add research infrastructure to dictionary, because cannot read file '" + - file.getAbsolutePath() + "'."); + file.getAbsolutePath() + "'."); } InputStream ist = null; 
BufferedReader dis = null; @@ -651,7 +653,7 @@ public List getOrganizationNamingInfo(String name) { /** * Map the language codes used by the language identifier component to the normal * language name. - * + *

* Note: due to an older bug, kr is currently mapped to Korean too - this should * disappear at some point in the future after retraining of the models * @@ -847,7 +849,7 @@ public List tokenPositionsCityNames(List s) { /** Organisation names **/ - /** + /** * Soft look-up in organisation name gazetteer for a given string with token positions */ public List tokenPositionsOrganisationNames(String s) { @@ -913,7 +915,7 @@ public List charPositionsOrganisationNames(List s) return results; } - /** + /** * Soft look-up in organisation form name gazetteer for a given string with token positions */ public List tokenPositionsOrgForm(String s) { @@ -992,7 +994,7 @@ public List tokenPositionsLocationNames(List s) { /** * Soft look-up in location name gazetteer for a string, return a list of positions referring * to the character positions within the string. - * +

* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) * * @param s the input string @@ -1009,7 +1011,7 @@ public List charPositionsLocationNames(String s) { /** * Soft look-up in location name gazetteer for a list of LayoutToken, return a list of * positions referring to the character positions in the input sequence. - * + *

* For example "The car is in Milan" as Milan is a location, would return OffsetPosition(14,19) * * @param s the input list of LayoutToken @@ -1023,7 +1025,7 @@ public List charPositionsLocationNames(List s) { return results; } - /** + /** * Soft look-up in person title gazetteer for a given string with token positions */ public List tokenPositionsPersonTitle(String s) { @@ -1185,7 +1187,7 @@ public static List characterPositionsUrlPattern(List * This will produce better quality recognized URL, avoiding missing suffixes and problems * with break lines and spaces. **/ @@ -1226,8 +1228,8 @@ public static OffsetPosition getTokenPositions(int startPos, int endPos, List urlTokens = new ArrayList<>(); int tokenPos = 0; int tokenIndex = 0; - for(LayoutToken localToken : layoutTokens) { - if (startPos <= tokenPos && (tokenPos+localToken.getText().length() <= endPos) ) { + for (LayoutToken localToken : layoutTokens) { + if (startPos <= tokenPos && (tokenPos + localToken.getText().length() <= endPos)) { urlTokens.add(localToken); if (startTokenIndex == -1) startTokenIndex = tokenIndex; @@ -1249,14 +1251,14 @@ public static OffsetPosition getTokenPositions(int startPos, int endPos, List characterPositionsUrlPatternWithPdfAnnotations( - List layoutTokens, - List pdfAnnotations) { + List layoutTokens, + List pdfAnnotations) { List urlPositions = Lexicon.characterPositionsUrlPattern(layoutTokens); List resultPositions = new ArrayList<>(); // Do we need to extend the url position based on additional position of the corresponding // PDF annotation? - for(OffsetPosition urlPosition : urlPositions) { + for (OffsetPosition urlPosition : urlPositions) { int startPos = urlPosition.start; int endPos = urlPosition.end; @@ -1272,7 +1274,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation continue; } - List urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex+1)); + List urlTokens = new ArrayList<>(layoutTokens.subList(startTokenIndex, endTokensIndex + 1)); String urlString = LayoutTokensUtil.toText(urlTokens); @@ -1282,11 +1284,8 @@ public static List characterPositionsUrlPatternWithPdfAnnotation if (CollectionUtils.isNotEmpty(urlTokens)) { LayoutToken lastToken = urlTokens.get(urlTokens.size() - 1); if (pdfAnnotations != null) { - targetAnnotation = pdfAnnotations.stream() - .filter(pdfAnnotation -> - pdfAnnotation.getType() != null && pdfAnnotation.getType() == PDFAnnotation.Type.URI && pdfAnnotation.cover(lastToken)) - .findFirst() - .orElse(null); + targetAnnotation = matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens(pdfAnnotations, urlTokens); + correctedLastTokenIndex = urlTokens.size() - 1; // If we cannot match, maybe the regex got some characters too much, e.g. dots, parenthesis,etc.. 
@@ -1296,14 +1295,10 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String lastTokenText = lastToken.getText(); int index = urlTokens.size() - 1; // The error should be within a few characters, so we stop if the token length is greater than 1 - while(index > 0 && lastTokenText.length() == 1 && !Character.isLetterOrDigit(lastTokenText.charAt(0)) && targetAnnotation==null) { + while (index > 0 && lastTokenText.length() == 1 && !Character.isLetterOrDigit(lastTokenText.charAt(0)) && targetAnnotation == null) { index -= 1; LayoutToken finalLastToken1 = urlTokens.get(index); - targetAnnotation = pdfAnnotations.stream() - .filter(pdfAnnotation -> - pdfAnnotation.getType() != null && pdfAnnotation.getType() == PDFAnnotation.Type.URI && pdfAnnotation.cover(finalLastToken1)) - .findFirst() - .orElse(null); + targetAnnotation = matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens(pdfAnnotations, urlTokens); correctedLastTokenIndex = index; } @@ -1315,7 +1310,13 @@ public static List characterPositionsUrlPatternWithPdfAnnotation String destination = targetAnnotation.getDestination(); int destinationPos = 0; - if (destination.contains(urlString)) { + if (urlString.replaceAll("\\s", "").equals(destination)) { + // Nothing to do here, we ignore the correctedLastTokenIndex because the regex got everything we need + } else if ( + destination.contains(urlString) + || destination.contains(urlString.replaceAll("\\s", "")) + || destination.contains(StringUtils.stripEnd(urlString, "-")) + ) { // In this case the regex did not catch all the URL, so we need to extend it using the // destination URL from the annotation destinationPos = destination.indexOf(urlString) + urlString.length(); @@ -1327,7 +1328,7 @@ public static List characterPositionsUrlPatternWithPdfAnnotation if ("\n".equals(nextToken.getText()) || " ".equals(nextToken.getText()) || - nextToken.getText().length() == 0) { + nextToken.getText().isEmpty()) { endPos += nextToken.getText().length(); additionalSpaces += nextToken.getText().length(); additionalTokens += 1; @@ -1355,8 +1356,6 @@ public static List characterPositionsUrlPatternWithPdfAnnotation endPos -= additionalSpaces; } } - } else if (urlString.replaceAll("\\s", "").equals(destination)) { - // Nothing to do here, we ignore the correctedLastTokenIndex because the regex got everything we need } else if (urlString.contains(destination) || urlString.replaceAll("\\s", "").contains(destination)) { // In this case the regex has caught too much, usually this should be limited to a few characters, // but we cannot know it for sure.
Here we first find the difference between the destination and the @@ -1406,6 +1405,63 @@ public static List characterPositionsUrlPatternWithPdfAnnotation return resultPositions; } + @Nullable + private static PDFAnnotation matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens(List pdfAnnotations, List urlTokens) { + LayoutToken lastToken = urlTokens.get(urlTokens.size() - 1); + String urlString = LayoutTokensUtil.toText(urlTokens); + + List possibleTargetAnnotations = pdfAnnotations.stream() + .filter(pdfAnnotation -> + pdfAnnotation.getType() != null + && pdfAnnotation.getType() == PDFAnnotation.Type.URI + && pdfAnnotation.cover(lastToken) + ).collect(Collectors.toList()); + + PDFAnnotation targetAnnotation; + if (possibleTargetAnnotations.size() > 1) { + possibleTargetAnnotations = possibleTargetAnnotations.stream() + .filter(pdfAnnotation -> + pdfAnnotation.getDestination().contains(urlString) + ) + .collect(Collectors.toList()); + + if (possibleTargetAnnotations.size() > 1) { + // If the lastToken is any of ./:_ we should add the token before + int index = urlTokens.size() - 1; + if (urlTokens.size() > 1 && lastToken.getText().matches("[.:_\\-/]")) { + index -= 1; + } + + while (index > 0 && possibleTargetAnnotations.size() > 1) { + final String lastTokenText2 = LayoutTokensUtil.toText(urlTokens.subList(index - 1, urlTokens.size())); + + possibleTargetAnnotations = possibleTargetAnnotations.stream() + .filter(pdfAnnotation -> + pdfAnnotation.getDestination().contains(lastTokenText2) + ) + .collect(Collectors.toList()); + index--; + } + + targetAnnotation = possibleTargetAnnotations.stream() + .findFirst() + .orElse(null); + + } else { + targetAnnotation = possibleTargetAnnotations.stream() + .findFirst() + .orElse(null); + } + + } else { + targetAnnotation = possibleTargetAnnotations.stream() + .findFirst() + .orElse(null); + } + + return targetAnnotation; + } + /** * Identify in tokenized input the positions of an email address pattern with token positions diff --git a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java index 4436cbd979..8672a11801 100644 --- a/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java +++ b/grobid-core/src/test/java/org/grobid/core/lexicon/LexiconTest.java @@ -409,7 +409,7 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC PDFAnnotation annotation1 = new PDFAnnotation(); annotation1.setPageNumber(10); List boundingBoxes = new ArrayList<>(); - boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 378.093, 625.354, 167.51799999999997, 10.599999999999909)); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 378.093, 625.354, 167.51799999999997, 10.599999999999909)); annotation1.setBoundingBoxes(boundingBoxes); annotation1.setDestination("https://github.com/shijuanchen/shift_cult"); annotation1.setType(PDFAnnotation.Type.URI); @@ -417,7 +417,7 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC PDFAnnotation annotation2 = new PDFAnnotation(); annotation2.setPageNumber(10); List boundingBoxes2 = new ArrayList<>(); - boundingBoxes2.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26,10.60)); + boundingBoxes2.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26, 10.60)); annotation2.setBoundingBoxes(boundingBoxes2); annotation2.setDestination("https://sites.google.com/view/shijuanchen/research/shift_cult"); annotation2.setType(PDFAnnotation.Type.URI); @@ 
-432,6 +432,64 @@ public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnC assertThat(input.substring(url1.start, url1.end), is("https://sites.google. \ncom/view/shijuanchen/research/shift_cult")); } + @Test + public void testCharacterPositionsUrlPatternWithPDFAnnotations_DuplicatedMatchingPDFAnnotations_shouldReturnCorrectIntervalBasedOnText4() throws Exception { + final String input = "Google Earth Engine applications to visualize the \n" + + "datasets: https://github.com/shijuanchen/shift_cult \n" + + "Map products visualization: https://sites.google. \n" + + "com/view/shijuanchen/research/shift_cult \n"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(28); + lastTokenOfTheURL1.setPage(10); + lastTokenOfTheURL1.setX(504.75295121951217); + lastTokenOfTheURL1.setY(626.353); + lastTokenOfTheURL1.setWidth(40.858048780487806); + lastTokenOfTheURL1.setHeight(9.3999); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(44); + lastTokenOfTheURL2.setPage(10); + lastTokenOfTheURL2.setX(526.9964666666667); + lastTokenOfTheURL2.setY(638.853); + lastTokenOfTheURL2.setWidth(22.0712); + lastTokenOfTheURL2.setHeight(9.3999); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(10); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(10, 378.093, 625.354, 167.51799999999997, 10.599999999999909)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://github.com/shijuanchen/shift_cult"); + annotation1.setType(PDFAnnotation.Type.URI); + + PDFAnnotation annotation2 = new PDFAnnotation(); + annotation2.setPageNumber(10); + List boundingBoxes2 = new ArrayList<>(); + boundingBoxes2.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26, 10.60)); + annotation2.setBoundingBoxes(boundingBoxes2); + annotation2.setDestination("https://www.google.com"); + annotation2.setType(PDFAnnotation.Type.URI); + + PDFAnnotation annotation3 = new PDFAnnotation(); + annotation3.setPageNumber(10); + List boundingBoxes3 = new ArrayList<>(); + boundingBoxes3.add(BoundingBox.fromPointAndDimensions(10, 475.497, 637.854, 77.26, 10.60)); + annotation3.setBoundingBoxes(boundingBoxes3); + annotation3.setDestination("https://sites.google.com/view/shijuanchen/research/shift_cult"); + annotation3.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1, annotation2, annotation3); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(2)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(input.substring(url0.start, url0.end), is("https://github.com/shijuanchen/shift_cult")); + OffsetPosition url1 = offsetPositions.get(1); + assertThat(input.substring(url1.start, url1.end), is("https://sites.google. 
\ncom/view/shijuanchen/research/shift_cult")); + } + + @Test public void testCharacterPositionsUrlPatternWithPDFAnnotations_URL_shouldReturnCorrectIntervalBasedOnText5() throws Exception { final String input = ", accessible through the University of Hawaii Sea Level Center with station ID of UHSLC ID 57 \n" + @@ -494,4 +552,235 @@ public void testGetTokenPosition() throws Exception { } + @Test + public void testCharacterPositionsUrlPattern_URLRegexMatchesTooLittle_shouldReturnCorrectInterval_1() throws Exception { + final String input = "We appreciate assistance from The Research Support Center, Research Center for Human Disease Modeling, \n" + + "and Kyushu University Graduate School of Medical Sciences. We thank Dr. Mitsuru Watanabe and Ms. Eriko \n" + + "Matsuo from the Department of Neurology, Kyushu University, for the technical assistance in the flow cytometric \n" + + "analysis. We thank Ms. Sachiko Koyama and Hideko Noguchi from the Department of Neuropathology, Kyushu \n" + + "University, for excellent technical assistance in the histological analysis. We thank Mr. Tetsuo Kishi from the \n" + + "Department of Medicine, Kyushu University School of Medicine for the immunohistochemical analysis. We \n" + + "thank J. Ludovic Croxford, PhD, from Edanz (https:// jp. edanz. com/ ac) for editing a draft of this manuscript."; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //These have to overlap with the regex output to make sure that the annotation is selected + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(219); + lastTokenOfTheURL1.setPage(15); + lastTokenOfTheURL1.setX(322.49060000000003); + lastTokenOfTheURL1.setY(454.586); + lastTokenOfTheURL1.setWidth(16.338); + lastTokenOfTheURL1.setHeight(9.099); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(220); + lastTokenOfTheURL2.setPage(15); + lastTokenOfTheURL2.setX(338.8286); + lastTokenOfTheURL2.setY(454.586); + lastTokenOfTheURL2.setWidth(3.2676); + lastTokenOfTheURL2.setHeight(9.099); + + LayoutToken lastTokenOfTheURL3 = tokenisedInput.get(221); + lastTokenOfTheURL3.setPage(15); + lastTokenOfTheURL3.setX(342.0962); + lastTokenOfTheURL3.setY(454.586); + lastTokenOfTheURL3.setWidth(3.2676); + lastTokenOfTheURL3.setHeight(9.099); + + LayoutToken lastTokenOfTheURL4 = tokenisedInput.get(222); + lastTokenOfTheURL4.setPage(15); + lastTokenOfTheURL4.setX(345.3638); + lastTokenOfTheURL4.setY(454.586); + lastTokenOfTheURL4.setWidth(3.2676); + lastTokenOfTheURL4.setHeight(9.099); + + LayoutToken lastTokenOfTheURL5 = tokenisedInput.get(224); + lastTokenOfTheURL5.setPage(15); + lastTokenOfTheURL5.setX(348.667); + lastTokenOfTheURL5.setY(454.586); + lastTokenOfTheURL5.setWidth(5.868599999999999); + lastTokenOfTheURL5.setHeight(9.099); + + LayoutToken lastTokenOfTheURL6 = tokenisedInput.get(225); + lastTokenOfTheURL6.setPage(15); + lastTokenOfTheURL6.setX(354.5356); + lastTokenOfTheURL6.setY(454.586); + lastTokenOfTheURL6.setWidth(2.9342999999999995); + lastTokenOfTheURL6.setHeight(9.099); + + LayoutToken lastTokenOfTheURL7 = tokenisedInput.get(227); + lastTokenOfTheURL7.setPage(15); + lastTokenOfTheURL7.setX(357.514); + lastTokenOfTheURL7.setY(454.586); + lastTokenOfTheURL7.setWidth(19.5645); + lastTokenOfTheURL7.setHeight(9.099); + + LayoutToken lastTokenOfTheURL10 = tokenisedInput.get(231); + lastTokenOfTheURL10.setPage(15); + lastTokenOfTheURL10.setX(395.106375); + lastTokenOfTheURL10.setY(454.586); + lastTokenOfTheURL10.setWidth(4.690125); + lastTokenOfTheURL10.setHeight(9.099); + + 
LayoutToken lastTokenOfTheURL11 = tokenisedInput.get(233); + lastTokenOfTheURL11.setPage(15); + lastTokenOfTheURL11.setX(399.842); + lastTokenOfTheURL11.setY(454.586); + lastTokenOfTheURL11.setWidth(7.295399999999999); + lastTokenOfTheURL11.setHeight(9.099); + + LayoutToken lastTokenOfTheURL12 = tokenisedInput.get(234); + lastTokenOfTheURL12.setPage(15); + lastTokenOfTheURL12.setX(407.13739999999996); + lastTokenOfTheURL12.setY(454.586); + lastTokenOfTheURL12.setWidth(3.6476999999999995); + lastTokenOfTheURL12.setHeight(9.099); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(15); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(15, 322.37, 451.55, 85.305, 12.140999999999963)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://jp.edanz.com/ac"); + annotation1.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(1)); + OffsetPosition url0 = offsetPositions.get(0); + assertThat(input.substring(url0.start, url0.end), is("https:// jp. edanz. com/ ac")); + } + + @Test + public void testCharacterPositionsUrlPattern_URLRegexMatchesTooLittle_shouldReturnCorrectInterval_2() throws Exception { + /* + * This test only aims for the last link + */ + final String input = ", \n" + + "based on the sorted BAM files generated by using BWA-MEM (v.0.7.17; http:// \n" + + "biobwa.sourceforge.net/) and SAMtools (v1.546; http://www.htslib.org/). MetaBAT2 \n" + + "was applied to bin the assemblies with contig depth results under the default \n" + + "parameters (minimum contig length ≥ 1500 bp). 
CheckM v.1.0.3 (https://ecogenom \n" + + "ics.github.io/CheckM/) with the lineage_wf workflow was used to estimate the complete \n" + + "ness and contamination of MAGs "; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //These have to overlap with the regex output to make sure that the annotation is selected + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(132); + lastTokenOfTheURL1.setPage(5); + lastTokenOfTheURL1.setX(331.7820588235294); + lastTokenOfTheURL1.setY(467.682); + lastTokenOfTheURL1.setWidth(4.307294117647059); + lastTokenOfTheURL1.setHeight(10.818); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(133); + lastTokenOfTheURL2.setPage(5); + lastTokenOfTheURL2.setX(336.08935294117646); + lastTokenOfTheURL2.setY(467.682); + lastTokenOfTheURL2.setWidth(4.307294117647059); + lastTokenOfTheURL2.setHeight(10.818); + + LayoutToken lastTokenOfTheURL3 = tokenisedInput.get(134); + lastTokenOfTheURL3.setPage(5); + lastTokenOfTheURL3.setX(340.39664705882353); + lastTokenOfTheURL3.setY(467.682); + lastTokenOfTheURL3.setWidth(34.45835294117647); + lastTokenOfTheURL3.setHeight(10.818); + + LayoutToken lastTokenOfTheURL5 = tokenisedInput.get(137); + lastTokenOfTheURL5.setPage(5); + lastTokenOfTheURL5.setX(41.9999); + lastTokenOfTheURL5.setY(479.682); + lastTokenOfTheURL5.setWidth(11.487272727272726); + lastTokenOfTheURL5.setHeight(10.818); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(5); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 41.00, 468.50, 335.00, 23.00)); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 134.01, 454.50, 170.18, 24.00)); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 123.68, 481.50, 0.00, 9.00)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://ecogenomics.github.io/CheckM/"); + annotation1.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(3)); + OffsetPosition url2 = offsetPositions.get(2); + assertThat(input.substring(url2.start, url2.end), is("https://ecogenom \n" + + "ics.github.io/CheckM/")); + } + + @Test + public void testCharacterPositionsUrlPattern_URLContainsSpuriosBreklineHypen_shouldReturnCorrectInterval() throws Exception { + /* + * This test only aims for the last link + */ + final String input = "Details and code for using the IntOGen framework are available at \n" + + "https://intogen.readthedocs.io/en/latest/index.html. The specific \n" + + "code to perform this analysis is available in the Genomics England \n" + + "research environment (https://re-docs.genomicsengland.co.uk/ \n" + + "access/) under /re_gecip/shared_allGeCIPs/pancancer_drivers/code/. \n" + + "The link to becoming a member of the Genomics England research \n" + + "network and obtaining access can be found at https://www.genomic-\n" + + "sengland.co.uk/research/academic/join-gecip. The code to perform \n" + + "the canSAR chemogenomics analysis is available through Zenodo \n" + + "(https://doi.org/10.5281/zenodo.8329054) (ref. 
"; + + List tokenisedInput = GrobidAnalyzer.getInstance().tokenizeWithLayoutToken(input); + + //These have to overlap with the regex output to make sure that the annotation is selected + LayoutToken lastTokenOfTheURL0 = tokenisedInput.get(153); + lastTokenOfTheURL0.setPage(11); + lastTokenOfTheURL0.setX(523.39535); + lastTokenOfTheURL0.setY(436.559); + lastTokenOfTheURL0.setWidth(4.205850000000001); + lastTokenOfTheURL0.setHeight(8.217); + + LayoutToken lastTokenOfTheURL1 = tokenisedInput.get(154); + lastTokenOfTheURL1.setPage(11); + lastTokenOfTheURL1.setX(527.6012); + lastTokenOfTheURL1.setY(436.559); + lastTokenOfTheURL1.setWidth(29.44095); + lastTokenOfTheURL1.setHeight(8.217); + + LayoutToken lastTokenOfTheURL2 = tokenisedInput.get(155); + lastTokenOfTheURL2.setPage(11); + lastTokenOfTheURL2.setX(557.04215); + lastTokenOfTheURL2.setY(436.559); + lastTokenOfTheURL2.setWidth(8.217); + lastTokenOfTheURL2.setHeight(10.818); + + LayoutToken lastTokenOfTheURL3 = tokenisedInput.get(157); + lastTokenOfTheURL3.setPage(11); + lastTokenOfTheURL3.setX(306.141); + lastTokenOfTheURL3.setY(447.309); + lastTokenOfTheURL3.setWidth(31.902000000000005); + lastTokenOfTheURL3.setHeight(8.217); + + PDFAnnotation annotation1 = new PDFAnnotation(); + annotation1.setPageNumber(11); + List boundingBoxes = new ArrayList<>(); + boundingBoxes.add(BoundingBox.fromPointAndDimensions(11,477.14,434.60,84.12,10.18)); +// boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 134.01, 454.50, 170.18, 24.00)); +// boundingBoxes.add(BoundingBox.fromPointAndDimensions(5, 123.68, 481.50, 0.00, 9.00)); + annotation1.setBoundingBoxes(boundingBoxes); + annotation1.setDestination("https://www.genomicsengland.co.uk/research/academic/join-gecip"); + annotation1.setType(PDFAnnotation.Type.URI); + + List pdfAnnotations = List.of(annotation1); + + List offsetPositions = Lexicon.characterPositionsUrlPatternWithPdfAnnotations(tokenisedInput, pdfAnnotations); + + assertThat(offsetPositions, hasSize(4)); + OffsetPosition url2 = offsetPositions.get(2); + assertThat(input.substring(url2.start, url2.end), is("https://www.genomic-\n" + + "sengland.co.uk/research/academic/join-gecip")); + } + } diff --git a/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml b/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml index 59bdd0ce98..39251fe8a6 100644 --- a/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml +++ b/grobid-trainer/resources/dataset/fulltext/corpus/tei/s41598-020-58065-9.training.fulltext.tei.xml @@ -14,11 +14,11 @@

The thermodynamic parameters of the LaH 10 superconductor were calculated by means of Eliashberg equations on the imaginary axis 23 :

- π μ Δ = Ω − Ω − Ω + Δ Δ =− Z k T K [ ( ) ( )] , n n B m M M n m m m m m 2 2 + π μ Δ = Ω − Ω − Ω + Δ Δ =− Z k T K [ ( ) ( )] , () n n B m M M n m m m m m 2 2

and

- π = + Ω − Ω Ω + Δ . =− Z kT K Z 1 ( ) n B m M M n m m m m n m 2 2 + π = + Ω − Ω Ω + Δ . =− Z kT K Z 1 ( ) () n B m M M n m m m m n m 2 2

The symbols Δ = Δ Ω i ( ) n n and = Z Z i ( ) n n denote the order parameter and the wave function renormalization factor, respectively. The quantity Ω n represents the Matsubara frequency: π Ω = k T n ( 2 1) n B , where k B is the Boltzmann constant. The pairing kernel is defined by: λ Ω − Ω = Ω − Ω +Ω K( ) n m ( ) C n m C 2 2 2 , where λ denotes the elec-tron-phonon coupling constant. We determined the value of λ on the basis of experimental data 20,21 and the condition: Δ = = = [ ] 0 n T T 1 C . The fitting between the theory and the experimental results is presented in Fig. 1. We obtained λ a = 2.187 for p a = 150 GPa and λ b = 2.818 for p b = 190 GPa. The symbol Ω C represents the character-istic phonon frequency, its value being assumed as Ω C = 100 meV.
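The garbled formula tokens in this training file are deliberately left as extracted, since GROBID fulltext training data must mirror the PDF text stream (this hunk only inserts the empty "()" equation-label placeholders). For readability, a hedged LaTeX reconstruction of the two Eliashberg equations and the in-text definitions above, assuming the standard isotropic imaginary-axis form used in the LaH10 literature; the sharp Coulomb cutoff θ(Ω_c − |Ω_m|) is an assumption, as the token stream only shows μ:

```latex
% Hedged reconstruction; \Delta_n = \Delta(i\Omega_n), Z_n = Z(i\Omega_n)
\Delta_n Z_n = \pi k_B T \sum_{m=-M}^{M}
    \frac{K(\Omega_n-\Omega_m) - \mu^{\star}\,\theta(\Omega_c-|\Omega_m|)}
         {\sqrt{\Omega_m^{2}+\Delta_m^{2}}}\,\Delta_m ,
\qquad
Z_n = 1 + \frac{\pi k_B T}{\Omega_n} \sum_{m=-M}^{M}
    \frac{K(\Omega_n-\Omega_m)}{\sqrt{\Omega_m^{2}+\Delta_m^{2}}}\,\Omega_m ,
% with the pairing kernel and Matsubara frequencies as defined in the text:
K(\Omega_n-\Omega_m) = \frac{\lambda\,\Omega_C^{2}}{(\Omega_n-\Omega_m)^{2}+\Omega_C^{2}} ,
\qquad
\Omega_n = \pi k_B T\,(2n-1) .
```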

@@ -32,19 +32,19 @@
Figure 1. The dependence of the maximum value of the order parameter on the electron-phonon coupling constant. We consider two cases: = T 215 C a K (p a = 150 GPa) and = T 260 C b K (p b = 190 GPa).
- ρ π Δ = − Ω + Δ − Ω ×      Ω + Δ       = F k T Z Z (0) 2 ( ) , B n M n n n n S n N n n n 1 2 2 2 2 + ρ π Δ = − Ω + Δ − Ω ×      Ω + Δ       = F k T Z Z (0) 2 ( ) , () B n M n n n n S n N n n n 1 2 2 2 2

where ρ(0) denotes the value of electronic density of states at Fermi surface; Z n S and Z n N are the wave function normalization factors for the superconducting and the normal state, respectively. Note that ΔF is equal to zero exactly for T = T C . This fact results from the overt dependence of free energy on solutions of Eliashberg equations (Δ n and Z n ) that have been adjusted to the experimental value of critical temperature by appropriate selection of electron-phonon coupling constant (see Fig. 1). Thermodynamic critical field should be calculated from the formula:

- ρ π ρ = − Δ . H F (0) 8 [ / (0)] C + ρ π ρ = − Δ . H F (0) 8 [ / (0)] () C

The difference in the specific heat between the superconducting and the normal state (ΔC = C S − C N ) is given by:

- ρ ρ Δ = − Δ . C T k k T d F d k T ( ) (0) [ / (0)] ( ) B B B 2 2 + ρ ρ Δ = − Δ . C T k k T d F d k T ( ) (0) [ / (0)] ( ) () B B B 2 2

The most convenient way of estimation the specific heat for the normal state is using the expression:

- ρ γ = . C T k k T ( ) (0) N B B + ρ γ = . C T k k T ( ) (0) () N B B
Figure 2. The dependence of the order parameter on temperature. The insets present the influence of temperature on the value of effective electron mass to the band electron mass ratio. Blue or red disks represent numerical results. Black curves were obtained from the analytical formulae: Δ = Δ Γ T T T T ( ) ( ) 1 ( / ) C 0 and = + Γ m m Z T Z T T T ZT / [ ( ) ( )]( / ) ( ) e e C C 0 0 , where λ = + Z T ( ) 1 C , Γ a = 3.5 and Γ b = 3.4. The predictions of the BCS theory we marked with grey circles.
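The free-energy, critical-field and specific-heat formulas above are likewise raw extracted tokens. A hedged reconstruction, assuming the standard Bardeen-Stephen free-energy difference and the usual thermodynamic relations; γ denotes the Sommerfeld factor, whose value is not recoverable from the token stream:

```latex
% Free energy difference between the superconducting and normal state
\frac{\Delta F}{\rho(0)} = -2\pi k_B T \sum_{n=1}^{M}
    \left(\sqrt{\Omega_n^{2}+\Delta_n^{2}}-|\Omega_n|\right)
    \left(Z_n^{S}-Z_n^{N}\,\frac{|\Omega_n|}{\sqrt{\Omega_n^{2}+\Delta_n^{2}}}\right),
% Thermodynamic critical field
\frac{H_C}{\sqrt{\rho(0)}} = \sqrt{8\pi\left[-\Delta F/\rho(0)\right]} ,
% Specific heat jump (from \Delta C = -T\, d^{2}\Delta F/dT^{2}) and
% normal-state specific heat
\frac{\Delta C(T)}{k_B\,\rho(0)} = -k_B T\,\frac{d^{2}\left[\Delta F/\rho(0)\right]}{d(k_B T)^{2}} ,
\qquad
\frac{C^{N}(T)}{k_B\,\rho(0)} = \gamma\,k_B T .
```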
@@ -59,28 +59,28 @@

Nevertheless, a sensible qualitative analysis can be made with respect to the influence of the atomic mass of the X element on a value of the critical temperature (since the mass of the X element determines Ω max ). In this regard, let us refer to the theoretical results obtained within the Eliashberg formalism for H 2 S and H 3 S superconduc-tors 5,6 . They prove that contributions to the Eliashberg function (α Ω F( ) 2 ) coming from sulphur and from hydro-gen are separated due to a huge difference between atomic masses of these two elements. To be precise, the electron-phonon interaction derived from sulphur is crucial in the frequency range from 0 meV to Ω max S equal to about 70 meV, while the contribution derived from hydrogen (Ω = 220 max H meV) is significant above ~100 meV. It is noteworthy that we come upon a similar situation in the case of the LaH 10 compound 30 . Therefore the follow-ing factorization of the Eliashberg function for the LaXH compound can be assumed:

α λ θ λ θ λ θ Ω =         − Ω +          − Ω +          − Ω F( ) ( ) ( ) ( ) , - 2 L a max La 2 max La X max X 2 max X H max H 2 max H + () 2 L a max La 2 max La X max X 2 max X H max H 2 max H

where λ La , λ X , and λ H are the contributions to the electron-phonon coupling constant derived from both metals (La, X) and hydrogen, respectively. Similarly, the symbols Ω max La , Ω max X , and Ω max H represent the respective maxi-mum phonon frequencies. The value of the critical temperature can be assessed from the generalised formula of the BCS theory 7 :

- λ λ λμ = . − . + − + . k T f f 1 27 exp 1 14(1 ) (1 0 163 ) , B C 1 2 ln + λ λ λμ = . − . + − + . k T f f 1 27 exp 1 14(1 ) (1 0 163 ) , () B C 1 2 ln

while the symbols appearing in Eq. (8) are defined in Table 1.

Let us calculate explicitly the relevant quantities:

- λ λ λ λ = + + , + λ λ λ λ = + + , ()
La X H Quantity λ = α +∞ Ω Ω d 2 F 0 2 ( ) ( ) , Ω = λ α +∞ d exp l n( ) F ln 2 0 2 ( ) , α Ω = Ω Ω λ +∞ d F( ) 2 2 0 2 , = + λ Λ ( ) f 1 1 1 3 2 1 3 , = + λ λ           + Λ f 1 2 2 ln 1 2 2 2 2 , Λ 1 = 2.4 − 0.14μ ' , μ Λ = . + Ω Ω (0 1 9 )( / ) 2 2 ln . Table 1. The quantities: λ (electron-phonon coupling constant), Ω ln (logarithmic phonon frequency), Ω 2 (second moment of the normalized weight function), f 1 (strong-coupling correction function), and f 2 (shape correction function) μ.
λ λ λ λ λ λ λ λ λ λ λ λ Ω = + +   Ω    × + +   Ω    × + +   Ω    exp l n( ) 1 2 exp l n( ) 1 2 exp l n( ) 1 2 , - + () ln La La X H max La X La X H max X H La X H max H

and

- λ λ λ λ λ λ λ λ λ λ λ λ Ω = + + + + + + + + . ( ) 2 ( ) 2 ( ) 2 2 La La X H max La 2 X La X H max X 2 H La X H max H 2 + λ λ λ λ λ λ λ λ λ λ λ λ Ω = + + + + + + + + . ( ) 2 ( ) 2 ( ) 2 () 2 La La X H max La 2 X La X H max X 2 H La X H max H 2

We are going to consider the case Ω < Ω < ~40 meV 100 meV max La max X . It means that we are interested in such an X element, the contribution of which to the Eliashberg function fills the gap between contributions com-ing from lanthanum and hydrogen. It can be assumed that 0 < λ X < 1, while keeping in mind that λ La = 0.68 31 . Additionally, the previous calculations discussed in the work allow to write that λ La + λ H is equal to λ a = 2.187 for p a = 150 GPa or to λ b = 2.818 for p b = 190 GPa. The quantity  μ occurring in the Eq. (8) serves now as the fitting parameter. One should remember that the formula for the critical temperature given by the Eq. (8) was derived with the use of significant simplifying assumptions (the value of the cut-off frequency is neglected, as well as the retardation effects modeled by the Matsubara frequency). Therefore the value of the Coulomb pseudopo-tential determined from the full Eliashberg equations usually differs from the value of  μ calculated analytically. The experimental data for the LaH 10 superconductor can be reproduced using Eq. (8) and assuming that μ = . 0 170 a and μ = . 0 276 b .
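For the LaXH discussion above, the factorized Eliashberg function, the generalised-BCS critical-temperature formula (Eq. (8)) and the composite quantities λ, Ω_ln and Ω₂ can be reconstructed, hedged, from the token stream and the Table 1 definitions; j runs over {La, X, H}, and the constants 1.27, 1.14 and 0.163 are read off the garbled text:

```latex
% Factorized Eliashberg function (\theta is the Heaviside step function)
\alpha^{2}F(\Omega) = \sum_{j \in \{\mathrm{La},\,X,\,\mathrm{H}\}} \lambda_j
    \left(\frac{\Omega}{\Omega_{\max}^{j}}\right)^{2}
    \theta\!\left(\Omega_{\max}^{j}-\Omega\right),
% Generalised BCS formula for the critical temperature (Eq. (8))
k_B T_C = f_1 f_2\,\frac{\Omega_{\ln}}{1.27}
    \exp\!\left[\frac{-1.14\,(1+\lambda)}{\lambda-\mu^{\star}(1+0.163\,\lambda)}\right],
% Composite coupling constant and frequency moments
\lambda = \lambda_{\mathrm{La}} + \lambda_{X} + \lambda_{\mathrm{H}} ,
\qquad
\Omega_{\ln} = \exp\!\left[\sum_{j}\frac{\lambda_j}{\lambda}
    \left(\ln\Omega_{\max}^{j}-\tfrac{1}{2}\right)\right],
\qquad
\Omega_{2}^{2} = \sum_{j}\frac{\lambda_j}{\lambda}\,
    \frac{\left(\Omega_{\max}^{j}\right)^{2}}{2} .
```

These forms are mutually consistent: inserting the factorized α²F into the Table 1 integrals for λ, Ω_ln and Ω₂ reproduces exactly the three composite expressions.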

diff --git a/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml b/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml index 7da5707c04..deab9cc6f4 100644 --- a/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml +++ b/grobid-trainer/resources/dataset/header/corpus/tei/10.1371_journal.pone.0210163.training.header.tei.xml @@ -65,7 +65,7 @@ © 2019 Rake, Haeussler. This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. Data Availability Statement: -
Data on clinical trials conducted in India obtained from ClinicalTrials.gov is made available in the corresponding Supporting Information file. Other authors can also access this information through ClinicalTrials.gov. We obtained publication data from the Scopus database which is a proprietary database (www.scopus.com). Researchers interested in replicating our study can access data on trial-related publications following the search procedure described in the paper. Researchers do not need special privileges to access the Scopus database, however, a subscription may be required. The authors did not have special access privileges to the data.
+ Data on clinical trials conducted in India obtained from ClinicalTrials.gov is made available in the corresponding Supporting Information file. Other authors can also access this information through ClinicalTrials.gov. We obtained publication data from the Scopus database which is a proprietary database (www.scopus.com). Researchers interested in replicating our study can access data on trial-related publications following the search procedure described in the paper. Researchers do not need special privileges to access the Scopus database, however, a subscription may be required. The authors did not have special access privileges to the data. Funding: The authors received no specific funding for this work.
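To make the new disambiguation step in Lexicon.matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens easier to review, here is a minimal, self-contained sketch of the suffix-growing idea on plain strings. It is an illustration under simplifying assumptions, not the patch itself: String destinations stand in for the coordinate-filtered PDFAnnotation candidates, LayoutToken is reduced to String, and an empty narrowing result stops the loop rather than overwriting the candidate list.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

public class UrlAnnotationDisambiguationSketch {

    /**
     * Among candidate destination URLs that all cover the last token of a
     * recognized URL, grow the matched token suffix right-to-left until at
     * most one destination still contains it (cf. the while-loop added in
     * matchPdfAnnotationsBasedOnCoordinatesDestinationOrLastTokens).
     */
    static String disambiguate(List<String> candidateDestinations, List<String> urlTokens) {
        List<String> candidates = new ArrayList<>(candidateDestinations);

        int index = urlTokens.size() - 1;
        // As in the patch: if the last token is one of . : _ - /, start one token earlier.
        if (urlTokens.size() > 1 && urlTokens.get(index).matches("[.:_\\-/]")) {
            index--;
        }

        while (index > 0 && candidates.size() > 1) {
            // The suffix always extends to the end of the token list.
            String suffix = String.join("", urlTokens.subList(index - 1, urlTokens.size()));
            List<String> narrowed = candidates.stream()
                    .filter(destination -> destination.contains(suffix))
                    .collect(Collectors.toList());
            if (narrowed.isEmpty()) {
                break; // simplification: keep the last non-empty candidate set
            }
            candidates = narrowed;
            index--;
        }
        return candidates.isEmpty() ? null : candidates.get(0);
    }

    public static void main(String[] args) {
        // Both destinations end in "shift_cult", so the last token alone is
        // ambiguous; growing the suffix to "research/shift_cult" keeps only
        // the second destination (mirrors the duplicated-annotation test).
        List<String> destinations = List.of(
                "https://github.com/shijuanchen/shift_cult",
                "https://sites.google.com/view/shijuanchen/research/shift_cult");
        List<String> tokens = List.of(
                "com", "/", "view", "/", "shijuanchen", "/", "research", "/", "shift_cult");
        System.out.println(disambiguate(destinations, tokens));
        // -> https://sites.google.com/view/shijuanchen/research/shift_cult
    }
}
```

In the actual patch the candidates are first restricted by PDFAnnotation.cover(lastToken) and by destination.contains(urlString); the suffix growth only runs when those filters still leave more than one URI annotation.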