Skip to content

Commit

Permalink
[WIP] Extract PDF References (JabRef#10437)
Browse files Browse the repository at this point in the history
* add grobid reference processing

* add references processing action

* add grobid enabled check

* Add test document for reference parsing

* shorten PDFs

* Add test library

* Add CHANGELOG.md entry

---------

Co-authored-by: Oliver Kopp <[email protected]>
  • Loading branch information
aqurilla and koppor authored Mar 11, 2024
1 parent a7a69db commit 4c64706
Show file tree
Hide file tree
Showing 13 changed files with 571 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv
- We added ability to push entries to TeXworks. [#3197](https://github.com/JabRef/jabref/issues/3197)
- We added the ability to zoom in and out in the document viewer using <kbd>Ctrl</kbd> + <kbd>Scroll</kbd>. [#10964](https://github.com/JabRef/jabref/pull/10964)
- We added a Cleanup for removing non-existent files and grouped the related options [#10929](https://github.com/JabRef/jabref/issues/10929)
- We added the functionality to parse the bibliography of PDFs using the GROBID online service. [#10200](https://github.com/JabRef/jabref/issues/10200)

### Changed

Expand Down
1 change: 1 addition & 0 deletions src/main/java/org/jabref/gui/actions/StandardActions.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public enum StandardActions implements Action {
REBUILD_FULLTEXT_SEARCH_INDEX(Localization.lang("Rebuild fulltext search index"), IconTheme.JabRefIcons.FILE),
REDOWNLOAD_MISSING_FILES(Localization.lang("Redownload missing files"), IconTheme.JabRefIcons.DOWNLOAD),
OPEN_EXTERNAL_FILE(Localization.lang("Open file"), IconTheme.JabRefIcons.FILE, KeyBinding.OPEN_FILE),
EXTRACT_FILE_REFERENCES(Localization.lang("Extract references from file"), IconTheme.JabRefIcons.FILE_STAR),
OPEN_URL(Localization.lang("Open URL or DOI"), IconTheme.JabRefIcons.WWW, KeyBinding.OPEN_URL_OR_DOI),
SEARCH_SHORTSCIENCE(Localization.lang("Search ShortScience")),
MERGE_WITH_FETCHED_ENTRY(Localization.lang("Get bibliographic data from %0", "DOI/ISBN/...")),
Expand Down
1 change: 1 addition & 0 deletions src/main/java/org/jabref/gui/icon/IconTheme.java
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ public enum JabRefIcons implements JabRefIcon {
DELETE_ENTRY(MaterialDesignD.DELETE),
SEARCH(MaterialDesignM.MAGNIFY),
FILE_SEARCH(MaterialDesignF.FILE_FIND),
FILE_STAR(MaterialDesignF.FILE_STAR),
PDF_METADATA_READ(MaterialDesignF.FORMAT_ALIGN_TOP),
PDF_METADATA_WRITE(MaterialDesignF.FORMAT_ALIGN_BOTTOM),
ADVANCED_SEARCH(Color.CYAN, MaterialDesignM.MAGNIFY),
Expand Down
101 changes: 101 additions & 0 deletions src/main/java/org/jabref/gui/maintable/ExtractReferencesAction.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package org.jabref.gui.maintable;

import java.nio.file.Path;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.Callable;

import org.jabref.gui.DialogService;
import org.jabref.gui.StateManager;
import org.jabref.gui.actions.ActionHelper;
import org.jabref.gui.actions.SimpleCommand;
import org.jabref.gui.importer.ImportEntriesDialog;
import org.jabref.gui.util.BackgroundTask;
import org.jabref.gui.util.TaskExecutor;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.util.GrobidService;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.io.FileUtil;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.LinkedFile;
import org.jabref.preferences.PreferencesService;

/**
 * Command that sends the files linked to bibliography entries to the GROBID service and offers the
 * references found in them for import.
 * <p>
 * The action works either on the entries currently selected in the main table (four-argument
 * constructor) or on one explicitly given entry (six-argument constructor).
 */
public class ExtractReferencesAction extends SimpleCommand {
    /** Number of files above which the user has to confirm before processing starts. */
    private static final int FILES_LIMIT = 10;

    private final DialogService dialogService;
    private final StateManager stateManager;
    private final PreferencesService preferencesService;
    private final BibEntry entry;
    private final LinkedFile linkedFile;
    private final TaskExecutor taskExecutor;

    /**
     * Creates an action that operates on the entries selected in the state manager.
     * The action is only executable while entries are selected, the selection has linked files,
     * and GROBID is enabled in the preferences.
     */
    public ExtractReferencesAction(DialogService dialogService,
                                   StateManager stateManager,
                                   PreferencesService preferencesService,
                                   TaskExecutor taskExecutor) {
        this(dialogService, stateManager, preferencesService, null, null, taskExecutor);
    }

    /**
     * Creates an action for a specific entry.
     *
     * @param entry      the entry whose linked files are processed; {@code null} means "use the current selection"
     * @param linkedFile when non-null the action is always executable; when {@code null} executability
     *                   is bound to the selection state and the GROBID-enabled preference.
     *                   Note that this class only uses it for that decision: extraction always
     *                   processes every file linked to the entry.
     */
    public ExtractReferencesAction(DialogService dialogService,
                                   StateManager stateManager,
                                   PreferencesService preferencesService,
                                   BibEntry entry,
                                   LinkedFile linkedFile,
                                   TaskExecutor taskExecutor) {
        this.dialogService = dialogService;
        this.stateManager = stateManager;
        this.preferencesService = preferencesService;
        this.entry = entry;
        this.linkedFile = linkedFile;
        this.taskExecutor = taskExecutor;

        if (this.linkedFile == null) {
            this.executable.bind(
                    ActionHelper.needsEntriesSelected(stateManager)
                                .and(ActionHelper.hasLinkedFileForSelectedEntries(stateManager))
                                .and(this.preferencesService.getGrobidPreferences().grobidEnabledProperty())
            );
        } else {
            this.setExecutable(true);
        }
    }

    @Override
    public void execute() {
        extractReferences();
    }

    /**
     * Collects the linked files of the relevant entries, asks for confirmation if there are more
     * than {@link #FILES_LIMIT} of them, and opens the import dialog backed by a background task
     * that queries GROBID. Does nothing when no database is active.
     */
    private void extractReferences() {
        stateManager.getActiveDatabase().ifPresent(databaseContext -> {
            // An explicitly supplied entry takes precedence over the table selection.
            List<BibEntry> selectedEntries = (entry == null)
                    ? stateManager.getSelectedEntries()
                    : List.of(entry);

            List<Path> fileList = FileUtil.getListOfLinkedFiles(selectedEntries, databaseContext.getFileDirectories(preferencesService.getFilePreferences()));
            if (fileList.size() > FILES_LIMIT) {
                boolean continueProcessing = dialogService.showConfirmationDialogAndWait(Localization.lang("Processing a large number of files"),
                        Localization.lang("You are about to process %0 files. Continue?", fileList.size()),
                        Localization.lang("Continue"), Localization.lang("Cancel"));
                if (!continueProcessing) {
                    return;
                }
            }

            // The GROBID request is wrapped in a callable so it only runs as part of the background task.
            Callable<ParserResult> parserResultCallable = () -> new ParserResult(
                    new GrobidService(this.preferencesService.getGrobidPreferences()).processReferences(fileList, preferencesService.getImportFormatPreferences())
            );
            BackgroundTask<ParserResult> task = BackgroundTask.wrap(parserResultCallable)
                                                              .withInitialMessage(Localization.lang("Processing PDF(s)"));

            task.onFailure(dialogService::showErrorDialogAndWait);

            // Reuse the context handed to this lambda instead of unwrapping the Optional a second time.
            ImportEntriesDialog dialog = new ImportEntriesDialog(databaseContext, task);
            dialog.setTitle(Localization.lang("Extract References"));
            dialogService.showCustomDialogAndWait(dialog);
        });
    }
}
1 change: 1 addition & 0 deletions src/main/java/org/jabref/gui/maintable/RightClickMenu.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ public static ContextMenu create(BibEntryTableViewModel entry,
factory.createMenuItem(StandardActions.ATTACH_FILE_FROM_URL, new AttachFileFromURLAction(dialogService, stateManager, taskExecutor, preferencesService)),
factory.createMenuItem(StandardActions.OPEN_FOLDER, new OpenFolderAction(dialogService, stateManager, preferencesService, taskExecutor)),
factory.createMenuItem(StandardActions.OPEN_EXTERNAL_FILE, new OpenExternalFileAction(dialogService, stateManager, preferencesService, taskExecutor)),
factory.createMenuItem(StandardActions.EXTRACT_FILE_REFERENCES, new ExtractReferencesAction(dialogService, stateManager, preferencesService, taskExecutor)),

factory.createMenuItem(StandardActions.OPEN_URL, new OpenUrlAction(dialogService, stateManager, preferencesService)),
factory.createMenuItem(StandardActions.SEARCH_SHORTSCIENCE, new SearchShortScienceAction(dialogService, stateManager, preferencesService)),
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/org/jabref/logic/importer/util/GrobidService.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

Expand Down Expand Up @@ -84,6 +85,34 @@ public List<BibEntry> processPDF(Path filePath, ImportFormatPreferences importFo

String httpResponse = response.body();

return getBibEntries(importFormatPreferences, httpResponse);
}

/**
 * Extracts the references of several files and returns them as one combined list,
 * in the order of the given paths.
 *
 * @param pathList                the files to process, one request per file
 * @param importFormatPreferences preferences used when parsing the returned BibTeX
 * @return all entries extracted from all files
 * @throws IOException    propagated from processing a single file
 * @throws ParseException propagated from processing a single file
 */
public List<BibEntry> processReferences(List<Path> pathList, ImportFormatPreferences importFormatPreferences) throws IOException, ParseException {
    List<BibEntry> collected = new ArrayList<>();
    // A plain loop is used so that the first failing file aborts the whole run with its exception.
    for (Path pdf : pathList) {
        List<BibEntry> fromPdf = processReferences(pdf, importFormatPreferences);
        collected.addAll(fromPdf);
    }
    return collected;
}

/**
 * Uploads a single file to the GROBID {@code /api/processReferences} endpoint and converts the
 * BibTeX answer into entries.
 *
 * @param filePath                the file to upload
 * @param importFormatPreferences preferences used when parsing the returned BibTeX
 * @return the entries parsed from the service response
 * @throws IOException    if the file cannot be opened, the request fails, or the response is empty
 * @throws ParseException if the response cannot be parsed
 */
public List<BibEntry> processReferences(Path filePath, ImportFormatPreferences importFormatPreferences) throws IOException, ParseException {
    // jsoup does not close a stream handed to data(...); try-with-resources releases the file
    // handle once the request has finished, also when execute() throws.
    try (InputStream pdfStream = Files.newInputStream(filePath)) {
        Connection.Response response = Jsoup.connect(grobidPreferences.getGrobidURL() + "/api/processReferences")
                                            .header("Accept", MediaTypes.APPLICATION_BIBTEX)
                                            .data("input", filePath.toString(), pdfStream)
                                            .data("consolidateCitations", String.valueOf(ConsolidateCitations.WITH_METADATA))
                                            .method(Connection.Method.POST)
                                            .ignoreContentType(true)
                                            .timeout(20000)
                                            .execute();

        // execute() has read the body completely, so it is safe to close the stream after this point.
        String httpResponse = response.body();

        return getBibEntries(importFormatPreferences, httpResponse);
    }
}

private static List<BibEntry> getBibEntries(ImportFormatPreferences importFormatPreferences, String httpResponse) throws IOException, ParseException {
if (httpResponse == null || "@misc{-1,\n author = {}\n}\n".equals(httpResponse)) { // This filters empty BibTeX entries
throw new IOException("The GROBID server response does not contain anything.");
}
Expand Down
6 changes: 6 additions & 0 deletions src/main/resources/l10n/JabRef_en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,12 @@ Export\ preferences\ to\ file=Export preferences to file
Export\ to\ clipboard=Export to clipboard
Export\ to\ text\ file.=Export to text file.

Extract\ references\ from\ file=Extract references from file
Extract\ References=Extract References
Processing\ PDF(s)=Processing PDF(s)
Processing\ a\ large\ number\ of\ files=Processing a large number of files
You\ are\ about\ to\ process\ %0\ files.\ Continue?=You are about to process %0 files. Continue?

Exporting\ %0=Exporting %0
Could\ not\ export\ file\ '%0'\ (reason\:\ %1)=Could not export file '%0' (reason: %1)
Unknown\ export\ format\ %0=Unknown export format %0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

import org.jabref.logic.importer.ImportFormatPreferences;
Expand Down Expand Up @@ -100,4 +101,22 @@ public void processPdfTest() throws IOException, ParseException, URISyntaxExcept
// assertEquals(Optional.of("Paper Title"), be0.getField(StandardField.TITLE));
// assertEquals(Optional.of("2014-10-05"), be0.getField(StandardField.DATE));
}

/**
 * Sends the bundled test PDF to the reference extraction and checks that exactly the one
 * expected reference comes back.
 */
@Test
public void extractsReferencesFromPdf() throws IOException, ParseException, URISyntaxException {
    // requireNonNull turns a missing fixture into an immediate failure.
    Path pdf = Path.of(Objects.requireNonNull(PdfGrobidImporterTest.class.getResource("LNCS-minimal.pdf")).toURI());
    List<BibEntry> actual = grobidService.processReferences(pdf, importFormatPreferences);

    // The field values mirror what the service returns for this document.
    List<BibEntry> expected = List.of(
            new BibEntry(StandardEntryType.Article)
                    .withField(StandardField.AUTHOR, "Kopp, O")
                    .withField(StandardField.ADDRESS, "Berlin Heidelberg")
                    .withField(StandardField.DATE, "2013")
                    .withField(StandardField.JOURNAL, "All links were last followed on October")
                    .withField(StandardField.PAGES, "700--704")
                    .withField(StandardField.PUBLISHER, "Springer")
                    .withField(StandardField.TITLE, "Winery -A Modeling Tool for TOSCA-based Cloud Applications")
                    .withField(StandardField.VOLUME, "8274")
                    .withField(StandardField.YEAR, "2013"));

    assertEquals(expected, actual);
}
}
Loading

0 comments on commit 4c64706

Please sign in to comment.