Reworking fetcher, parser for CiteSeer (JabRef#9882)

* First Commit - Created separate parser class for CiteSeer - Able to query CiteSeer to get search results, albeit without using JabRef logic classes * Implemented Unirest for fetch requests - Utilized Unirest Java library for requesting CiteSeer information. - Starting changes for unpacking inputstream to CiteSeer parser. - Cleaned up imports. * Reworking CiteSeerFetcher - Reworking methods to adhere to SearchBasedFetcher - Minor changes to parser * First Attempt at QueryTransformer for CiteSeer - Implemented first version of query transformer to create JSON payload (not tested yet) - Minor changes to accommodate the query transformer in CiteSeer file * Expanded on CiteSeerQueryTransformer - Creating JSON payload through QueryTransformer - Implemented necessary parameters for JSON payload for POST requests to CiteSeer - Minor changes to CiteSeer and CiteSeerParser - Starting out CiteSeerQueryTransformerTest class * Converted CiteSeerParser Gson to Kong Unirest - Refactored class to use Kong JSON objects rather than Gson for consistency * Adding some more tests for CiteSeer files - Added tests for CiteSeerQueryTransformer. 
- Reworking some tests for CiteSeer fetcher as the queryString formatting has changed - Minor changes to CiteSeerParser and CiteSeerQueryTransformer * More tests for CiteSeerQueryTransformer - Added some more tests to CiteSeerQueryTransformerTest - Changes to CiteSeerQueryTransformer as it currently does not support boolean operators, and modifying the type for payload parameters - Unsure of how to implement pageSize with previously mentioned JabRef global parameter * Updating parser and tests for CiteSeer - Reworked parsing author data for JSON object - Updating tests for CiteSeer Fetcher - Removed obsolete comments and code * Additional testing for CiteSeer functionality - CiteSeer sortBy:Year option is buggy on their website, inconsistent with entries without PDFs - Added tests for searching with CiteSeer fetcher - Added test for QueryTransformer where we can try different variations of year parameters * Reworking parseAuthors method with AuthorListParser - Modified tests to match expected behavior. * Minor changes and cleanup * Adding Cite Seer response ID as DOI field - Updated Cite Seer tests to reflect these changes * Implement FulltextFetcher for CiteSeer - Implemented findFullText method for Cite Seer fetcher. - Added some test cases for FulltextFetcher implementation. * Fixing Tests for Actions - Fixed code style check error. - Addressed issue with using ApacheCommonsLang3 helper functions by reworking parseAuthors method. - Made parseBibEntry method more modular. * Updating WebFetcher Tests - Updated WebFetcher tests to include CiteSeer as FullTextFetcher and SearchBasedFetcher. * Update ManageStudyDefinitionViewModelTest.java --------- Co-authored-by: Carl Christian Snethlage <[email protected]> Co-authored-by: Christoph <[email protected]>
koppor · Jul 12, 2023 · fd82bef · fd82bef
1 parent 2abcb95
commit fd82bef
Show file tree

Hide file tree

Showing 8 changed files with 478 additions and 76 deletions.
diff --git a/src/main/java/org/jabref/logic/importer/WebFetchers.java b/src/main/java/org/jabref/logic/importer/WebFetchers.java
@@ -14,6 +14,7 @@
 import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
 import org.jabref.logic.importer.fetcher.BiodiversityLibrary;
 import org.jabref.logic.importer.fetcher.BvbFetcher;
+import org.jabref.logic.importer.fetcher.CiteSeer;
 import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
 import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
 import org.jabref.logic.importer.fetcher.CrossRef;
@@ -110,7 +111,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
         set.add(new DBLPFetcher(importFormatPreferences));
         set.add(new SpringerFetcher(importerPreferences));
         set.add(new CrossRef());
-        // set.add(new CiteSeer());
+         set.add(new CiteSeer());
         set.add(new DOAJFetcher(importFormatPreferences));
         set.add(new IEEE(importFormatPreferences, importerPreferences));
         set.add(new CompositeSearchBasedFetcher(set, 30));
@@ -203,6 +204,7 @@ public static Set<FulltextFetcher> getFullTextFetchers(ImportFormatPreferences i
         // Meta search
         // fetchers.add(new JstorFetcher(importFormatPreferences));
         // fetchers.add(new GoogleScholar(importFormatPreferences));
+        fetchers.add(new CiteSeer());
         fetchers.add(new OpenAccessDoi());
         fetchers.add(new SemanticScholar());
         fetchers.add(new ResearchGate(importFormatPreferences));

diff --git a/src/main/java/org/jabref/logic/importer/fetcher/CiteSeer.java b/src/main/java/org/jabref/logic/importer/fetcher/CiteSeer.java
@@ -1,36 +1,36 @@
 package org.jabref.logic.importer.fetcher;
 
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.net.MalformedURLException;
-import java.net.URISyntaxException;
+import java.io.IOException;
 import java.net.URL;
-import java.util.ArrayList;
 import java.util.List;
+import java.util.Objects;
 import java.util.Optional;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-import org.jabref.logic.cleanup.FieldFormatterCleanup;
-import org.jabref.logic.cleanup.Formatter;
-import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter;
-import org.jabref.logic.formatter.casechanger.TitleCaseFormatter;
+
 import org.jabref.logic.help.HelpFile;
 import org.jabref.logic.importer.FetcherException;
-import org.jabref.logic.importer.Parser;
-import org.jabref.logic.importer.SearchBasedParserFetcher;
-import org.jabref.logic.importer.fetcher.transformers.DefaultQueryTransformer;
-import org.jabref.logic.importer.fileformat.CoinsParser;
-import org.jabref.logic.util.OS;
+import org.jabref.logic.importer.FulltextFetcher;
+import org.jabref.logic.importer.ParseException;
+import org.jabref.logic.importer.SearchBasedFetcher;
+import org.jabref.logic.importer.fetcher.transformers.CiteSeerQueryTransformer;
+import org.jabref.logic.importer.fileformat.CiteSeerParser;
 import org.jabref.model.entry.BibEntry;
-import org.jabref.model.entry.field.InternalField;
 import org.jabref.model.entry.field.StandardField;
 
-import org.apache.http.client.utils.URIBuilder;
+import kong.unirest.JsonNode;
+import kong.unirest.Unirest;
+import kong.unirest.json.JSONArray;
+import kong.unirest.json.JSONElement;
 import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;
 
-public class CiteSeer implements SearchBasedParserFetcher {
+public class CiteSeer implements SearchBasedFetcher, FulltextFetcher {
+
+    private static final String BASE_URL = "citeseerx.ist.psu.edu";
+
+    private static final String API_URL = "https://citeseerx.ist.psu.edu/api/search";
+
+    private static final String PDF_URL = "https://" + BASE_URL + "/document?repid=rep1&type=pdf&doi=%s";
+
+    private CiteSeerQueryTransformer transformer;
 
     public CiteSeer() {
     }
@@ -46,52 +46,59 @@ public Optional<HelpFile> getHelpPage() {
     }
 
     @Override
-    public URL getURLForQuery(QueryNode luceneQuery) throws URISyntaxException, MalformedURLException, FetcherException {
-        URIBuilder uriBuilder = new URIBuilder("https://citeseer.ist.psu.edu/search");
-        uriBuilder.addParameter("sort", "rlv"); // Sort by relevance
-        uriBuilder.addParameter("q", new DefaultQueryTransformer().transformLuceneQuery(luceneQuery).orElse("")); // Query
-        uriBuilder.addParameter("t", "doc"); // Type: documents
-        // uriBuilder.addParameter("start", "0"); // Start index (not supported at the moment)
-        return uriBuilder.build().toURL();
-    }
+    public List<BibEntry> performSearch(QueryNode luceneQuery) throws FetcherException {
+        // ADR-0014
+        try {
+            JSONElement payload = getPayloadJSON(luceneQuery);
+            JsonNode requestResponse = Unirest.post(API_URL)
+                                              .header("authority", BASE_URL)
+                                              .header("accept", "application/json, text/plain, */*")
+                                              .header("content-type", "application/json;charset=UTF-8")
+                                              .header("origin", "https://" + BASE_URL)
+                                              .body(payload)
+                                              .asJson().getBody();
 
-    @Override
-    public Parser getParser() {
-        // MathSciNet returns COinS result embedded in HTML
-        // So we extract the data string from the <span class="Z3988" title="<data>"></span> tags and pass the content to the COinS parser
-        return inputStream -> {
-            String response = new BufferedReader(new InputStreamReader(inputStream)).lines().collect(Collectors.joining(OS.NEWLINE));
-            List<BibEntry> entries = new ArrayList<>();
-            CoinsParser parser = new CoinsParser();
-            Pattern pattern = Pattern.compile("<span class=\"Z3988\" title=\"(.*)\"></span>");
-            Matcher matcher = pattern.matcher(response);
-            while (matcher.find()) {
-                String encodedDataString = matcher.group(1);
-                entries.addAll(parser.parseEntries(encodedDataString));
+            Optional<JSONArray> jsonResponse = Optional.of(requestResponse)
+                                                    .map(JsonNode::getObject)
+                                                    .filter(Objects::nonNull)
+                                                    .map(response -> response.optJSONArray("response"))
+                                                    .filter(Objects::nonNull);
+
+            if (!jsonResponse.isPresent()) {
+                return List.of();
             }
-            return entries;
-        };
+
+            CiteSeerParser parser = new CiteSeerParser();
+            List<BibEntry> fetchedEntries = parser.parseCiteSeerResponse(jsonResponse.orElse(new JSONArray()));
+            return fetchedEntries;
+        } catch (ParseException ex) {
+            throw new FetcherException("An internal parser error occurred while parsing CiteSeer entries, ", ex);
+        }
+    }
+
+    /**
+     * Builds the JSON body that is POSTed to the CiteSeer search API for the given query.
+     *
+     * @param luceneQuery the parsed search query
+     * @return the JSON payload accumulated by a fresh {@link CiteSeerQueryTransformer}
+     */
+    private JSONElement getPayloadJSON(QueryNode luceneQuery) {
+        transformer = new CiteSeerQueryTransformer();
+        // Called only for its side effect: the transformer's field handlers write the search
+        // parameters into its JSON payload. The returned query string is not used by the API call.
+        transformer.transformLuceneQuery(luceneQuery);
+        return transformer.getJSONPayload();
     }
 
     @Override
-    public void doPostCleanup(BibEntry entry) {
-        // CiteSeer escapes some characters in a way that is not recognized by the normal html to unicode formatter
-        // We, of course, also want to convert these special characters
-        Formatter extendedHtmlFormatter = new HtmlToUnicodeFormatter() {
-            @Override
-            public String format(String fieldText) {
-                String formatted = super.format(fieldText);
-                formatted = formatted.replaceAll("%3A", ":");
-                formatted = formatted.replaceAll("%3Cem%3", "");
-                formatted = formatted.replaceAll("%3C%2Fem%3E", "");
-                formatted = formatted.replaceAll("%2C\\+", " ");
-                formatted = formatted.replaceAll("\\+", " ");
-                return formatted;
-            }
-        };
-        new FieldFormatterCleanup(InternalField.INTERNAL_ALL_FIELD, extendedHtmlFormatter).cleanup(entry);
+    public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
+        Objects.requireNonNull(entry);
+
+        // does not use a valid DOI, but Cite Seer's id / hash available for each entry
+        Optional<String> id = entry.getField(StandardField.DOI);
+        if (id.isPresent()) {
+            String source = String.format(PDF_URL, id.get());
+            return Optional.of(new URL(source));
+        }
+
+        // if using id fails, we can try the source URL
+        Optional<String> urlString = entry.getField(StandardField.URL);
+        if (urlString.isPresent()) {
+            return Optional.of(new URL(urlString.get()));
+        }
 
-        // Many titles in the CiteSeer database have all-capital titles, for convenience we convert them to title case
-        new FieldFormatterCleanup(StandardField.TITLE, new TitleCaseFormatter()).cleanup(entry);
+        return Optional.empty();
     }
 }
diff --git a/src/main/java/org/jabref/logic/importer/fetcher/transformers/CiteSeerQueryTransformer.java b/src/main/java/org/jabref/logic/importer/fetcher/transformers/CiteSeerQueryTransformer.java
@@ -0,0 +1,120 @@
+package org.jabref.logic.importer.fetcher.transformers;
+
+import java.util.Calendar;
+import java.util.Optional;
+
+import org.jabref.model.strings.StringUtil;
+
+import kong.unirest.json.JSONArray;
+import kong.unirest.json.JSONObject;
+
+/**
+ * Query transformer for CiteSeer that collects the search parameters in a JSON payload
+ * (see {@link #getJSONPayload()}) while the query is being transformed.
+ * <p>
+ * Each field handler writes its value into the payload as a side effect and additionally
+ * returns a string representation of the handled term.
+ */
+public class CiteSeerQueryTransformer extends AbstractQueryTransformer {
+
+    private final JSONObject payload = new JSONObject();
+
+    /**
+     * Pre-populates the payload with default values: page 1, page size 20,
+     * {@code must_have_pdf} set to {@code "false"} and sorting by relevance.
+     */
+    public CiteSeerQueryTransformer() {
+        handlePage("1");
+        handlePageSize("20");
+        // written directly to the field: constructors should not call the overridable public getter
+        payload.put("must_have_pdf", "false");
+        handleSortBy("relevance");
+    }
+
+    @Override
+    protected String getLogicalAndOperator() {
+        return " ";
+    }
+
+    @Override
+    protected String getLogicalOrOperator() {
+        return " ";
+    }
+
+    @Override
+    protected String getLogicalNotOperator() {
+        return "";
+    }
+
+    /**
+     * Appends the author to the {@code author} array of the payload, creating the array on first use.
+     *
+     * @param author the author term of the query
+     * @return the author as returned by {@link StringUtil#quoteStringIfSpaceIsContained(String)}
+     */
+    @Override
+    protected String handleAuthor(String author) {
+        if (!payload.has("author")) {
+            payload.put("author", new JSONArray());
+        }
+        payload.getJSONArray("author").put(author);
+        return StringUtil.quoteStringIfSpaceIsContained(author);
+    }
+
+    /**
+     * Stores the title as the {@code queryString} of the payload; a later title replaces an earlier one.
+     *
+     * @param title the title term of the query
+     * @return the title as returned by {@link StringUtil#quoteStringIfSpaceIsContained(String)}
+     */
+    @Override
+    protected String handleTitle(String title) {
+        payload.put("queryString", title);
+        return StringUtil.quoteStringIfSpaceIsContained(title);
+    }
+
+    /**
+     * Stores the journal title under the {@code journal} key of the payload.
+     *
+     * @param journalTitle the journal term of the query
+     * @return the journal title as returned by {@link StringUtil#quoteStringIfSpaceIsContained(String)}
+     */
+    @Override
+    protected String handleJournal(String journalTitle) {
+        payload.put("journal", journalTitle);
+        return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
+    }
+
+    /**
+     * Restricts the search to a single year by setting both {@code yearStart} and {@code yearEnd}.
+     *
+     * @param year the year term of the query; must be parseable as an integer
+     * @return the year as returned by {@link StringUtil#quoteStringIfSpaceIsContained(String)}
+     * @throws NumberFormatException if {@code year} is not a valid integer
+     */
+    @Override
+    protected String handleYear(String year) {
+        int parsedYear = Integer.parseInt(year);
+        payload.put("yearStart", parsedYear);
+        payload.put("yearEnd", parsedYear);
+        return StringUtil.quoteStringIfSpaceIsContained(year);
+    }
+
+    /**
+     * Restricts the search to a range of years.
+     * <p>
+     * The range is parsed by the inherited {@code parseYearRange}. If the resulting end year is
+     * {@link Integer#MAX_VALUE} (treated as an invalid range), only {@code yearEnd} is set, to the
+     * current calendar year, and an empty string is returned.
+     *
+     * @param yearRange the year-range term of the query
+     * @return {@code yearRange} unchanged, or an empty string for an invalid range
+     */
+    @Override
+    protected String handleYearRange(String yearRange) {
+        parseYearRange(yearRange);
+        if (endYear == Integer.MAX_VALUE) { // invalid year range
+            Calendar calendar = Calendar.getInstance();
+            payload.put("yearEnd", calendar.get(Calendar.YEAR));
+            return "";
+        }
+        payload.put("yearStart", startYear);
+        payload.put("yearEnd", endYear);
+        return yearRange;
+    }
+
+    /**
+     * Handles the CiteSeer-specific fields {@code page}, {@code pageSize}, {@code must_have_pdf}
+     * and {@code sortBy} by writing them into the payload. All other fields are delegated to the
+     * superclass.
+     *
+     * @param fieldAsString the name of the field
+     * @param term          the value given for the field
+     * @return the handled term, or the result of the superclass for unknown fields
+     */
+    @Override
+    protected Optional<String> handleOtherField(String fieldAsString, String term) {
+        return switch (fieldAsString) {
+            case "page" -> handlePage(term);
+            case "pageSize" -> handlePageSize(term);
+            case "must_have_pdf" -> handleMustHavePdf(term);
+            case "sortBy" -> handleSortBy(term);
+            default -> super.handleOtherField(fieldAsString, term);
+        };
+    }
+
+    // NOTE(review): there may be a JabRef-wide constant for page/page size that should be used instead
+    private Optional<String> handlePage(String page) {
+        payload.put("page", StringUtil.intValueOf(page));
+        return Optional.of(page);
+    }
+
+    private Optional<String> handlePageSize(String pageSize) {
+        payload.put("pageSize", StringUtil.intValueOf(pageSize));
+        return Optional.of(pageSize);
+    }
+
+    // the value is stored as given, i.e. as a string and not as a JSON boolean
+    private Optional<String> handleMustHavePdf(String mustHavePdf) {
+        payload.put("must_have_pdf", mustHavePdf);
+        return Optional.of(mustHavePdf);
+    }
+
+    private Optional<String> handleSortBy(String sortBy) {
+        payload.put("sortBy", sortBy);
+        return Optional.of(sortBy);
+    }
+
+    /**
+     * @return the mutable JSON payload collected so far (the live object, not a copy)
+     */
+    public JSONObject getJSONPayload() {
+        return this.payload;
+    }
+}
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/CiteSeerParser.java b/src/main/java/org/jabref/logic/importer/fileformat/CiteSeerParser.java
@@ -0,0 +1,65 @@
+package org.jabref.logic.importer.fileformat;
+
+import java.net.CookieHandler;
+import java.net.CookieManager;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+import org.jabref.logic.importer.AuthorListParser;
+import org.jabref.logic.importer.ParseException;
+import org.jabref.model.entry.BibEntry;
+import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.strings.StringUtil;
+
+import kong.unirest.json.JSONArray;
+import kong.unirest.json.JSONObject;
+
+/**
+ * Converts the JSON search results returned by CiteSeer into {@link BibEntry} objects.
+ */
+public class CiteSeerParser {
+
+    private static final String AUTHOR_SEPARATOR = " and ";
+
+    /**
+     * Parses every element of the given result array into a {@link BibEntry}.
+     *
+     * @param jsonResponse array of JSON objects, one per search result
+     * @return the parsed entries in the order of the array; empty if the array is empty
+     * @throws ParseException declared by the entry parser
+     */
+    public List<BibEntry> parseCiteSeerResponse(JSONArray jsonResponse) throws ParseException {
+        List<BibEntry> response = new ArrayList<>(jsonResponse.length());
+        // NOTE(review): this replaces the JVM-wide default cookie handler on every call;
+        // kept unchanged for compatibility - confirm whether the parser really needs it
+        CookieHandler.setDefault(new CookieManager());
+        for (int i = 0; i < jsonResponse.length(); ++i) {
+            response.add(parseBibEntry(jsonResponse.getJSONObject(i)));
+        }
+        return response;
+    }
+
+    /**
+     * Maps a single search result onto a {@link BibEntry}.
+     * <p>
+     * WARNING: The DOI for each parsed BibEntry is not a valid DOI.
+     * Cite Seer associates an id with each response as a unique hash.
+     * However, it is not a valid variation of a DOI value.
+     *
+     * @param jsonObj Search response as a JSON Object
+     * @return BibEntry filled from the keys id, title, venue, year, publisher, abstract, authors, journal and source
+     * @throws ParseException declared for callers
+     */
+    private BibEntry parseBibEntry(JSONObject jsonObj) throws ParseException {
+        BibEntry bibEntry = new BibEntry();
+        bibEntry.setField(StandardField.DOI, jsonObj.optString("id"));
+        bibEntry.setField(StandardField.TITLE, jsonObj.optString("title"));
+        bibEntry.setField(StandardField.VENUE, jsonObj.optString("venue"));
+        bibEntry.setField(StandardField.YEAR, jsonObj.optString("year"));
+        bibEntry.setField(StandardField.PUBLISHER, jsonObj.optString("publisher"));
+        bibEntry.setField(StandardField.ABSTRACT, jsonObj.optString("abstract"));
+        bibEntry.setField(StandardField.AUTHOR, parseAuthors(Optional.ofNullable(jsonObj.optJSONArray("authors"))));
+        bibEntry.setField(StandardField.JOURNAL, jsonObj.optString("journal"));
+        bibEntry.setField(StandardField.URL, jsonObj.optString("source"));
+        return bibEntry;
+    }
+
+    /**
+     * Joins the author names with " and " and normalizes them through {@link AuthorListParser}
+     * into the "Last, First and Last, First" form.
+     *
+     * @param authorsOpt the {@code authors} array of a search result, if present
+     * @return the formatted author list, or an empty string if the array is missing or empty
+     */
+    private String parseAuthors(Optional<JSONArray> authorsOpt) {
+        // An empty array previously led to getString(-1) and thus to an exception
+        if (authorsOpt.isEmpty() || authorsOpt.get().length() == 0) {
+            return "";
+        }
+        JSONArray authorsArray = authorsOpt.get();
+        List<String> authorNames = new ArrayList<>(authorsArray.length());
+        for (int i = 0; i < authorsArray.length(); i++) {
+            // every name, including the last one, is cleaned with shaveString
+            authorNames.add(StringUtil.shaveString(authorsArray.getString(i)));
+        }
+        String joinedAuthors = String.join(AUTHOR_SEPARATOR, authorNames);
+        return new AuthorListParser().parse(joinedAuthors).getAsLastFirstNamesWithAnd(false);
+    }
+}
diff --git a/src/test/java/org/jabref/gui/slr/ManageStudyDefinitionViewModelTest.java b/src/test/java/org/jabref/gui/slr/ManageStudyDefinitionViewModelTest.java
@@ -38,6 +38,7 @@ public void emptyStudyConstructorFillsDatabasesCorrectly() {
                 new StudyCatalogItem("ArXiv", false),
                 new StudyCatalogItem("Bibliotheksverbund Bayern (Experimental)", false),
                 new StudyCatalogItem("Biodiversity Heritage", false),
+                new StudyCatalogItem("CiteSeerX", false),
                 new StudyCatalogItem("Collection of Computer Science Bibliographies", false),
                 new StudyCatalogItem("Crossref", false),
                 new StudyCatalogItem("DBLP", true),
@@ -78,6 +79,7 @@ public void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
                 new StudyCatalogItem("ArXiv", false),
                 new StudyCatalogItem("Bibliotheksverbund Bayern (Experimental)", false),
                 new StudyCatalogItem("Biodiversity Heritage", false),
+                new StudyCatalogItem("CiteSeerX", false),
                 new StudyCatalogItem("Collection of Computer Science Bibliographies", false),
                 new StudyCatalogItem("Crossref", false),
                 new StudyCatalogItem("DBLP", false),