Skip to content

Commit

Permalink
Reworking fetcher, parser for CiteSeer (JabRef#9882)
Browse files Browse the repository at this point in the history
* First Commit

- Created separate parser class for CiteSeer
- Able to query CiteSeer to get search results, albeit without using Jabref logic classes

* Implemented Unirest for fetch requests

- Utilized Unirest Java library for requesting CiteSeer information.
- Starting changes for unpacking inputstream to CiteSeer parser.
- Cleaned up imports.

* Reworking CiteSeerFetcher

- Reworking methods to adhere to SearchBasedFetcher
- Minor changes to parser

* First Attempt at QueryTransformer for CiteSeer

- Implemented first version of query transformer to create JSON payload (not tested yet)
- Minor changes to accommodate for query transformer in CiteSeer file

* Expanded on CiteSeerQueryTransformer

- Creating JSON payload through QueryTransformer
- Implemented necessary parameters for JSON payload for POST requests to CiteSeer
- Minor changes to CiteSeer and CiteSeerParser
- Starting out CiteSeerQueryTransformerTest class

* Converted CiteSeerParser Gson to Kong Unirest

- Refactored class to use Kong JSON objects rather than Gson for consistency

* Adding some more tests for CiteSeer files

- Added tests for CiteSeerQueryTransformer.
- Reworking some tests for CiteSeer fetcher as the queryString formatting has changed
- Minor changes to CiteSeerParser and CiteSeerQueryTransformer

* More tests for CiteSeerQueryTransformer

- Added some more tests to CiteSeerQueryTransformerTest
- Changes to CiteSeerQueryTransformer as it currently does not support boolean operators, and modifying the type for payload parameters
- Unsure of how to implement pageSize with previously mentioned Jabref global parameter

* Updating parser and tests for CiteSeer

- Reworked parsing author data for JSON object
- Updating tests for CiteSeer Fetcher
- Removed obsolete comments and code

* Additional testing for CiteSeer functionality

- CiteSeer sortBy:Year option is buggy on their website, inconsistent with entries without pdfs
- Added tests for searching with CiteSeer fetcher
- Added test for QueryTransformer where we can try different variations of year parameters

* Reworking parseAuthors method with AuthorListParser

- Modified tests to match expected behavior.

* Minor changes and cleanup

* Adding Cite Seer response ID as DOI field

- Updated Cite Seer tests to reflect these changes

* Implement FulltextFetcher for CiteSeer

- Implemented findFullText method for Cite Seer fetcher.
- Added some test cases for FulltextFetcher implementation.

* Fixing Tests for Actions

- Fixed code style check error.
- Addressed issue with using ApacheCommonsLang3 helper functions by reworking parseAuthors method.
- Made parseBibEntry method more modular.

* Updating WebFetcher Tests

- Updated WebFetcher tests to include CiteSeer as FullTextFetcher and SearchBasedFetcher.

* Update ManageStudyDefinitionViewModelTest.java

---------

Co-authored-by: Carl Christian Snethlage <[email protected]>
Co-authored-by: Christoph <[email protected]>
  • Loading branch information
3 people authored Jul 12, 2023
1 parent 2abcb95 commit fd82bef
Show file tree
Hide file tree
Showing 8 changed files with 478 additions and 76 deletions.
4 changes: 3 additions & 1 deletion src/main/java/org/jabref/logic/importer/WebFetchers.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import org.jabref.logic.importer.fetcher.AstrophysicsDataSystem;
import org.jabref.logic.importer.fetcher.BiodiversityLibrary;
import org.jabref.logic.importer.fetcher.BvbFetcher;
import org.jabref.logic.importer.fetcher.CiteSeer;
import org.jabref.logic.importer.fetcher.CollectionOfComputerScienceBibliographiesFetcher;
import org.jabref.logic.importer.fetcher.CompositeSearchBasedFetcher;
import org.jabref.logic.importer.fetcher.CrossRef;
Expand Down Expand Up @@ -110,7 +111,7 @@ public static SortedSet<SearchBasedFetcher> getSearchBasedFetchers(ImportFormatP
set.add(new DBLPFetcher(importFormatPreferences));
set.add(new SpringerFetcher(importerPreferences));
set.add(new CrossRef());
// set.add(new CiteSeer());
set.add(new CiteSeer());
set.add(new DOAJFetcher(importFormatPreferences));
set.add(new IEEE(importFormatPreferences, importerPreferences));
set.add(new CompositeSearchBasedFetcher(set, 30));
Expand Down Expand Up @@ -203,6 +204,7 @@ public static Set<FulltextFetcher> getFullTextFetchers(ImportFormatPreferences i
// Meta search
// fetchers.add(new JstorFetcher(importFormatPreferences));
// fetchers.add(new GoogleScholar(importFormatPreferences));
fetchers.add(new CiteSeer());
fetchers.add(new OpenAccessDoi());
fetchers.add(new SemanticScholar());
fetchers.add(new ResearchGate(importFormatPreferences));
Expand Down
131 changes: 69 additions & 62 deletions src/main/java/org/jabref/logic/importer/fetcher/CiteSeer.java
Original file line number Diff line number Diff line change
@@ -1,36 +1,36 @@
package org.jabref.logic.importer.fetcher;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.jabref.logic.cleanup.FieldFormatterCleanup;
import org.jabref.logic.cleanup.Formatter;
import org.jabref.logic.formatter.bibtexfields.HtmlToUnicodeFormatter;
import org.jabref.logic.formatter.casechanger.TitleCaseFormatter;

import org.jabref.logic.help.HelpFile;
import org.jabref.logic.importer.FetcherException;
import org.jabref.logic.importer.Parser;
import org.jabref.logic.importer.SearchBasedParserFetcher;
import org.jabref.logic.importer.fetcher.transformers.DefaultQueryTransformer;
import org.jabref.logic.importer.fileformat.CoinsParser;
import org.jabref.logic.util.OS;
import org.jabref.logic.importer.FulltextFetcher;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.SearchBasedFetcher;
import org.jabref.logic.importer.fetcher.transformers.CiteSeerQueryTransformer;
import org.jabref.logic.importer.fileformat.CiteSeerParser;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.InternalField;
import org.jabref.model.entry.field.StandardField;

import org.apache.http.client.utils.URIBuilder;
import kong.unirest.JsonNode;
import kong.unirest.Unirest;
import kong.unirest.json.JSONArray;
import kong.unirest.json.JSONElement;
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode;

public class CiteSeer implements SearchBasedParserFetcher {
public class CiteSeer implements SearchBasedFetcher, FulltextFetcher {

private static final String BASE_URL = "citeseerx.ist.psu.edu";

private static final String API_URL = "https://citeseerx.ist.psu.edu/api/search";

private static final String PDF_URL = "https://" + BASE_URL + "/document?repid=rep1&type=pdf&doi=%s";

private CiteSeerQueryTransformer transformer;

public CiteSeer() {
}
Expand All @@ -46,52 +46,59 @@ public Optional<HelpFile> getHelpPage() {
}

@Override
public URL getURLForQuery(QueryNode luceneQuery) throws URISyntaxException, MalformedURLException, FetcherException {
URIBuilder uriBuilder = new URIBuilder("https://citeseer.ist.psu.edu/search");
uriBuilder.addParameter("sort", "rlv"); // Sort by relevance
uriBuilder.addParameter("q", new DefaultQueryTransformer().transformLuceneQuery(luceneQuery).orElse("")); // Query
uriBuilder.addParameter("t", "doc"); // Type: documents
// uriBuilder.addParameter("start", "0"); // Start index (not supported at the moment)
return uriBuilder.build().toURL();
}
public List<BibEntry> performSearch(QueryNode luceneQuery) throws FetcherException {
// ADR-0014
try {
JSONElement payload = getPayloadJSON(luceneQuery);
JsonNode requestResponse = Unirest.post(API_URL)
.header("authority", BASE_URL)
.header("accept", "application/json, text/plain, */*")
.header("content-type", "application/json;charset=UTF-8")
.header("origin", "https://" + BASE_URL)
.body(payload)
.asJson().getBody();

@Override
public Parser getParser() {
// MathSciNet returns COinS result embedded in HTML
// So we extract the data string from the <span class="Z3988" title="<data>"></span> tags and pass the content to the COinS parser
return inputStream -> {
String response = new BufferedReader(new InputStreamReader(inputStream)).lines().collect(Collectors.joining(OS.NEWLINE));
List<BibEntry> entries = new ArrayList<>();
CoinsParser parser = new CoinsParser();
Pattern pattern = Pattern.compile("<span class=\"Z3988\" title=\"(.*)\"></span>");
Matcher matcher = pattern.matcher(response);
while (matcher.find()) {
String encodedDataString = matcher.group(1);
entries.addAll(parser.parseEntries(encodedDataString));
Optional<JSONArray> jsonResponse = Optional.of(requestResponse)
.map(JsonNode::getObject)
.filter(Objects::nonNull)
.map(response -> response.optJSONArray("response"))
.filter(Objects::nonNull);

if (!jsonResponse.isPresent()) {
return List.of();
}
return entries;
};

CiteSeerParser parser = new CiteSeerParser();
List<BibEntry> fetchedEntries = parser.parseCiteSeerResponse(jsonResponse.orElse(new JSONArray()));
return fetchedEntries;
} catch (ParseException ex) {
throw new FetcherException("An internal parser error occurred while parsing CiteSeer entries, ", ex);
}
}

/**
 * Builds the JSON body that is POSTed to the CiteSeer search API for the given query.
 * <p>
 * A fresh {@link CiteSeerQueryTransformer} is created for every call and stored in the
 * {@code transformer} field, so the returned payload only reflects this query.
 *
 * @param luceneQuery the parsed search query
 * @return the JSON payload held by the transformer after processing the query
 */
private JSONElement getPayloadJSON(QueryNode luceneQuery) {
    transformer = new CiteSeerQueryTransformer();
    // The transformed query string itself is not used for the request; the transformation is run
    // because the transformer's field handlers record the search parameters in its JSON payload.
    transformer.transformLuceneQuery(luceneQuery);
    return transformer.getJSONPayload();
}

@Override
public void doPostCleanup(BibEntry entry) {
// CiteSeer escapes some characters in a way that is not recognized by the normal html to unicode formatter
// We, of course, also want to convert these special characters
Formatter extendedHtmlFormatter = new HtmlToUnicodeFormatter() {
@Override
public String format(String fieldText) {
String formatted = super.format(fieldText);
formatted = formatted.replaceAll("%3A", ":");
formatted = formatted.replaceAll("%3Cem%3", "");
formatted = formatted.replaceAll("%3C%2Fem%3E", "");
formatted = formatted.replaceAll("%2C\\+", " ");
formatted = formatted.replaceAll("\\+", " ");
return formatted;
}
};
new FieldFormatterCleanup(InternalField.INTERNAL_ALL_FIELD, extendedHtmlFormatter).cleanup(entry);
public Optional<URL> findFullText(BibEntry entry) throws IOException, FetcherException {
Objects.requireNonNull(entry);

// does not use a valid DOI, but Cite Seer's id / hash available for each entry
Optional<String> id = entry.getField(StandardField.DOI);
if (id.isPresent()) {
String source = String.format(PDF_URL, id.get());
return Optional.of(new URL(source));
}

// if using id fails, we can try the source URL
Optional<String> urlString = entry.getField(StandardField.URL);
if (urlString.isPresent()) {
return Optional.of(new URL(urlString.get()));
}

// Many titles in the CiteSeer database have all-capital titles, for convenience we convert them to title case
new FieldFormatterCleanup(StandardField.TITLE, new TitleCaseFormatter()).cleanup(entry);
return Optional.empty();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
package org.jabref.logic.importer.fetcher.transformers;

import java.util.Calendar;
import java.util.Optional;

import org.jabref.model.strings.StringUtil;

import kong.unirest.json.JSONArray;
import kong.unirest.json.JSONObject;

/**
 * Query transformer for CiteSeer searches.
 * <p>
 * In addition to returning the transformed query fragments, every field handler records its value
 * in a JSON object that can be retrieved with {@link #getJSONPayload()}.
 */
public class CiteSeerQueryTransformer extends AbstractQueryTransformer {

    /** Request parameters collected while the query is transformed. */
    private final JSONObject payload = new JSONObject();

    /**
     * Creates a transformer whose payload is pre-filled with default values for the
     * necessary parameters: first page, 20 results per page, PDFs not required, sorted by relevance.
     */
    public CiteSeerQueryTransformer() {
        handlePage("1");
        handlePageSize("20");
        // stored as the string "false" (not a JSON boolean), same as handleMustHavePdf does for user-supplied values
        payload.put("must_have_pdf", "false");
        handleSortBy("relevance");
    }

    /**
     * @return a single space; AND is not mapped to a dedicated operator
     */
    @Override
    protected String getLogicalAndOperator() {
        return " ";
    }

    /**
     * @return a single space; OR is not mapped to a dedicated operator
     */
    @Override
    protected String getLogicalOrOperator() {
        return " ";
    }

    /**
     * @return the empty string; NOT is not mapped to a dedicated operator
     */
    @Override
    protected String getLogicalNotOperator() {
        return "";
    }

    /**
     * Appends the author to the payload's "author" array, creating the array on first use.
     *
     * @param author the author term of the query
     * @return the author, quoted if it contains a space
     */
    @Override
    protected String handleAuthor(String author) {
        if (!payload.has("author")) {
            payload.put("author", new JSONArray());
        }
        payload.getJSONArray("author").put(author);
        return StringUtil.quoteStringIfSpaceIsContained(author);
    }

    /**
     * Stores the title as the payload's "queryString"; a later title replaces an earlier one.
     *
     * @param title the title term of the query
     * @return the title, quoted if it contains a space
     */
    @Override
    protected String handleTitle(String title) {
        payload.put("queryString", title);
        return StringUtil.quoteStringIfSpaceIsContained(title);
    }

    /**
     * Stores the journal title under the payload's "journal" key.
     *
     * @param journalTitle the journal term of the query
     * @return the journal title, quoted if it contains a space
     */
    @Override
    protected String handleJournal(String journalTitle) {
        payload.put("journal", journalTitle);
        return StringUtil.quoteStringIfSpaceIsContained(journalTitle);
    }

    /**
     * Restricts the search to a single year by setting "yearStart" and "yearEnd" to the same value.
     *
     * @param year the year term of the query
     * @return the year, quoted if it contains a space
     * @throws NumberFormatException if {@code year} is not a parsable integer
     */
    @Override
    protected String handleYear(String year) {
        int parsedYear = Integer.parseInt(year);
        payload.put("yearStart", parsedYear);
        payload.put("yearEnd", parsedYear);
        return StringUtil.quoteStringIfSpaceIsContained(year);
    }

    /**
     * Restricts the search to a range of years.
     * <p>
     * The range is parsed by {@code parseYearRange}, which is expected to update the inherited
     * {@code startYear} and {@code endYear} values (NOTE(review): defined in the superclass - confirm).
     *
     * @param yearRange the year range term of the query
     * @return {@code yearRange} unchanged, or the empty string if the range is treated as invalid
     */
    @Override
    protected String handleYearRange(String yearRange) {
        parseYearRange(yearRange);
        if (endYear == Integer.MAX_VALUE) { // invalid year range
            // only the upper bound is set, to the current calendar year
            Calendar calendar = Calendar.getInstance();
            payload.put("yearEnd", calendar.get(Calendar.YEAR));
            return "";
        }
        payload.put("yearStart", startYear);
        payload.put("yearEnd", endYear);
        return yearRange;
    }

    /**
     * Handles the CiteSeer-specific request fields "page", "pageSize", "must_have_pdf" and "sortBy"
     * by storing their value in the payload. All other fields are delegated to the superclass.
     *
     * @param fieldAsString the name of the field as written in the query
     * @param term          the value given for the field
     * @return the term for the CiteSeer-specific fields, otherwise the superclass result
     */
    @Override
    protected Optional<String> handleOtherField(String fieldAsString, String term) {
        return switch (fieldAsString) {
            case "page" -> handlePage(term);
            case "pageSize" -> handlePageSize(term);
            case "must_have_pdf" -> handleMustHavePdf(term);
            case "sortBy" -> handleSortBy(term);
            default -> super.handleOtherField(fieldAsString, term);
        };
    }

    // TODO: there may be a JabRef constant for page / page size that should be used instead of query fields
    /** Stores the page as an integer under "page" and returns the given term. */
    private Optional<String> handlePage(String page) {
        payload.put("page", StringUtil.intValueOf(page));
        return Optional.of(page);
    }

    /** Stores the page size as an integer under "pageSize" and returns the given term. */
    private Optional<String> handlePageSize(String pageSize) {
        payload.put("pageSize", StringUtil.intValueOf(pageSize));
        return Optional.of(pageSize);
    }

    /** Stores the given term unchanged (as a string) under "must_have_pdf" and returns it. */
    private Optional<String> handleMustHavePdf(String mustHavePdf) {
        payload.put("must_have_pdf", mustHavePdf);
        return Optional.of(mustHavePdf);
    }

    /** Stores the given term under "sortBy" and returns it. */
    private Optional<String> handleSortBy(String sortBy) {
        payload.put("sortBy", sortBy);
        return Optional.of(sortBy);
    }

    /**
     * @return the live payload object (not a copy); it contains the defaults set in the constructor
     *         plus everything recorded by the field handlers so far
     */
    public JSONObject getJSONPayload() {
        return this.payload;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package org.jabref.logic.importer.fileformat;

import java.net.CookieHandler;
import java.net.CookieManager;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.jabref.logic.importer.AuthorListParser;
import org.jabref.logic.importer.ParseException;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.strings.StringUtil;

import kong.unirest.json.JSONArray;
import kong.unirest.json.JSONObject;

/**
 * Converts the JSON search response of CiteSeer into {@link BibEntry} objects.
 */
public class CiteSeerParser {

    private static final String AUTHOR_SEPARATOR = " and ";

    /**
     * Parses every element of the response array into a {@link BibEntry}.
     *
     * @param jsonResponse the array of search results; each element is read as a JSON object
     * @return one entry per array element, in response order (empty list for an empty array)
     * @throws ParseException declared for callers; see {@link #parseBibEntry(JSONObject)}
     */
    public List<BibEntry> parseCiteSeerResponse(JSONArray jsonResponse) throws ParseException {
        List<BibEntry> response = new ArrayList<>();
        // NOTE(review): this replaces the JVM-wide default cookie handler on every call. It is kept
        // because other code may rely on it, but a parser is an unusual place for this side effect.
        CookieHandler.setDefault(new CookieManager());
        for (int i = 0; i < jsonResponse.length(); ++i) {
            response.add(parseBibEntry(jsonResponse.getJSONObject(i)));
        }
        return response;
    }

    /**
     * Maps a single search result onto a new {@link BibEntry}.
     * <p>
     * WARNING: The DOI for each parsed BibEntry is not a valid DOI.
     * Cite Seer associates an id with each response as a unique hash.
     * However, it is not a valid variation of a DOI value.
     *
     * @param jsonObj Search response as a JSON Object
     * @return BibEntry filled from the "id", "title", "venue", "year", "publisher", "abstract",
     *         "authors", "journal" and "source" keys of the response
     * @throws ParseException part of the signature; nothing in this method throws it directly
     */
    private BibEntry parseBibEntry(JSONObject jsonObj) throws ParseException {
        BibEntry bibEntry = new BibEntry();
        bibEntry.setField(StandardField.DOI, jsonObj.optString("id"));
        bibEntry.setField(StandardField.TITLE, jsonObj.optString("title"));
        bibEntry.setField(StandardField.VENUE, jsonObj.optString("venue"));
        bibEntry.setField(StandardField.YEAR, jsonObj.optString("year"));
        bibEntry.setField(StandardField.PUBLISHER, jsonObj.optString("publisher"));
        bibEntry.setField(StandardField.ABSTRACT, jsonObj.optString("abstract"));
        bibEntry.setField(StandardField.AUTHOR, parseAuthors(Optional.ofNullable(jsonObj.optJSONArray("authors"))));
        bibEntry.setField(StandardField.JOURNAL, jsonObj.optString("journal"));
        bibEntry.setField(StandardField.URL, jsonObj.optString("source"));
        return bibEntry;
    }

    /**
     * Joins the author names with " and " and normalizes them with {@link AuthorListParser}.
     *
     * @param authorsOpt the "authors" array of a search result, if there is one
     * @return the authors in "last, first" form joined by "and"; the empty string if the array is
     *         absent or has no elements
     */
    private String parseAuthors(Optional<JSONArray> authorsOpt) {
        // An empty array must be handled here as well: there is no "last author" to read from it
        if (!authorsOpt.isPresent() || authorsOpt.get().length() == 0) {
            return "";
        }
        JSONArray authorsArray = authorsOpt.get();
        StringBuilder authorsStringBuilder = new StringBuilder();
        for (int i = 0; i < authorsArray.length(); i++) {
            if (i > 0) {
                authorsStringBuilder.append(AUTHOR_SEPARATOR);
            }
            // every name is shaved, including the last one, so all authors are cleaned the same way
            authorsStringBuilder.append(StringUtil.shaveString(authorsArray.getString(i)));
        }
        return new AuthorListParser().parse(authorsStringBuilder.toString()).getAsLastFirstNamesWithAnd(false);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public void emptyStudyConstructorFillsDatabasesCorrectly() {
new StudyCatalogItem("ArXiv", false),
new StudyCatalogItem("Bibliotheksverbund Bayern (Experimental)", false),
new StudyCatalogItem("Biodiversity Heritage", false),
new StudyCatalogItem("CiteSeerX", false),
new StudyCatalogItem("Collection of Computer Science Bibliographies", false),
new StudyCatalogItem("Crossref", false),
new StudyCatalogItem("DBLP", true),
Expand Down Expand Up @@ -78,6 +79,7 @@ public void studyConstructorFillsDatabasesCorrectly(@TempDir Path tempDir) {
new StudyCatalogItem("ArXiv", false),
new StudyCatalogItem("Bibliotheksverbund Bayern (Experimental)", false),
new StudyCatalogItem("Biodiversity Heritage", false),
new StudyCatalogItem("CiteSeerX", false),
new StudyCatalogItem("Collection of Computer Science Bibliographies", false),
new StudyCatalogItem("Crossref", false),
new StudyCatalogItem("DBLP", false),
Expand Down
Loading

0 comments on commit fd82bef

Please sign in to comment.