forked from JabRef/jabref
-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Reworking fetcher, parser for CiteSeer (JabRef#9882)
* First Commit - Created separate parser class for CiteSeer - Able to query CiteSeer to get search results, albeit without using Jabref logic classes * Implemented Unirest for fetch requests - Utilized Unirest Java library for requesting CiteSeer information. - Starting changes for unpacking inputstream to CiteSeer parser. - Cleaned up imports. * Reworking CiteSeerFetcher - Reworking methods to adhere to SearchBasedFetcher - Minor changes to parser * First Attempt at QueryTransformer for CiteSeer - Implemented first version of query transformer to create JSON payload (not tested yet) - Minor changes to accommodate for query transformer in CiteSeer file * Expanded on CiteSeerQueryTransformer - Creating JSON payload for through QueryTransformer - Implemented necessary parameters for JSON payload for POST requests to CiteSeer - Minor changes to CiteSeer and CiteSeerParser - Starting out CiteSeerQueryTransformerTest class * Converted CiteSeerParser Gson to Kong Unirest - Refactored class to use Kong JSON objects rather than Gson for consistency * Adding some more tests for CiteSeer files - Added tests for CiteSeerQueryTransformer. 
- Reworking some tests for CiteSeer fetcher as the queryString formatting has changed - Minor changes to CiteSeerParser and CiteSeerQueryTransformer * More tests for CiteSeerQueryTransformer - Added some more tests to CiteSeerQueryTransformerTest - Changes to CiteSeerQueryTransformer as it currently does not support boolean operators, and modifying the type for payload parameters - Unsure of how to implement pageSize with previously mentioned Jabref global parameter * Updating parser and tests for CiteSeer - Reworked parsing author data for JSON object - Updating tests for CiteSeer Fetcher - Removed obsolete comments and code * Additional testing for CiteSeer functionality - CiteSeer sortBy:Year option is buggy on their website, inconsistent with entries without pdfs - Added tests for searching with CiteSeer fetcher - Added test for QueryTransformer where we can try different variations of year parameters * Reworking parseAuthors method with AuthorListParser - Modified tests to match expected behavior. * Minor changes and cleanup * Adding Cite Seer response ID as DOI field - Updated Cite Seer tests to reflect these changes * Implement FulltextFetcher for CiteSeer - Implemented findFullText method for Cite Seer fetcher. - Added some test cases for FulltextFetcher implementation. * Fixing Tests for Actions - Fixed code style check error. - Addressed issue with using ApacheCommonsLang3 helper functions by reworking parseAuthors method. - Made parseBibEntry method more modular. * Updating WebFetcher Tests - Updated WebFetcher tests to include CiteSeer as FullTextFetcher and SearchBasedFetcher. * Update ManageStudyDefinitionViewModelTest.java --------- Co-authored-by: Carl Christian Snethlage <[email protected]> Co-authored-by: Christoph <[email protected]>
- Loading branch information
1 parent
2abcb95
commit fd82bef
Showing
8 changed files
with
478 additions
and
76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
120 changes: 120 additions & 0 deletions
120
src/main/java/org/jabref/logic/importer/fetcher/transformers/CiteSeerQueryTransformer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
package org.jabref.logic.importer.fetcher.transformers; | ||
|
||
import java.util.Calendar; | ||
import java.util.Optional; | ||
|
||
import org.jabref.model.strings.StringUtil; | ||
|
||
import kong.unirest.json.JSONArray; | ||
import kong.unirest.json.JSONObject; | ||
|
||
public class CiteSeerQueryTransformer extends AbstractQueryTransformer { | ||
|
||
private JSONObject payload = new JSONObject(); | ||
|
||
/** | ||
* Default values for necessary parameters set in constructor | ||
*/ | ||
public CiteSeerQueryTransformer() { | ||
handlePage("1"); | ||
handlePageSize("20"); | ||
this.getJSONPayload().put("must_have_pdf", "false"); | ||
handleSortBy("relevance"); | ||
} | ||
|
||
@Override | ||
protected String getLogicalAndOperator() { | ||
return " "; | ||
} | ||
|
||
@Override | ||
protected String getLogicalOrOperator() { | ||
return " "; | ||
} | ||
|
||
@Override | ||
protected String getLogicalNotOperator() { | ||
return ""; | ||
} | ||
|
||
@Override | ||
protected String handleAuthor(String author) { | ||
if (!getJSONPayload().has("author")) { | ||
this.getJSONPayload().put("author", new JSONArray()); | ||
} | ||
this.getJSONPayload().getJSONArray("author").put(author).toString(); | ||
return StringUtil.quoteStringIfSpaceIsContained(author); | ||
} | ||
|
||
@Override | ||
protected String handleTitle(String title) { | ||
this.getJSONPayload().put("queryString", title); | ||
return StringUtil.quoteStringIfSpaceIsContained(title); | ||
} | ||
|
||
@Override | ||
protected String handleJournal(String journalTitle) { | ||
this.getJSONPayload().put("journal", journalTitle); | ||
return StringUtil.quoteStringIfSpaceIsContained(journalTitle); | ||
} | ||
|
||
@Override | ||
protected String handleYear(String year) { | ||
this.getJSONPayload().put("yearStart", Integer.parseInt(year)); | ||
this.getJSONPayload().put("yearEnd", Integer.parseInt(year)); | ||
return StringUtil.quoteStringIfSpaceIsContained(year); | ||
} | ||
|
||
@Override | ||
protected String handleYearRange(String yearRange) { | ||
parseYearRange(yearRange); | ||
if (endYear == Integer.MAX_VALUE) { // invalid year range | ||
Calendar calendar = Calendar.getInstance(); | ||
this.getJSONPayload().put("yearEnd", calendar.get(Calendar.YEAR)); | ||
return ""; | ||
} | ||
this.getJSONPayload().put("yearStart", startYear); | ||
this.getJSONPayload().put("yearEnd", endYear); | ||
return yearRange; | ||
} | ||
|
||
/** | ||
* covers the five fields that are required to make a POST request | ||
* except "must_have_pdf" as FullTextFetcher is not yet implemented for CiteSeer | ||
*/ | ||
@Override | ||
protected Optional<String> handleOtherField(String fieldAsString, String term) { | ||
return switch (fieldAsString) { | ||
case "page" -> handlePage(term); | ||
case "pageSize" -> handlePageSize(term); | ||
case "must_have_pdf" -> handleMustHavePdf(term); | ||
case "sortBy" -> handleSortBy(term); | ||
default -> super.handleOtherField(fieldAsString, term); | ||
}; | ||
} | ||
|
||
// as mentioned before, there may be a Jabref constant for page/page-size | ||
private Optional<String> handlePage(String page) { | ||
this.getJSONPayload().put("page", StringUtil.intValueOf(page)); | ||
return Optional.of(page); | ||
} | ||
|
||
private Optional<String> handlePageSize(String pageSize) { | ||
this.getJSONPayload().put("pageSize", StringUtil.intValueOf(pageSize)); | ||
return Optional.of(pageSize); | ||
} | ||
|
||
private Optional<String> handleMustHavePdf(String mustHavePdf) { | ||
this.getJSONPayload().put("must_have_pdf", mustHavePdf); | ||
return Optional.of(mustHavePdf); | ||
} | ||
|
||
private Optional<String> handleSortBy(String sortBy) { | ||
this.getJSONPayload().put("sortBy", sortBy); | ||
return Optional.of(sortBy); | ||
} | ||
|
||
public JSONObject getJSONPayload() { | ||
return this.payload; | ||
} | ||
} |
65 changes: 65 additions & 0 deletions
65
src/main/java/org/jabref/logic/importer/fileformat/CiteSeerParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package org.jabref.logic.importer.fileformat; | ||
|
||
import java.net.CookieHandler; | ||
import java.net.CookieManager; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
import java.util.Optional; | ||
|
||
import org.jabref.logic.importer.AuthorListParser; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.strings.StringUtil; | ||
|
||
import kong.unirest.json.JSONArray; | ||
import kong.unirest.json.JSONObject; | ||
|
||
public class CiteSeerParser { | ||
|
||
public List<BibEntry> parseCiteSeerResponse(JSONArray jsonResponse) throws ParseException { | ||
List<BibEntry> response = new ArrayList<>(); | ||
CookieHandler.setDefault(new CookieManager()); | ||
for (int i = 0; i < jsonResponse.length(); ++i) { | ||
response.add(parseBibEntry(jsonResponse.getJSONObject(i))); | ||
} | ||
return response; | ||
} | ||
|
||
/*** | ||
* WARNING: The DOI for each parsed BibEntry is not a valid DOI. | ||
* Cite Seer associates an id with each response as a unique hash. | ||
* However, it is not a valid variation of a DOI value. | ||
* | ||
* @param jsonObj Search response as a JSON Object | ||
* @return BibEntry | ||
* @throws ParseException | ||
*/ | ||
private BibEntry parseBibEntry(JSONObject jsonObj) throws ParseException { | ||
BibEntry bibEntry = new BibEntry(); | ||
bibEntry.setField(StandardField.DOI, jsonObj.optString("id")); | ||
bibEntry.setField(StandardField.TITLE, jsonObj.optString("title")); | ||
bibEntry.setField(StandardField.VENUE, jsonObj.optString("venue")); | ||
bibEntry.setField(StandardField.YEAR, jsonObj.optString("year")); | ||
bibEntry.setField(StandardField.PUBLISHER, jsonObj.optString("publisher")); | ||
bibEntry.setField(StandardField.ABSTRACT, jsonObj.optString("abstract")); | ||
bibEntry.setField(StandardField.AUTHOR, parseAuthors(Optional.ofNullable(jsonObj.optJSONArray("authors")))); | ||
bibEntry.setField(StandardField.JOURNAL, jsonObj.optString("journal")); | ||
bibEntry.setField(StandardField.URL, jsonObj.optString("source")); | ||
return bibEntry; | ||
} | ||
|
||
private String parseAuthors(Optional<JSONArray> authorsOpt) { | ||
if (!authorsOpt.isPresent()) { | ||
return ""; | ||
} | ||
String separator = " and "; | ||
JSONArray authorsArray = authorsOpt.get(); | ||
StringBuilder authorsStringBuilder = new StringBuilder(); | ||
for (int i = 0; i < authorsArray.length() - 1; i++) { | ||
authorsStringBuilder.append(StringUtil.shaveString(authorsArray.getString(i))).append(separator); | ||
} | ||
authorsStringBuilder.append(authorsArray.getString(authorsArray.length() - 1)); | ||
return new AuthorListParser().parse(authorsStringBuilder.toString()).getAsLastFirstNamesWithAnd(false); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.