Skip to content

Commit

Permalink
Modified read-in of pubdate in JATS parser (#133)
Browse files (browse the repository at this point in the history)
* correctly read in pubdate for JATS

* formatting fixes

* modified electronic vs print date condition

* added detagging to license field in jats

* updated list of accepted date types

* updating rebase

* adding missing tests in jats

---------

Co-authored-by: Mugdha Polimera <[email protected]>
Co-authored-by: Mugdha Polimera <[email protected]>
  • Loading branch information
3 people authored Sep 30, 2024
1 parent e097a83 commit 9bd6801
Show file tree
Hide file tree
Showing 30 changed files with 11,267 additions and 85 deletions.
1 change: 1 addition & 0 deletions adsingestp/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ class BaseBeautifulSoupParser(IngestBase):
"comments": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br", "p"],
"affiliations": ["email", "orcid"],
"keywords": HTML_TAGS_HTML,
"license": HTML_TAGS_MATH + HTML_TAGS_HTML + ["a", "pre", "br"],
}

HTML_TAGS_DANGER = ["php", "script", "css"]
Expand Down
13 changes: 10 additions & 3 deletions adsingestp/parsers/jats.py
Original file line number Diff line number Diff line change
Expand Up @@ -938,21 +938,27 @@ def _parse_pubdate(self):
for d in pub_dates:
pub_format = d.get("publication-format", "")
pub_type = d.get("pub-type", "")
date_type = d.get("date-type", "")
accepted_date_types = ["pub", "", "first_release"]
pubdate = self._get_date(d)
if (
pub_format == "print"
or pub_type == "ppub"
or pub_type == "cover"
or (pub_type == "" and pub_format == "")
):
) and (date_type == "pub" or date_type == ""):
self.base_metadata["pubdate_print"] = pubdate

if (
pub_format == "electronic"
or pub_type == "epub"
or (pub_type == "" and pub_format == "")
):
) and (date_type in accepted_date_types):
self.base_metadata["pubdate_electronic"] = pubdate

elif (date_type != "pub") and (date_type != ""):
self.base_metadata["pubdate_other"] = [{"type": date_type, "date": pubdate}]

if pub_type == "open-access":
self.base_metadata.setdefault("openAccess", {}).setdefault("open", True)

Expand All @@ -969,7 +975,8 @@ def _parse_permissions(self):
license_text = p.find("license-p")
if license_text:
self.base_metadata.setdefault("openAccess", {}).setdefault(
"license", license_text.get_text()
"license",
self._detag(license_text.get_text(), self.HTML_TAGSET["license"]).strip(),
)
license_uri = license_text.find("ext-link")
if license_uri:
Expand Down
415 changes: 414 additions & 1 deletion tests/stubdata/input/jats_nature_41467_2023_Article_40261_nlm.xml

Large diffs are not rendered by default.

Loading

0 comments on commit 9bd6801

Please sign in to comment.