Skip to content

Commit 9b35555

Browse files
Avantol13Avantol13-machine-userpaulineribeyre
authored
feat(dois): dbGaP Parsing for DOI metadata automation (#183)
* feat(doi): rough support for DOI creation against DataCite API * Apply automatic documentation changes * feat(doi): add tests, refactor some support for various fields that are lists * feat(doi): get DOI core classes working, add tests, various fixes and updates for discovery metadata * Apply automatic documentation changes * chore(poetry): bump version * Apply automatic documentation changes * fix(logging): correct log to reflect actual HTTP method * Apply automatic documentation changes * fix(doi): fix PUT endpoint to use DOI identifier in URL * Apply automatic documentation changes * chore(cli): various cleanup/clarification on help messages * feat(dois): refactor external API locations, add new dbGaP support, add dbgap->DOI automation * fix(merge): fix an issue with metadata merging * feat(dois): correct typos, add docs, clearer logs * feat(doi): formatting and docs updates * feat(dois): ability to specify to use the GUID in the landing page URL * Apply automatic documentation changes * chore(version): bump * chore(discovery): refactor to reduce code duplication * Apply automatic documentation changes * Update gen3/external/nih/dbgap_doi.py Co-authored-by: Pauline Ribeyre <[email protected]> * fix(discovery_doi): updates after review, make support more generic and modular * Apply automatic documentation changes * feat(tests): add tests, refactor for clarity, fix issue with update * Apply automatic documentation changes * feat(dois): add support for excluding specific datasets from discovery DOI minting * Apply automatic documentation changes * fix(dois): add backoff, handle alternateIDs from DataCite correctly, update docs * Apply automatic documentation changes * fix(doi): fix test that was expecting old behavior * Apply automatic documentation changes * Update gen3/discovery_dois.py Co-authored-by: Pauline Ribeyre <[email protected]> * Apply automatic documentation changes --------- Co-authored-by: Alexander VT <[email protected]> Co-authored-by: Pauline 
Ribeyre <[email protected]>
1 parent b8585a8 commit 9b35555

27 files changed

+1661
-271
lines changed
-35 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.

docs/_build/html/_modules/gen3/metadata.html

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -725,9 +725,7 @@ <h1>Source code for gen3.metadata</h1><div class="highlight"><pre>
725725

726726
<span class="n">logging</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;hitting: </span><span class="si">{</span><span class="n">url_with_params</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
727727
<span class="n">logging</span><span class="o">.</span><span class="n">debug</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;data: </span><span class="si">{</span><span class="n">data</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
728-
<span class="n">response</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">put</span><span class="p">(</span>
729-
<span class="n">url_with_params</span><span class="p">,</span> <span class="n">json</span><span class="o">=</span><span class="n">data</span><span class="p">,</span> <span class="n">merge</span><span class="o">=</span><span class="n">merge</span><span class="p">,</span> <span class="n">auth</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_auth_provider</span>
730-
<span class="p">)</span>
728+
<span class="n">response</span> <span class="o">=</span> <span class="n">requests</span><span class="o">.</span><span class="n">put</span><span class="p">(</span><span class="n">url_with_params</span><span class="p">,</span> <span class="n">json</span><span class="o">=</span><span class="n">data</span><span class="p">,</span> <span class="n">auth</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_auth_provider</span><span class="p">)</span>
731729
<span class="n">response</span><span class="o">.</span><span class="n">raise_for_status</span><span class="p">()</span>
732730

733731
<span class="k">return</span> <span class="n">response</span><span class="o">.</span><span class="n">json</span><span class="p">()</span></div>

docs/_build/html/searchindex.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/_build/html/tools/indexing.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ <h1>Indexing Tools<a class="headerlink" href="#indexing-tools" title="Permalink
380380

381381
<dl class="py function">
382382
<dt class="sig sig-object py" id="gen3.tools.indexing.verify_manifest.async_verify_object_manifest">
383-
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.indexing.verify_manifest.</span></span><span class="sig-name descname"><span class="pre">async_verify_object_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'acl':</span> <span class="pre">&lt;function</span> <span class="pre">_get_acl_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'authz':</span> <span class="pre">&lt;function</span> <span class="pre">_get_authz_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_name':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_name_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_size':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_size_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'guid':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'md5':</span> <span class="pre">&lt;function</span> <span class="pre">_get_md5_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'urls':</span> <span class="pre">&lt;function</span> <span class="pre">_get_urls_from_row&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">output_filename='verify-manifest-errors-1686340166.129575.log'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/indexing/verify_manifest.html#async_verify_object_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.indexing.verify_manifest.async_verify_object_manifest" title="Permalink to this definition"></a></dt>
383+
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.indexing.verify_manifest.</span></span><span class="sig-name descname"><span class="pre">async_verify_object_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'acl':</span> <span class="pre">&lt;function</span> <span class="pre">_get_acl_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'authz':</span> <span class="pre">&lt;function</span> <span class="pre">_get_authz_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_name':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_name_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_size':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_size_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'guid':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'md5':</span> <span class="pre">&lt;function</span> <span class="pre">_get_md5_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'urls':</span> <span class="pre">&lt;function</span> <span class="pre">_get_urls_from_row&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">output_filename='verify-manifest-errors-1688591583.648493.log'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/indexing/verify_manifest.html#async_verify_object_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.indexing.verify_manifest.async_verify_object_manifest" title="Permalink to this definition"></a></dt>
384384
<dd><p>Verify all file object records into a manifest csv</p>
385385
<dl class="field-list simple">
386386
<dt class="field-odd">Parameters<span class="colon">:</span></dt>

docs/_build/html/tools/metadata.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ <h1>Metadata Tools<a class="headerlink" href="#metadata-tools" title="Permalink
101101

102102
<dl class="py function">
103103
<dt class="sig sig-object py" id="gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest">
104-
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.metadata.ingest_manifest.</span></span><span class="sig-name descname"><span class="pre">async_ingest_metadata_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_source</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auth=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'guid_for_row':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_for_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'indexed_file_object_guid':</span> <span class="pre">&lt;function</span> <span class="pre">_query_for_associated_indexd_record_guid&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_filename='ingest-metadata-manifest-errors-1686340166.4061348.log'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_guid_from_file=True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_type=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/metadata/ingest_manifest.html#async_ingest_metadata_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest" title="Permalink to 
this definition"></a></dt>
104+
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.metadata.ingest_manifest.</span></span><span class="sig-name descname"><span class="pre">async_ingest_metadata_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_source</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auth=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'guid_for_row':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_for_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'indexed_file_object_guid':</span> <span class="pre">&lt;function</span> <span class="pre">_query_for_associated_indexd_record_guid&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_filename='ingest-metadata-manifest-errors-1688591583.9268854.log'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_guid_from_file=True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_type=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/metadata/ingest_manifest.html#async_ingest_metadata_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest" title="Permalink to 
this definition"></a></dt>
105105
<dd><p>Ingest all metadata records into a manifest csv</p>
106106
<dl class="field-list simple">
107107
<dt class="field-odd">Parameters<span class="colon">:</span></dt>

docs/howto/discoveryMetadataTools.md

Lines changed: 88 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def test_manual_single_doi(publish_dois=False):
128128
# Setup
129129
gen3_auth = Gen3Auth()
130130
datacite = DataCite(
131-
api=DataCite.TEST_URL,
131+
use_prod=False,
132132
auth_provider=HTTPBasicAuth(
133133
os.environ.get("DATACITE_USERNAME"),
134134
os.environ.get("DATACITE_PASSWORD"),
@@ -162,7 +162,11 @@ def test_manual_single_doi(publish_dois=False):
162162
}
163163

164164
# Create/Mint the DOI in DataCite
165-
doi = DigitalObjectIdentifier(root_url=COMMONS_DISCOVERY_PAGE, **doi_metadata)
165+
# The default url generated is "root_url" + identifier
166+
# If your Discovery metadata records don't use the DOI as the GUID,
167+
# you may need to supply the URL yourself like below
168+
url = COMMONS_DISCOVERY_PAGE.rstrip("/") + f"/{gen3_metadata_guid}"
169+
doi = DigitalObjectIdentifier(url=url, **doi_metadata)
166170

167171
if publish_dois:
168172
logging.info(f"Publishing DOI `{identifier}`...")
@@ -212,6 +216,28 @@ DOI url.
212216
```json
213217
"discoveryConfig": {
214218
// ...
219+
"features": {
220+
// ...
221+
"search": {
222+
"searchBar": {
223+
"enabled": true,
224+
"searchableTextFields": [
225+
"doi_titles",
226+
"doi_version_information",
227+
"doi_citation",
228+
"doi_creators",
229+
"doi_publisher",
230+
"doi_identifier",
231+
"doi_alternateIdentifiers",
232+
"doi_contributors",
233+
"doi_descriptions",
234+
"doi_publication_year",
235+
"doi_resolveable_link",
236+
"doi_fundingReferences",
237+
"doi_relatedIdentifiers"
238+
]
239+
},
240+
// ...
215241
"detailView": {
216242
// ...
217243
"tabs": [
@@ -224,7 +250,7 @@ DOI url.
224250
{
225251
"type": "block",
226252
"label": "",
227-
"sourceField": "disclaimer",
253+
"sourceField": "doi_disclaimer",
228254
"default": ""
229255
},
230256
{
@@ -245,6 +271,12 @@ DOI url.
245271
"sourceField": "doi_is_available",
246272
"default": "None"
247273
},
274+
{
275+
"type": "text",
276+
"label": "Creators:",
277+
"sourceField": "doi_creators",
278+
"default": "Not specified"
279+
},
248280
{
249281
"type": "text",
250282
"label": "Citation:",
@@ -308,6 +340,18 @@ DOI url.
308340
"label": "Version:",
309341
"sourceField": "doi_version_information",
310342
"default": "Not specified"
343+
},
344+
{
345+
"type": "text",
346+
"label": "Contributors:",
347+
"sourceField": "doi_contributors",
348+
"default": "Not specified"
349+
},
350+
{
351+
"type": "text",
352+
"label": "Related Identifiers:",
353+
"sourceField": "doi_relatedIdentifiers",
354+
"default": "Not specified"
311355
}
312356
]
313357
},
@@ -327,12 +371,17 @@ DOI url.
327371
// ...
328372
```
329373

330-
#### Work in Progress. Script to automate dbGaP scraping for updating datasets and minting DOIs
374+
#### Automate DOI creation for Datasets
375+
376+
Automates the pulling of current datasets from Discovery, getting identifiers,
377+
scraping various APIs for DOI-related metadata, and then going through
378+
the DOI creation loop to mint the DOI in DataCite and persist the metadata back in
379+
Gen3.
331380

332-
- TODO: Push DOI from submitted to registered
381+
See below for a full example using the dbGaP `DbgapMetadataInterface`.
333382

334-
See below for a full example of DOI metadata gathering, minting, and persisting
335-
into Gen3.
383+
More interfaces may exist in the future for doing this by querying non-dbGaP
384+
sources.
336385

337386
```python
338387
import os
@@ -341,7 +390,7 @@ from requests.auth import HTTPBasicAuth
341390
from cdislogging import get_logger
342391

343392
from gen3.auth import Gen3Auth
344-
from gen3.discovery_dois import mint_dois_for_dbgap_discovery_datasets
393+
from gen3.discovery_dois import mint_dois_for_discovery_datasets, DbgapMetadataInterface
345394
from gen3.utils import get_random_alphanumeric
346395

347396
logging = get_logger("__name__", log_level="info")
@@ -355,33 +404,54 @@ DOI_ACCESS_INFORMATION = "You can find information about how to access this reso
355404
DOI_ACCESS_INFORMATION_LINK = "https://example.com/more/info"
356405
DOI_CONTACT = "https://example.com/contact/"
357406

407+
def mint_discovery_dois():
408+
auth = Gen3Auth()
358409

359-
def get_doi_identifier():
360-
return (
361-
PREFIX + "/EXAMPLE-" + get_random_alphanumeric(4) + "-" + get_random_alphanumeric(4)
362-
)
410+
# this alternate ID is some globally unique ID other than the GUID that
411+
# will be needed to get DOI metadata (like the phsid for dbGaP)
412+
metadata_field_for_alternate_id = "dbgap_accession"
363413

414+
# you can choose to exclude certain Discovery Metadata datasets based on
415+
# their GUID or alternate ID (this means they won't get additional DOI metadata
416+
# or have DOIs minted, they'll be skipped)
417+
exclude_datasets=["MetadataGUID_to_exclude", "AlternateID_to_exclude", "..."]
364418

365-
def main():
366-
auth = Gen3Auth()
367-
dbgap_phsid_field = "dbgap_accession"
419+
# When this is True, you CANNOT REVERT THIS ACTION. A published DOI
420+
# cannot be deleted. It is recommended to test with "Draft" state DOIs first
421+
# (which is the default when publish_dois is not True).
422+
publish_dois = False
368423

369-
mint_dois_for_dbgap_discovery_datasets(
424+
mint_dois_for_discovery_datasets(
370425
gen3_auth=auth,
371426
datacite_auth=HTTPBasicAuth(
372427
os.environ.get("DATACITE_USERNAME"),
373428
os.environ.get("DATACITE_PASSWORD"),
374429
),
375-
dbgap_phsid_field=dbgap_phsid_field,
430+
metadata_field_for_alternate_id=metadata_field_for_alternate_id,
376431
get_doi_identifier_function=get_doi_identifier,
377-
publisher=PUBLISHER,
432+
metadata_interface=DbgapMetadataInterface,
433+
doi_publisher=PUBLISHER,
378434
commons_discovery_page=COMMONS_DISCOVERY_PAGE,
379435
doi_disclaimer=DOI_DISCLAIMER,
380436
doi_access_information=DOI_ACCESS_INFORMATION,
381437
doi_access_information_link=DOI_ACCESS_INFORMATION_LINK,
382438
doi_contact=DOI_CONTACT,
439+
publish_dois=publish_dois,
440+
datacite_use_prod=False,
441+
exclude_datasets=["MetadataGUID_to_exclude", "AlternateID_to_exclude", "..."]
383442
)
384443

444+
445+
def get_doi_identifier():
446+
return (
447+
PREFIX + "/EXAMPLE-" + get_random_alphanumeric(4) + "-" + get_random_alphanumeric(4)
448+
)
449+
450+
451+
def main():
452+
mint_discovery_dois()
453+
454+
385455
if __name__ == "__main__":
386456
main()
387457

gen3/cli/__main__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -34,18 +34,18 @@ def get(self):
3434
"--auth",
3535
"auth_config",
3636
default=os.getenv("GEN3_API_KEY", None),
37-
help="""authentication source:
38-
"idp://wts/<idp>" is an identity provider in a Gen3 workspace,
39-
"accesstoken:///<token>" is an access token,
40-
otherwise a path to an api key or basename of key under ~/.gen3/;
41-
default value is "credentials" if ~/.gen3/credentials.json exists, otherwise "idp://wts/local"
37+
help="""authentication source, by default expects an API key in "~/.gen3/credentials.json".
38+
Has special support for token service: "idp://wts/<idp>", and raw access tokens
39+
"accesstoken:///<token>",
40+
otherwise a path to an API key or basename of key under ~/.gen3/ can be used.
41+
Default value is "credentials" if ~/.gen3/credentials.json exists, otherwise "idp://wts/local"
4242
""",
4343
)
4444
@click.option(
4545
"--endpoint",
4646
"endpoint",
4747
default=os.getenv("GEN3_ENDPOINT", "default"),
48-
help="commons hostname - optional if API Key given",
48+
help="commons hostname - optional if API Key given in `auth`",
4949
)
5050
@click.option(
5151
"-v",

gen3/cli/auth.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def wts_list():
7272

7373
@click.group()
7474
def auth():
75-
"""Gen3 sdk auth commands"""
75+
"""Commands for authentication and authorization"""
7676
pass
7777

7878

0 commit comments

Comments
 (0)