Skip to content

Commit f3506f2

Browse files
authored
Merge pull request #184 from uc-cdis/feat/disco-update
HP-1017 Feat/disco update
2 parents bfe5111 + b3d6607 commit f3506f2

File tree

13 files changed

+610
-398
lines changed

13 files changed

+610
-398
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,3 +115,4 @@ indexing-output-manifest.csv
115115
object-manifest.csv
116116
output_manifest.csv
117117

118+
.dccache
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.

docs/_build/html/searchindex.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/_build/html/tools/indexing.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ <h1>Indexing Tools<a class="headerlink" href="#indexing-tools" title="Permalink
380380

381381
<dl class="py function">
382382
<dt class="sig sig-object py" id="gen3.tools.indexing.verify_manifest.async_verify_object_manifest">
383-
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.indexing.verify_manifest.</span></span><span class="sig-name descname"><span class="pre">async_verify_object_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'acl':</span> <span class="pre">&lt;function</span> <span class="pre">_get_acl_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'authz':</span> <span class="pre">&lt;function</span> <span class="pre">_get_authz_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_name':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_name_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_size':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_size_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'guid':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'md5':</span> <span class="pre">&lt;function</span> <span class="pre">_get_md5_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'urls':</span> <span class="pre">&lt;function</span> <span class="pre">_get_urls_from_row&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">output_filename='verify-manifest-errors-1684967531.9473238.log'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/indexing/verify_manifest.html#async_verify_object_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.indexing.verify_manifest.async_verify_object_manifest" title="Permalink to this definition"></a></dt>
383+
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.indexing.verify_manifest.</span></span><span class="sig-name descname"><span class="pre">async_verify_object_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'acl':</span> <span class="pre">&lt;function</span> <span class="pre">_get_acl_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'authz':</span> <span class="pre">&lt;function</span> <span class="pre">_get_authz_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_name':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_name_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'file_size':</span> <span class="pre">&lt;function</span> <span class="pre">_get_file_size_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'guid':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'md5':</span> <span class="pre">&lt;function</span> <span class="pre">_get_md5_from_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'urls':</span> <span class="pre">&lt;function</span> <span class="pre">_get_urls_from_row&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span 
class="pre">output_filename='verify-manifest-errors-1686160040.2199411.log'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/indexing/verify_manifest.html#async_verify_object_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.indexing.verify_manifest.async_verify_object_manifest" title="Permalink to this definition"></a></dt>
384384
<dd><p>Verify all file object records into a manifest csv</p>
385385
<dl class="field-list simple">
386386
<dt class="field-odd">Parameters<span class="colon">:</span></dt>

docs/_build/html/tools/metadata.html

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ <h1>Metadata Tools<a class="headerlink" href="#metadata-tools" title="Permalink
101101

102102
<dl class="py function">
103103
<dt class="sig sig-object py" id="gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest">
104-
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.metadata.ingest_manifest.</span></span><span class="sig-name descname"><span class="pre">async_ingest_metadata_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_source</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auth=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'guid_for_row':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_for_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'indexed_file_object_guid':</span> <span class="pre">&lt;function</span> <span class="pre">_query_for_associated_indexd_record_guid&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_filename='ingest-metadata-manifest-errors-1684967532.2885046.log'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_guid_from_file=True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_type=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/metadata/ingest_manifest.html#async_ingest_metadata_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest" title="Permalink to 
this definition"></a></dt>
104+
<em class="property"><span class="k"><span class="pre">async</span></span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">gen3.tools.metadata.ingest_manifest.</span></span><span class="sig-name descname"><span class="pre">async_ingest_metadata_manifest</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">commons_url</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_source</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">auth=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_concurrent_requests=24</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_row_parsers={'guid_for_row':</span> <span class="pre">&lt;function</span> <span class="pre">_get_guid_for_row&gt;</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">'indexed_file_object_guid':</span> <span class="pre">&lt;function</span> <span class="pre">_query_for_associated_indexd_record_guid&gt;}</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">manifest_file_delimiter=None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">output_filename='ingest-metadata-manifest-errors-1686160040.6163409.log'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">get_guid_from_file=True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">metadata_type=None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/gen3/tools/metadata/ingest_manifest.html#async_ingest_metadata_manifest"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#gen3.tools.metadata.ingest_manifest.async_ingest_metadata_manifest" title="Permalink to 
this definition"></a></dt>
105105
<dd><p>Ingest all metadata records into a manifest csv</p>
106106
<dl class="field-list simple">
107107
<dt class="field-odd">Parameters<span class="colon">:</span></dt>

docs/howto/discoveryMetadataTools.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
**Table of Contents**
44

55
- [Overview](#overview)
6+
- [Export Discovery Metadata into File](#export-discovery-metadata-into-file)
7+
- [Publish Discovery Metadata from File](#publish-discovery-metadata-from-file)
68
- [DOIs in Gen3](#dois-in-gen3-discovery-metadata-and-page-for-visualizing-public-doi-metadata)
79
- [dbGaP FHIR Metadata in Gen3 Discovery](#combine-dbgap-fhir-metadata-with-current-discovery-metadata)
810

@@ -20,6 +22,48 @@ So you can choose to use the CLI or write your own Python script and use the SDK
2022
functions yourself. Generally this provides the most flexibility, at less
2123
of a convenience.
2224

25+
### Export Discovery Metadata into File
26+
Gen3's SDK can be used to export discovery metadata from a given Gen3 environment into a file by using the `output_expanded_discovery_metadata()` function. By default, this function queries for metadata with `guid_type=discovery_metadata` and exports the metadata into a TSV file. Users can also specify a different `guid_type` value for this operation, and/or choose to export the metadata into a JSON file. When using the TSV format, certain metadata fields will be flattened or "jsonified" so that each metadata record fits into one row.
27+
28+
Example of usage:
29+
```python
30+
from gen3.tools.metadata.discovery import (
31+
output_expanded_discovery_metadata,
32+
)
33+
from gen3.utils import get_or_create_event_loop_for_thread
34+
from gen3.auth import Gen3Auth
35+
36+
if __name__ == "__main__":
37+
auth = Gen3Auth()
38+
loop = get_or_create_event_loop_for_thread()
39+
loop.run_until_complete(
40+
output_expanded_discovery_metadata(
41+
auth, endpoint="GEN3_ENV_HOSTNAME", output_format="json"
42+
)
43+
)
44+
```
45+
46+
### Publish Discovery Metadata from File
47+
Gen3's SDK can also be used to publish discovery metadata to a target Gen3 environment from a file by using the `publish_discovery_metadata()` function. Ideally, the metadata file should originate from a metadata dump obtained by using the `output_expanded_discovery_metadata()` function.
48+
49+
Example of usage:
50+
```python
51+
from gen3.tools.metadata.discovery import (
52+
publish_discovery_metadata,
53+
)
54+
from gen3.utils import get_or_create_event_loop_for_thread
55+
from gen3.auth import Gen3Auth
56+
57+
if __name__ == "__main__":
58+
auth = Gen3Auth()
59+
loop = get_or_create_event_loop_for_thread()
60+
loop.run_until_complete(
61+
publish_discovery_metadata(
62+
auth, "./metadata.tsv", endpoint=HOSTNAME, guid_field="_hdp_uid"
63+
)
64+
)
65+
```
66+
2367
### DOIs in Gen3: Discovery Metadata and Page for Visualizing Public DOI Metadata
2468

2569
Gen3's SDK supports minting DOIs from DataCite, storing DOI metadata in a Gen3 instance,

gen3/cli/discovery.py

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def discovery():
2323
"--default-file",
2424
"use_default_file",
2525
is_flag=True,
26-
help="Publishes {commons}-discovery_metadata.tsv from current directory",
26+
help="Publishes {commons}-{guid_type}.tsv from current directory",
2727
show_default=True,
2828
)
2929
@click.option(
@@ -43,15 +43,25 @@ def discovery():
4343
),
4444
show_default=True,
4545
)
46+
@click.option(
47+
"--guid_field",
48+
"guid_field",
49+
help=(
50+
'The column / field name within the metadata that will be used as GUIDs, if not specified, will try to find a column \ field named "guid" from the metadata.'
51+
"If that field doesn't exists in a certain metadata record, that record will be skipped from publishing."
52+
),
53+
default=None,
54+
show_default=True,
55+
)
4656
@click.pass_context
47-
def discovery_publish(ctx, file, use_default_file, omit_empty, guid_type):
57+
def discovery_publish(ctx, file, use_default_file, omit_empty, guid_type, guid_field):
4858
"""
49-
Run a discovery metadata ingestion on a given metadata TSV file with guid column.
50-
If [FILE] is omitted and --default-file not set, prompts for TSV file name.
59+
Run a discovery metadata ingestion on a given metadata TSV / JSON file with guid column / field.
60+
If [FILE] is omitted and --default-file not set, prompts for TSV / JSON file name.
5161
"""
5262
auth = ctx.obj["auth_factory"].get()
5363
if not file and not use_default_file:
54-
file = click.prompt("Enter discovery metadata TSV file to publish")
64+
file = click.prompt("Enter discovery metadata TSV / JSON file to publish")
5565

5666
loop = get_or_create_event_loop_for_thread()
5767
endpoint = ctx.obj.get("endpoint")
@@ -62,6 +72,7 @@ def discovery_publish(ctx, file, use_default_file, omit_empty, guid_type):
6272
endpoint=endpoint,
6373
omit_empty_values=omit_empty,
6474
guid_type=guid_type,
75+
guid_field=guid_field,
6576
)
6677
)
6778

@@ -80,18 +91,46 @@ def discovery_publish(ctx, file, use_default_file, omit_empty, guid_type):
8091
help="use aggregate metadata service instead of the metadata service",
8192
show_default=True,
8293
)
94+
@click.option(
95+
"--guid_type",
96+
"guid_type",
97+
help="value of intended GUID type for query",
98+
default="discovery_metadata",
99+
show_default=True,
100+
)
101+
@click.option(
102+
"--output_format",
103+
"output_format",
104+
help="format of output file (can only be either tsv or json)",
105+
default="tsv",
106+
show_default=True,
107+
)
108+
@click.option(
109+
"--output_filename_suffix",
110+
"output_filename_suffix",
111+
help="additional suffix for the output file name",
112+
default="",
113+
show_default=True,
114+
)
83115
@click.pass_context
84-
def discovery_read(ctx, limit, agg):
116+
def discovery_read(ctx, limit, agg, guid_type, output_format, output_filename_suffix):
85117
"""
86-
Download the metadata used to populate a commons' discovery page into a TSV.
87-
Outputs the TSV filename with format {commons-url}-discovery_metadata.tsv
118+
Download the metadata used to populate a commons' discovery page into a TSV or JSON file.
119+
Outputs the TSV / JSON filename with format {commons-url}-{guid_type}.tsv/.json
120+
If "output_filename_suffix" exists, file name will be something like {commons-url}-{guid_type}-{output_filename_suffix}
88121
"""
89122
auth = ctx.obj["auth_factory"].get()
90123
loop = get_or_create_event_loop_for_thread()
91124
endpoint = ctx.obj.get("endpoint")
92125
output_file = loop.run_until_complete(
93126
output_expanded_discovery_metadata(
94-
auth, endpoint=endpoint, limit=limit, use_agg_mds=agg
127+
auth,
128+
endpoint=endpoint,
129+
limit=limit,
130+
use_agg_mds=agg,
131+
guid_type=guid_type,
132+
output_format=output_format,
133+
output_filename_suffix=output_filename_suffix,
95134
)
96135
)
97136

0 commit comments

Comments
 (0)