Skip to content

Commit 23a5ecd

Browse files
feat(jobs): add support for Gen3 jobs API, update docs (#41)
* feat(jobs): add support for Gen3 jobs API, update docs * Apply automatic documentation changes * chore(readme): use generic url * fix(tests): add actual tests * Apply automatic documentation changes * chore(readme): cleanup commented out code * Apply automatic documentation changes Co-authored-by: Alexander VT <[email protected]>
1 parent 97603fd commit 23a5ecd

31 files changed

+1613
-133
lines changed

README.md

Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ The docs here contain general descriptions of the different pieces of the SDK an
1616
- [Gen3Auth](#gen3auth)
1717
- [Gen3Index](#gen3index)
1818
- [Gen3Submission](#gen3submission)
19+
- [Gen3Jobs](#gen3jobs)
1920
- [Indexing Tools](#indexing-tools)
2021
- [Download Manifest](#download-manifest)
2122
- [Verify Manifest](#verify-manifest)
@@ -26,6 +27,7 @@ The docs here contain general descriptions of the different pieces of the SDK an
2627
- [Manifest Merge](#manifest-merge)
2728
- [Ideal Scenario \(Column to Column Match, Indexing:Metadata Manifest Rows\)](#ideal-scenario-column-to-column-match-indexingmetadata-manifest-rows)
2829
- [Non-Ideal Scenario \(Partial URL Matching\)](#non-ideal-scenario-partial-url-matching)
30+
- [Using Gen3 Jobs](#using-gen3-jobs)
2931

3032
---
3133

@@ -134,6 +136,145 @@ This is the client for interacting with the Indexd service for GUID brokering an
134136

135137
This is the client for interacting with the Gen3 submission service including GraphQL queries.
136138

139+
### Gen3Jobs
140+
141+
This is the client for interacting with Gen3's job dispatching service. A complex example script that calls a job combining dbGaP data with indexed file objects can be seen below:
142+
143+
144+
```python
145+
import sys
146+
import logging
147+
import asyncio
148+
149+
from gen3.index import Gen3Index
150+
from gen3.auth import Gen3Auth
151+
from gen3.jobs import Gen3Jobs, DBGAP_METADATA_JOB, INGEST_METADATA_JOB
152+
153+
# Gen3 Commons URL
154+
COMMONS = "https://example.org/"
155+
156+
# An API Key downloaded from the above commons' "Profile" page
157+
API_KEY_FILEPATH = "credentials.json"
158+
159+
logging.basicConfig(filename="output.log", level=logging.INFO)
160+
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
161+
162+
163+
def metadata_ingest():
164+
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
165+
jobs = Gen3Jobs(COMMONS, auth_provider=auth)
166+
167+
job_input = {
168+
"URL": "https://cdistest-public-test-bucket.s3.amazonaws.com/04_28_20_21_55_13_merged_metadata_manifest.tsv",
169+
"metadata_source": "dbgaptest",
170+
}
171+
172+
loop = asyncio.new_event_loop()
173+
asyncio.set_event_loop(loop)
174+
175+
job_output = loop.run_until_complete(
176+
jobs.async_run_job_and_wait(job_name=INGEST_METADATA_JOB, job_input=job_input)
177+
)
178+
print(job_output)
179+
180+
181+
def main():
182+
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
183+
jobs = Gen3Jobs(COMMONS, auth_provider=auth)
184+
185+
job_input = {
186+
"phsid_list": "phs000920 phs000921 phs000946 phs000951 phs000954 phs000956 phs000964 phs000972 phs000974 phs000988 phs000993 phs000997 phs001024 phs001032 phs001040 phs001062 phs001143 phs001189 phs001207 phs001211 phs001215 phs001217 phs001218 phs001237 phs001293 phs001345 phs001359 phs001368 phs001387 phs001402 phs001412 phs001416",
187+
"indexing_manifest_url": "https://cdistest-public-test-bucket.s3.amazonaws.com/release_manifest_no_dbgap_no_sample.csv",
188+
"manifests_mapping_config": {
189+
"guid_column_name": "guid",
190+
"row_column_name": "submitted_sample_id",
191+
"indexing_manifest_column_name": "gcp_uri",
192+
},
193+
"partial_match_or_exact_match": "partial_match",
194+
}
195+
196+
loop = asyncio.new_event_loop()
197+
asyncio.set_event_loop(loop)
198+
199+
job_output = loop.run_until_complete(
200+
jobs.async_run_job_and_wait(job_name=DBGAP_METADATA_JOB, job_input=job_input)
201+
)
202+
print(job_output)
203+
204+
205+
if __name__ == "__main__":
206+
metadata_ingest()
207+
208+
```
209+
210+
```python
211+
import sys
212+
import logging
213+
import asyncio
214+
215+
from gen3.auth import Gen3Auth
216+
from gen3.jobs import Gen3Jobs, DBGAP_METADATA_JOB
217+
218+
# Gen3 Commons URL
219+
COMMONS = "https://example.net/"
220+
221+
# An API Key downloaded from the above commons' "Profile" page
222+
API_KEY_FILEPATH = "credentials.json"
223+
224+
logging.basicConfig(filename="output.log", level=logging.INFO)
225+
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
226+
227+
# NOTE: The indexing_manifest_url must exist and be publicly accessible
228+
JOB_INPUT = {
229+
"phsid_list": "phs000956 phs000920",
230+
"indexing_manifest_url": "https://example.com/public_indexing_manifest.csv",
231+
"manifests_mapping_config": {
232+
"guid_column_name": "guid",
233+
"row_column_name": "submitted_sample_id",
234+
"indexing_manifest_column_name": "urls",
235+
},
236+
"partial_match_or_exact_match": "partial_match",
237+
}
238+
239+
240+
def example_async_run_job():
241+
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
242+
jobs = Gen3Jobs(COMMONS, auth_provider=auth)
243+
244+
loop = asyncio.new_event_loop()
245+
asyncio.set_event_loop(loop)
246+
247+
job_output = loop.run_until_complete(
248+
jobs.async_run_job_and_wait(job_name=DBGAP_METADATA_JOB, job_input=JOB_INPUT)
249+
)
250+
print(job_output)
251+
252+
def example_non_async_run_job():
253+
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
254+
jobs = Gen3Jobs(COMMONS, auth_provider=auth)
255+
256+
is_healthy = jobs.is_healthy()
257+
print(is_healthy)
258+
259+
version = jobs.get_version()
260+
print(version)
261+
262+
create_job = jobs.create_job(job_name=DBGAP_METADATA_JOB, job_input=JOB_INPUT)
263+
print(create_job)
264+
265+
status = "Running"
266+
while status == "Running":
267+
status = jobs.get_status(create_job.get("uid")).get("status")
268+
print(status)
269+
270+
get_output = jobs.get_output(create_job.get("uid"))
271+
print(get_output)
272+
273+
274+
if __name__ == "__main__":
275+
example_async_run_job()
276+
```
277+
137278
## Metadata
138279

139280
For interacting with Gen3's metadata service.
@@ -787,3 +928,97 @@ The final output file will contain all the columns from the metadata manifest in
787928
```
788929
guid, submitted_sample_id, dbgap_subject_id, consent_short_name, body_site, ....
789930
```
931+
932+
### Using Gen3 Jobs
933+
934+
Some Gen3 jobs are tailored for metadata ingestion and for retrieving metadata from the public dbGaP API. The following example scripts show how to use those jobs:
935+
936+
> NOTE: All of these jobs require specific permissions in the Gen3 environment
937+
938+
```python
939+
import sys
940+
import logging
941+
import asyncio
942+
943+
from gen3.index import Gen3Index
944+
from gen3.auth import Gen3Auth
945+
from gen3.jobs import Gen3Jobs, DBGAP_METADATA_JOB, INGEST_METADATA_JOB
946+
947+
# Gen3 Commons URL
948+
COMMONS = "https://example.net/"
949+
950+
# An API Key downloaded from the above commons' "Profile" page
951+
API_KEY_FILEPATH = "credentials.json"
952+
953+
logging.basicConfig(filename="output.log", level=logging.INFO)
954+
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
955+
956+
def get_dbgap_merged_metadata_manifest():
957+
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
958+
jobs = Gen3Jobs(COMMONS, auth_provider=auth)
959+
960+
# this configuration tells the job to pull sample information from the public dbgap
961+
# api for the list of dbgap phsids (AKA study accession numbers) provided.
962+
#
963+
# The indexing_manifest_url is a publicly available indexing manifest with at
964+
# a minimum columns to represent the GUID and some other field we can map to
965+
# a field from dbgap, in this example, we're doing a partial string match of
966+
# "submitted_sample_id" from dbgap to the indexing manifest's "urls" column
967+
#
968+
# If there is an exact match available, you can set "partial_match_or_exact_match"
969+
# to "exact_match" and this will perform the merging MUCH faster
970+
job_input = {
971+
"phsid_list": "phs000920 phs000921 phs000946 phs000951 phs000954 phs000956 phs000964 phs000972 phs000974 phs000988 phs000993 phs000997 phs001024 phs001032 phs001040 phs001062 phs001143 phs001189 phs001207 phs001211 phs001215 phs001217 phs001218 phs001237 phs001293 phs001345 phs001359 phs001368 phs001387 phs001402 phs001412 phs001416",
972+
"indexing_manifest_url": "https://example-test-bucket.s3.amazonaws.com/indexing_manifest_with_guids.csv",
973+
"manifests_mapping_config": {
974+
"guid_column_name": "guid",
975+
"row_column_name": "submitted_sample_id",
976+
"indexing_manifest_column_name": "urls",
977+
},
978+
"partial_match_or_exact_match": "partial_match",
979+
}
980+
981+
loop = asyncio.new_event_loop()
982+
asyncio.set_event_loop(loop)
983+
984+
job_output = loop.run_until_complete(
985+
jobs.async_run_job_and_wait(job_name=DBGAP_METADATA_JOB, job_input=job_input)
986+
)
987+
988+
# output contains signed URLs to download the new merged metadata manifest
989+
print(job_output)
990+
991+
992+
def metadata_ingest():
993+
auth = Gen3Auth(COMMONS, refresh_file=API_KEY_FILEPATH)
994+
jobs = Gen3Jobs(COMMONS, auth_provider=auth)
995+
996+
# provide a URL for a manifest that contains a GUID column along with arbitrary
997+
# other columns to add to the metadata service. The "metadata_source" namespaces
998+
# this data in the metadata service to support multiple different sources of metadata
999+
#
1000+
# For example, this will create a metadata blob like:
1001+
# {"dbgap": {"colA": "valA", "colB": "valB"}}
1002+
job_input = {
1003+
"URL": "https://example-bucket/merged_metadata_manifest.tsv",
1004+
"metadata_source": "dbgap",
1005+
}
1006+
1007+
loop = asyncio.new_event_loop()
1008+
asyncio.set_event_loop(loop)
1009+
1010+
job_output = loop.run_until_complete(
1011+
jobs.async_run_job_and_wait(job_name=INGEST_METADATA_JOB, job_input=job_input)
1012+
)
1013+
print(job_output)
1014+
1015+
1016+
if __name__ == "__main__":
1017+
get_dbgap_merged_metadata_manifest()
1018+
1019+
# TODO: QA the manifest from the above step, make it available to the next job for
1020+
# actual ingestion into the metadata service
1021+
1022+
metadata_ingest()
1023+
1024+
```

docs/_build/html/_modules/gen3/auth.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,9 @@ <h3>Navigation</h3>
179179
<ul>
180180
<li class="toctree-l1"><a class="reference internal" href="../../auth.html">Gen3 Auth Helper</a></li>
181181
<li class="toctree-l1"><a class="reference internal" href="../../file.html">Gen3 File Class</a></li>
182+
<li class="toctree-l1"><a class="reference internal" href="../../indexing.html">Gen3 Index Class</a></li>
183+
<li class="toctree-l1"><a class="reference internal" href="../../jobs.html">Gen3 Jobs Class</a></li>
184+
<li class="toctree-l1"><a class="reference internal" href="../../metadata.html">Gen3 Metadata Class</a></li>
182185
<li class="toctree-l1"><a class="reference internal" href="../../submission.html">Gen3 Submission Class</a></li>
183186
<li class="toctree-l1"><a class="reference internal" href="../../tools.html">Gen3 Tools</a></li>
184187
</ul>

docs/_build/html/_modules/gen3/file.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ <h3>Navigation</h3>
114114
<ul>
115115
<li class="toctree-l1"><a class="reference internal" href="../../auth.html">Gen3 Auth Helper</a></li>
116116
<li class="toctree-l1"><a class="reference internal" href="../../file.html">Gen3 File Class</a></li>
117+
<li class="toctree-l1"><a class="reference internal" href="../../indexing.html">Gen3 Index Class</a></li>
118+
<li class="toctree-l1"><a class="reference internal" href="../../jobs.html">Gen3 Jobs Class</a></li>
119+
<li class="toctree-l1"><a class="reference internal" href="../../metadata.html">Gen3 Metadata Class</a></li>
117120
<li class="toctree-l1"><a class="reference internal" href="../../submission.html">Gen3 Submission Class</a></li>
118121
<li class="toctree-l1"><a class="reference internal" href="../../tools.html">Gen3 Tools</a></li>
119122
</ul>

0 commit comments

Comments
 (0)