@@ -16,6 +16,7 @@ The docs here contain general descriptions of the different pieces of the SDK an
1616 - [ Gen3Auth] ( #gen3auth )
1717 - [ Gen3Index] ( #gen3index )
1818 - [ Gen3Submission] ( #gen3submission )
19+ - [Gen3Jobs](#gen3jobs)
1920- [ Indexing Tools] ( #indexing-tools )
2021 - [ Download Manifest] ( #download-manifest )
2122 - [ Verify Manifest] ( #verify-manifest )
@@ -26,6 +27,7 @@ The docs here contain general descriptions of the different pieces of the SDK an
2627 - [ Manifest Merge] ( #manifest-merge )
2728 - [ Ideal Scenario \( Column to Column Match, Indexing: Metadata Manifest Rows\) ] ( #ideal-scenario-column-to-column-match-indexingmetadata-manifest-rows )
2829 - [ Non-Ideal Scenario \( Partial URL Matching\) ] ( #non-ideal-scenario-partial-url-matching )
30+ - [Using Gen3 Jobs](#using-gen3-jobs)
2931
3032---
3133
@@ -134,6 +136,145 @@ This is the client for interacting with the Indexd service for GUID brokering an
134136
135137This is the client for interacting with the Gen3 submission service including GraphQL queries.
136138
139+ ### Gen3Jobs
140+
141+ This is the client for interacting with Gen3's job dispatching service. A complex example script which calls a job that combines dbGaP data with indexed file objects can be seen below:
142+
143+
144+ ``` python
145+ import sys
146+ import logging
147+ import asyncio
148+
149+ from gen3.index import Gen3Index
150+ from gen3.auth import Gen3Auth
151+ from gen3.jobs import Gen3Jobs, DBGAP_METADATA_JOB , INGEST_METADATA_JOB
152+
153+ # Gen3 Commons URL
154+ COMMONS = "https://example.org/"
155+
156+ # An API Key downloaded from the above commons' "Profile" page
157+ API_KEY_FILEPATH = "credentials.json"
158+
159+ logging.basicConfig(filename = " output.log" , level = logging.INFO )
160+ logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
161+
162+
163+ def metadata_ingest ():
164+ auth = Gen3Auth(COMMONS , refresh_file = API_KEY_FILEPATH )
165+ jobs = Gen3Jobs(COMMONS , auth_provider = auth)
166+
167+ job_input = {
168+ " URL" : " https://cdistest-public-test-bucket.s3.amazonaws.com/04_28_20_21_55_13_merged_metadata_manifest.tsv" ,
169+ " metadata_source" : " dbgaptest" ,
170+ }
171+
172+ loop = asyncio.new_event_loop()
173+ asyncio.set_event_loop(loop)
174+
175+ job_output = loop.run_until_complete(
176+ jobs.async_run_job_and_wait(job_name = INGEST_METADATA_JOB , job_input = job_input)
177+ )
178+ print (job_output)
179+
180+
181+ def main ():
182+ auth = Gen3Auth(COMMONS , refresh_file = API_KEY_FILEPATH )
183+ jobs = Gen3Jobs(COMMONS , auth_provider = auth)
184+
185+ job_input = {
186+ " phsid_list" : " phs000920 phs000921 phs000946 phs000951 phs000954 phs000956 phs000964 phs000972 phs000974 phs000988 phs000993 phs000997 phs001024 phs001032 phs001040 phs001062 phs001143 phs001189 phs001207 phs001211 phs001215 phs001217 phs001218 phs001237 phs001293 phs001345 phs001359 phs001368 phs001387 phs001402 phs001412 phs001416" ,
187+ " indexing_manifest_url" : " https://cdistest-public-test-bucket.s3.amazonaws.com/release_manifest_no_dbgap_no_sample.csv" ,
188+ " manifests_mapping_config" : {
189+ " guid_column_name" : " guid" ,
190+ " row_column_name" : " submitted_sample_id" ,
191+ " indexing_manifest_column_name" : " gcp_uri" ,
192+ },
193+ " partial_match_or_exact_match" : " partial_match" ,
194+ }
195+
196+ loop = asyncio.new_event_loop()
197+ asyncio.set_event_loop(loop)
198+
199+ job_output = loop.run_until_complete(
200+ jobs.async_run_job_and_wait(job_name = DBGAP_METADATA_JOB , job_input = job_input)
201+ )
202+ print (job_output)
203+
204+
205+ if __name__ == " __main__" :
206+ metadata_ingest()
207+
208+ ```
209+
210+ ``` python
211+ import sys
212+ import logging
213+ import asyncio
214+
215+ from gen3.auth import Gen3Auth
216+ from gen3.jobs import Gen3Jobs, DBGAP_METADATA_JOB
217+
218+ # Gen3 Commons URL
219+ COMMONS = "https://example.net/"
220+
221+ # An API Key downloaded from the above commons' "Profile" page
222+ API_KEY_FILEPATH = "credentials.json"
223+
224+ logging.basicConfig(filename = " output.log" , level = logging.INFO )
225+ logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
226+
227+ # NOTE: The indexing_manifest_url must exist and be publicly accessible
228+ JOB_INPUT = {
229+ " phsid_list" : " phs000956 phs000920" ,
230+ " indexing_manifest_url" : " https://example.com/public_indexing_manifest.csv" ,
231+ " manifests_mapping_config" : {
232+ " guid_column_name" : " guid" ,
233+ " row_column_name" : " submitted_sample_id" ,
234+ " indexing_manifest_column_name" : " urls" ,
235+ },
236+ " partial_match_or_exact_match" : " partial_match" ,
237+ }
238+
239+
240+ def example_async_run_job ():
241+ auth = Gen3Auth(COMMONS , refresh_file = API_KEY_FILEPATH )
242+ jobs = Gen3Jobs(COMMONS , auth_provider = auth)
243+
244+ loop = asyncio.new_event_loop()
245+ asyncio.set_event_loop(loop)
246+
247+ job_output = loop.run_until_complete(
248+ jobs.async_run_job_and_wait(job_name = DBGAP_METADATA_JOB , job_input = JOB_INPUT )
249+ )
250+ print (job_output)
251+
252+ def example_non_async_run_job ():
253+ auth = Gen3Auth(COMMONS , refresh_file = API_KEY_FILEPATH )
254+ jobs = Gen3Jobs(COMMONS , auth_provider = auth)
255+
256+ is_healthy = jobs.is_healthy()
257+ print (is_healthy)
258+
259+ version = jobs.get_version()
260+ print (version)
261+
262+ create_job = jobs.create_job(job_name = DBGAP_METADATA_JOB , job_input = JOB_INPUT )
263+ print (create_job)
264+
265+ status = "Running"
266+ while status == "Running":
267+ status = jobs.get_status(create_job.get("uid")).get("status")
268+ print (status)
269+
270+ get_output = jobs.get_output(create_job.get(" uid" ))
271+ print (get_output)
272+
273+
274+ if __name__ == " __main__" :
275+ example_async_run_job()
276+ ```
277+
137278## Metadata
138279
139280For interacting with Gen3's metadata service.
@@ -787,3 +928,97 @@ The final output file will contain all the columns from the metadata manifest in
787928```
788929guid, submitted_sample_id, dbgap_subject_id, consent_short_name, body_site, ....
789930```
931+
932+ ### Using Gen3 Jobs
933+
934+ There are some Gen3 jobs that were tailored for metadata ingestion and for retrieving metadata from a public dbGaP API. The following are some example scripts that could be useful for utilizing those new jobs:
935+
936+ > NOTE: All of these jobs require specific permissions in the Gen3 environment
937+
938+ ``` python
939+ import sys
940+ import logging
941+ import asyncio
942+
943+ from gen3.index import Gen3Index
944+ from gen3.auth import Gen3Auth
945+ from gen3.jobs import Gen3Jobs, DBGAP_METADATA_JOB , INGEST_METADATA_JOB
946+
947+ # Gen3 Commons URL
948+ COMMONS = "https://example.net/"
949+
950+ # An API Key downloaded from the above commons' "Profile" page
951+ API_KEY_FILEPATH = "credentials.json"
952+
953+ logging.basicConfig(filename = " output.log" , level = logging.INFO )
954+ logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
955+
956+ def get_dbgap_merged_metadata_manifest ():
957+ auth = Gen3Auth(COMMONS , refresh_file = API_KEY_FILEPATH )
958+ jobs = Gen3Jobs(COMMONS , auth_provider = auth)
959+
960+ # this configuration tells the job to pull sample information from the public dbgap
961+ # api for the list of dbgap phsids (AKA study accession numbers) provided.
962+ #
963+ # The indexing_manifest_url is a publicly available indexing manifest with, at
964+ # a minimum, columns representing the GUID and some other field we can map to
965+ # a field from dbgap, in this example, we're doing a partial string match of
966+ # "submitted_sample_id" from dbgap to the indexing manifest's "urls" column
967+ #
968+ # If there is an exact match available, you can set "partial_match_or_exact_match"
969+ # to "exact_match" and this will perform the merging MUCH faster
970+ job_input = {
971+ " phsid_list" : " phs000920 phs000921 phs000946 phs000951 phs000954 phs000956 phs000964 phs000972 phs000974 phs000988 phs000993 phs000997 phs001024 phs001032 phs001040 phs001062 phs001143 phs001189 phs001207 phs001211 phs001215 phs001217 phs001218 phs001237 phs001293 phs001345 phs001359 phs001368 phs001387 phs001402 phs001412 phs001416" ,
972+ " indexing_manifest_url" : " https://example-test-bucket.s3.amazonaws.com/indexing_manifest_with_guids.csv" ,
973+ " manifests_mapping_config" : {
974+ " guid_column_name" : " guid" ,
975+ " row_column_name" : " submitted_sample_id" ,
976+ " indexing_manifest_column_name" : " urls" ,
977+ },
978+ " partial_match_or_exact_match" : " partial_match" ,
979+ }
980+
981+ loop = asyncio.new_event_loop()
982+ asyncio.set_event_loop(loop)
983+
984+ job_output = loop.run_until_complete(
985+ jobs.async_run_job_and_wait(job_name = DBGAP_METADATA_JOB , job_input = job_input)
986+ )
987+
988+ # output contains signed URLs to download the new merged metadata manifest
989+ print (job_output)
990+
991+
992+ def metadata_ingest ():
993+ auth = Gen3Auth(COMMONS , refresh_file = API_KEY_FILEPATH )
994+ jobs = Gen3Jobs(COMMONS , auth_provider = auth)
995+
996+ # provide a URL for a manifest that contains a GUID column along with arbitrary
997+ # other columns to add to the metadata service. The "metadata_source" namespaces
998+ # this data in the metadata service to support multiple different sources of metadata
999+ #
1000+ # For example, this will create a metadata blob like:
1001+ # {"dbgap": {"colA": "valA", "colB": "valB"}}
1002+ job_input = {
1003+ " URL" : " https://example-bucket/merged_metadata_manifest.tsv" ,
1004+ " metadata_source" : " dbgap" ,
1005+ }
1006+
1007+ loop = asyncio.new_event_loop()
1008+ asyncio.set_event_loop(loop)
1009+
1010+ job_output = loop.run_until_complete(
1011+ jobs.async_run_job_and_wait(job_name = INGEST_METADATA_JOB , job_input = job_input)
1012+ )
1013+ print (job_output)
1014+
1015+
1016+ if __name__ == " __main__" :
1017+ get_dbgap_merged_metadata_manifest()
1018+
1019+ # TODO: QA the manifest from the above step, make it available to the next job for
1020+ # actual ingestion into the metadata service
1021+
1022+ metadata_ingest()
1023+
1024+ ```
0 commit comments