Skip to content

Commit

Permalink
Sprint task (#65)
Browse files Browse the repository at this point in the history
* adding dug parsers for kids first and cancer commons

* dir for new dag tasks

* adding steps for kfdrc and crdc

* bump dug version

* adding indexes

* changing create index for v2.8.4

* adding sprint parsers

* fixing merge issue

* adding new dug code

* correct sprint path

* bump dug version

* bump dug version

Co-authored-by: Yaphetkg <[email protected]>
  • Loading branch information
YaphetKG and Yaphetkg authored Oct 5, 2022
1 parent 3f09957 commit 7459128
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 3 deletions.
7 changes: 6 additions & 1 deletion dags/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

from dug_helpers.dug_utils import DugUtil, get_topmed_files, get_dbgap_files,\
get_nida_files, get_sparc_files, get_anvil_files,\
get_cancer_data_commons_files, get_kids_first_files
get_cancer_data_commons_files, get_kids_first_files,\
get_sprint_files
from roger.dag_util import default_args, create_python_task

DAG_ID = 'annotate_dug'
Expand Down Expand Up @@ -64,6 +65,10 @@
prepare_files = create_python_task(dag, "get_kids_first_files", get_kids_first_files)
annotate_files = create_python_task(dag, "annotate_kids_first_files",
DugUtil.annotate_kids_first_files)
elif data_set == "sprint":
prepare_files = create_python_task(dag, "get_sprint_files", get_sprint_files)
annotate_files = create_python_task(dag, "annotate_sprint_files",
DugUtil.annotate_sprint_files)
intro >> prepare_files
prepare_files >> clear_annotation_items
clear_annotation_items >> annotate_files
Expand Down
14 changes: 14 additions & 0 deletions dags/dug_helpers/dug_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,17 @@ def annotate_sparc_files(config=None, to_string=False, files=None):
output_log = dug.log_stream.getvalue() if to_string else ''
return output_log

@staticmethod
def annotate_sprint_files(config=None, to_string=False, files=None):
    """Annotate SPRINT files with the Dug parser named "SPRINT".

    :param config: Passed straight through to ``Dug``.
    :param to_string: Forwarded to ``Dug``; when truthy the contents of
        ``dug.log_stream`` are returned, otherwise an empty string.
    :param files: Files to annotate. When ``None`` the list is taken
        from ``Util.dug_sprint_objects()``.
    :returns: The captured log text, or ``''`` when ``to_string`` is falsy.
    """
    with Dug(config, to_string=to_string) as dug:
        # Fall back to the default file listing only when the caller
        # passed no explicit list.
        sprint_files = Util.dug_sprint_objects() if files is None else files
        dug.annotate_files(parser_name="SPRINT",
                           parsable_files=sprint_files)
        if to_string:
            captured_log = dug.log_stream.getvalue()
        else:
            captured_log = ''
    return captured_log

@staticmethod
def annotate_topmed_files(config=None, to_string=False, files=None):
with Dug(config, to_string=to_string) as dug:
Expand Down Expand Up @@ -877,6 +888,9 @@ def get_cancer_data_commons_files(config: RogerConfig, to_string=False) -> List[
return get_versioned_files(config, "crdc", "crdc", data_store=config.dug_inputs.data_source, unzip=True)


def get_sprint_files(config: RogerConfig, to_string=False) -> List[str]:
    """Fetch the versioned "sprint" input files via ``get_versioned_files``.

    The data store is read from ``config.dug_inputs.data_source`` and the
    files are requested with ``unzip=True``. ``to_string`` is accepted but
    not used by this function.
    """
    data_store = config.dug_inputs.data_source
    return get_versioned_files(
        config,
        "sprint",
        "sprint",
        data_store=data_store,
        unzip=True,
    )


def get_topmed_files(config: RogerConfig, to_string=False) -> List[str]:
    """Fetch the versioned "topmed" input files via ``get_versioned_files``.

    The data store is read from ``config.dug_inputs.data_source`` and the
    files are requested with ``unzip=False``. ``to_string`` is accepted but
    not used by this function.
    """
    data_store = config.dug_inputs.data_source
    return get_versioned_files(
        config,
        "topmed",
        "topmed",
        data_store=data_store,
        unzip=False,
    )
Expand Down
8 changes: 7 additions & 1 deletion dags/metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,10 @@ dug_inputs:
files:
s3:
- "bdc/v1.0/CRDC.tar.gz"
format: crdc
format: crdc
- name: sprint
version: v1.0
files:
s3:
- "sprint/v1.0/StanfordSPRINT_DataDictionary_2020-12-16.tar.gz"
format: sprint
12 changes: 12 additions & 0 deletions dags/roger/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,11 @@ def dug_anvil_path():
"""Anvil source files"""
return Util.dug_input_files_path('anvil')

@staticmethod
def dug_sprint_path():
    """SPRINT source files.

    Returns whatever ``Util.dug_input_files_path('sprint')`` resolves to.
    """
    return Util.dug_input_files_path('sprint')

@staticmethod
def dug_crdc_path():
"""Anvil source files"""
Expand Down Expand Up @@ -319,6 +324,13 @@ def dug_anvil_objects():
lambda file_name: not file_name.startswith('GapExchange_') and file_name.endswith('.xml'), file_path)
return sorted([str(f) for f in files])

@staticmethod
def dug_sprint_objects():
    """List the SPRINT ``.xml`` input files as sorted strings.

    Files are collected with ``Util.get_files_recursive`` starting from
    ``Util.dug_sprint_path()``, keeping only names that end in ``.xml``.
    """
    def is_xml(file_name):
        # Selection predicate handed to Util.get_files_recursive.
        return file_name.endswith('.xml')

    sprint_dir = Util.dug_sprint_path()
    matches = Util.get_files_recursive(is_xml, sprint_dir)
    return sorted(map(str, matches))

@staticmethod
def dug_crdc_objects():
file_path = Util.dug_crdc_path()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ redisgraph-bulk-loader==0.9.5
requests<2.24.0
pytest==6.2.2
PyYAML==5.3.1
git+https://github.com/helxplatform/[email protected].4
git+https://github.com/helxplatform/[email protected].5
elasticsearch==7.11.0
biolinkml>=1.5.10
orjson
Expand Down

0 comments on commit 7459128

Please sign in to comment.