From 3f05008b78b827dc1df068f2b74550fd02ac8809 Mon Sep 17 00:00:00 2001
From: Andras Stirling
Date: Fri, 16 Jul 2021 16:08:43 +0200
Subject: [PATCH] Fix get_relations function

Iterate over the data frame row by row so that the conditions and the
interventions that are combined into relations come from the same row,
instead of pairing every condition with the interventions of every row.

The target_id of the emitted relations is now the part of the row's URL
after the first 32 characters, rather than the grounded intervention id.

Also add a test that exercises get_relations and drop a blank line from
.gitignore.
---
Review notes (not part of the commit message):

 * The new test was originally also named test_get_nodes, which rebinds
   the existing test of that name in the same module so it is never
   collected; it is named test_get_relations here.
 * The unused loop variable `index` is spelled `_`.
 * NOTE(review): target_ns is still taken from the intervention grounding
   while target_id is the identifier sliced from the URL, and both
   relation types share the condition as source. Confirm this pairing is
   intended.
 * NOTE(review): row["URL"][32:] presumably relies on the fixed prefix
   "https://ClinicalTrials.gov/show/" (32 characters) shown in the
   commented-out id_pattern; verify against the input file.
 * The blob ids on the `index` lines are those of the original commit.

 .gitignore                                    |  1 -
 .../sources/clinicaltrials/__init__.py        | 48 ++++++++++---------
 tests/test_clinicaltrials.py                  |  7 +++
 3 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3d58b1a48..30ed74218 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,6 +129,5 @@ dmypy.json
 .pyre/
 
 import.report
-
 .idea
 .idea/*
\ No newline at end of file
diff --git a/src/indra_cogex/sources/clinicaltrials/__init__.py b/src/indra_cogex/sources/clinicaltrials/__init__.py
index 9adcd5dbb..4a412d1f5 100644
--- a/src/indra_cogex/sources/clinicaltrials/__init__.py
+++ b/src/indra_cogex/sources/clinicaltrials/__init__.py
@@ -5,6 +5,7 @@ from indra_cogex.representation import Node, Relation
 
 
 drug_pattern = re.compile(r"^Drug: ([a-zA-Z ]|\d)+$")
+#id_pattern = re.compile(r'^https://ClinicalTrials.gov/show/NCT(\d+)$')
 
 
 class ClinicaltrialsProcessor(Processor):
@@ -36,27 +37,30 @@ def get_nodes(self):
             )
 
     def get_relations(self):
-        for conditions in self.df["Conditions"]:
-            for condition in conditions.split("|"):
+        for _, row in self.df.iterrows():
+            for condition in row["Conditions"].split("|"):
                 cond_matches = gilda.ground(condition)
                 if cond_matches:
-                    for interventions in self.df["Interventions"]:
-                        if not pd.isna(interventions):
-                            for intervention in interventions.split("|"):
-                                if drug_pattern.match(intervention):
-                                    int_matches = gilda.ground(intervention[6:])
-                                    if int_matches:
-                                        yield Relation(
-                                            source_ns=cond_matches[0].term.db,
-                                            source_id=cond_matches[0].term.id,
-                                            target_ns=int_matches[0].term.db,
-                                            target_id=int_matches[0].term.id,
-                                            rel_type="has_trial",
-                                        )
-                                        yield Relation(
-                                            source_ns=cond_matches[0].term.db,
-                                            source_id=cond_matches[0].term.id,
-                                            target_ns=int_matches[0].term.db,
-                                            target_id=int_matches[0].term.id,
-                                            rel_type="tested_in",
-                                        )
+                    source_ns = cond_matches[0].term.db
+                    source_id = cond_matches[0].term.id
+                    if not pd.isna(row["Interventions"]):
+                        for intervention in row["Interventions"].split("|"):
+                            if drug_pattern.match(intervention):
+                                int_matches = gilda.ground(intervention[6:])
+                                if int_matches:
+                                    target_ns = int_matches[0].term.db
+                                    target_id = row["URL"][32:]
+                                    yield Relation(
+                                        source_ns=source_ns,
+                                        source_id=source_id,
+                                        target_ns=target_ns,
+                                        target_id=target_id,
+                                        rel_type="has_trial"
+                                    )
+                                    yield Relation(
+                                        source_ns=source_ns,
+                                        source_id=source_id,
+                                        target_ns=target_ns,
+                                        target_id=target_id,
+                                        rel_type="tested_in"
+                                    )
diff --git a/tests/test_clinicaltrials.py b/tests/test_clinicaltrials.py
index 6ff2c58d0..390cd90eb 100644
--- a/tests/test_clinicaltrials.py
+++ b/tests/test_clinicaltrials.py
@@ -7,3 +7,10 @@ def test_get_nodes():
     cp = ClinicaltrialsProcessor(path)
     nodes = list(cp.get_nodes())
     assert len(nodes) is not 0
+
+
+def test_get_relations():
+    path = os.path.join(os.path.dirname(__file__), "test_search_results.tsv")
+    cp = ClinicaltrialsProcessor(path)
+    relations = list(cp.get_relations())
+    # TODO: Test get_relations