Skip to content

Commit

Permalink
Merge pull request #9 from monarch-initiative/robot-template
Browse files Browse the repository at this point in the history
Robot template
  • Loading branch information
joeflack4 authored Aug 31, 2023
2 parents 3c33648 + 1dc9894 commit 1621375
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 18 deletions.
18 changes: 12 additions & 6 deletions .github/workflows/buid_and_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
name: Build and release

on:
# schedule:
# - cron: "0 0 * * 0" # weekly on Sunday at midnight
workflow_dispatch:
schedule:
- cron: "0 0 * * 0" # weekly on Sunday at midnight

jobs:
build_and_release:
Expand All @@ -15,7 +15,9 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: build
run: make all
# todo: reactivate `make all`, pending https://github.com/monarch-initiative/medgen/issues/11
# run: make all
run: make minimal
- name: Get current time
uses: josStorer/[email protected]
id: current-time
Expand All @@ -29,7 +31,11 @@ jobs:
automatic_release_tag: "${{ steps.current-time.outputs.formattedTime }}"
title: "${{ steps.current-time.outputs.formattedTime }}"
prerelease: false
# todo: add back `release/medgen-disease-extract.owl`, pending https://github.com/monarch-initiative/medgen/issues/11
# todo: add `medgen.sssom.tsv`, pending https://github.com/monarch-initiative/medgen/issues/6
# output/release/medgen-disease-extract.owl
# output/release/medgen.sssom.tsv
files: |
release/medgen.obo
release/medgen-disease-extract.obo
release/medgen-disease-extract.owl
output/release/medgen.obo
output/release/medgen-disease-extract.obo
output/release/medgen-xrefs.robot.template.tsv
42 changes: 30 additions & 12 deletions makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,34 @@
# Running `make all` will run the full pipeline. Note that if the FTP files have already been downloaded, it'll skip
# that part. In order to force re-download, run `make all -B`.
.DEFAULT_GOAL := all
.PHONY: all build stage stage-% analyze clean deploy-release
.PHONY: all build stage stage-% analyze clean deploy-release build-lite minimal

OBO=http://purl.obolibrary.org/obo
PRODUCTS=medgen-disease-extract.obo medgen-disease-extract.owl
TODAY ?=$(shell date +%Y-%m-%d)
VERSION=v$(TODAY)

# minimal: reduced pipeline; builds, stages and cleans up only the artefacts listed under build-lite / stage-lite.
# NOTE: `clean` is only ordered after `stage-lite` by make's left-to-right processing of prerequisites, so this goal
# should not be run with `-j`.
minimal: build-lite stage-lite clean

# stage-lite: move the artefacts of the minimal build into output/release/.
# These files are produced by `all` but not by `minimal`, so they are not staged here. Left for reference.
# See: https://github.com/monarch-initiative/medgen/issues/11
#   mv medgen-disease-extract.owl output/release/
#   mv medgen.sssom.tsv output/release/
# Depends on build-lite so the files exist before they are moved. Declared phony explicitly, because a `stage-%`
# entry in .PHONY is taken as a literal target name and does not cover `stage-lite`.
.PHONY: stage-lite
stage-lite: build-lite | output/release/
	mv medgen.obo output/release/
	mv medgen-disease-extract.obo output/release/
	mv medgen-xrefs.robot.template.tsv output/release/

# build-lite: the subset of build products that `minimal` creates.
build-lite: medgen-disease-extract.obo medgen-xrefs.robot.template.tsv

all: build stage clean analyze
# analyze: produces more than just this one file; the underlying goal creates multiple files
analyze: output/medgen_terms_mapping_status.tsv
build: $(PRODUCTS) medgen.sssom.tsv
build: $(PRODUCTS)
stage: $(patsubst %, stage-%, $(PRODUCTS))
mv medgen.obo output/release/
mv medgen.sssom.tsv output/release/
stage-%: % | output/release/
mv $< output/release/
clean:
rm medgen.obographs.json
rm uid2cui.tsv
rm *.obo
rm -f medgen.obographs.json
rm -f uid2cui.tsv
rm -f *.obo

# ----------------------------------------
# Setup dirs
Expand All @@ -36,18 +44,18 @@ output/release/:
# ----------------------------------------
# ETL
# ----------------------------------------
ftp.ncbi.nlm.nih.gov:
ftp.ncbi.nlm.nih.gov/:
wget -r -np ftp://ftp.ncbi.nlm.nih.gov/pub/medgen/ && touch $@

uid2cui.tsv:
uid2cui.tsv: ftp.ncbi.nlm.nih.gov/
./src/make_uid2cui.pl > $@

# ----------------------------------------
# Main artefacts
# ----------------------------------------
# Hacky conversion to obo ----------------
# Relies on MGCONSO.RRF.gz etc being made by 'ftp.ncbi.nlm.nih.gov' step
medgen.obo: ftp.ncbi.nlm.nih.gov uid2cui.tsv
# Relies on MGCONSO.RRF.gz etc being made by 'ftp.ncbi.nlm.nih.gov/' step
medgen.obo: ftp.ncbi.nlm.nih.gov/ uid2cui.tsv
./src/medgen2obo.pl > $@.tmp && mv $@.tmp $@

# We only care about diseases for now
Expand Down Expand Up @@ -92,5 +100,15 @@ tmp/input/mondo.sssom.tsv: | tmp/input/
wget http://purl.obolibrary.org/obo/mondo/mappings/mondo.sssom.tsv -O $@

# creates more than just this file; that goal creates multiple files
output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: | output/
output/medgen_terms_mapping_status.tsv output/obsoleted_medgen_terms_in_mondo.txt: tmp/input/mondo.sssom.tsv | output/
python src/mondo_mapping_status.py

# ----------------------------------------
# Robot templates
# ----------------------------------------
# Decompress the downloaded MedGen ID mapping file (input of the robot template step).
# Streams into a temp file and renames it, rather than decompressing in place with `gzip -d`: in-place
# decompression removes the .gz and gives the output the archive's (older) modification time, so make would
# consider this target out of date against the freshly touched download directory and re-run a recipe whose
# input no longer exists.
ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt: ftp.ncbi.nlm.nih.gov/
	gzip -dc $@.gz > $@.tmp && mv $@.tmp $@

# Robot template of MedGen xrefs, generated from the decompressed MedGen ID mapping file.
# todo: Ideally this would run at the end of the ingest and be derived from medgen.sssom.tsv, but there were some
#  problems with that file. Switching to it eventually feels like it makes more sense: it will already have been
#  pre-curated by disease, and some of the logic in this Python script is duplicative.
medgen-xrefs.robot.template.tsv: ftp.ncbi.nlm.nih.gov/pub/medgen/MedGenIDMappings.txt
	python src/mondo_robot_template.py --input-file $< --output-file $@
75 changes: 75 additions & 0 deletions src/mondo_robot_template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Medgen->Mondo robot template
Create a robot template to be used by Mondo to add MedGen xrefs curated by MedGen.
See also:
- PR: https://github.com/monarch-initiative/medgen/pull/9
- Used here: https://github.com/monarch-initiative/mondo/pull/6560
"""
from argparse import ArgumentParser
from copy import copy
from pathlib import Path
from typing import Dict, List

import pandas as pd

# Directory containing this script.
SRC_DIR = Path(__file__).parent
# Project root: the parent of the script directory.
PROJECT_DIR = SRC_DIR.parent
# Location of the MedGen files under the project root, following the FTP host/path layout.
FTP_DIR = PROJECT_DIR / "ftp.ncbi.nlm.nih.gov" / "pub" / "medgen"
# Default input: the MedGen ID mappings file (read as pipe-delimited text by run()).
INPUT_FILE = str(FTP_DIR / "MedGenIDMappings.txt")
# Default output: the ROBOT template TSV, written to the project root.
OUTPUT_FILE = str(PROJECT_DIR / "medgen-xrefs.robot.template.tsv")


def _prefixed_id_rows_from_common_df(source_df: pd.DataFrame, mondo_col='mondo_id', xref_col='xref_id') -> List[Dict]:
"""From worksheets having same common format, get prefixed xrefs for the namespaces we're looking to cover
Note: This same exact function is used in:
- mondo repo: medgen_conflicts_add_xrefs.py
- medgen repo: mondo_robot_template.py"""
df = copy(source_df)
df[xref_col] = df[xref_col].apply(
lambda x: f'MEDGENCUI:{x}' if x.startswith('CN') # "CUI Novel"
else f'UMLS:{x}' if x.startswith('C') # CUI: will be created twice: one for MEDGENCUI, one for UMLS
else f'MEDGEN:{x}') # UID
rows = df.to_dict('records')
rows2 = [{mondo_col: x[mondo_col], xref_col: x[xref_col].replace('UMLS', 'MEDGENCUI')} for x in rows if
x[xref_col].startswith('UMLS')]
return rows + rows2


def run(input_file: str = INPUT_FILE, output_file: str = OUTPUT_FILE):
    """Build the ROBOT template of MedGen xrefs for Mondo and write it as TSV.

    :param input_file: Path to the pipe-delimited MedGen mapping file; its '#CUI' column is treated as the xref id.
    :param output_file: Path the tab-separated template is written to.
    """
    mappings = pd.read_csv(input_file, sep='|')
    mappings = mappings.rename(columns={'#CUI': 'xref_id'})

    def _select(source_name: str, id_col: str) -> pd.DataFrame:
        """Rows of the given source, reduced to (source_id renamed to `id_col`, xref_id)."""
        subset = mappings[mappings['source'] == source_name]
        return subset[['source_id', 'xref_id']].rename(columns={'source_id': id_col})

    # Explicit MedGen (CUI, CN) -> Mondo mappings
    mondo_by_cui = _select('MONDO', 'mondo_id')
    cui_cn_xrefs = pd.DataFrame(_prefixed_id_rows_from_common_df(mondo_by_cui))

    # MedGen (UID) -> Mondo mappings, obtained by proxy: UID <-> CUI <-> MONDO
    uid_by_cui = _select('MedGen', 'medgen_uid')
    joined = pd.merge(mondo_by_cui, uid_by_cui, on='xref_id')
    uid_xrefs = joined[['mondo_id', 'medgen_uid']].rename(columns={'medgen_uid': 'xref_id'})
    uid_xrefs['xref_id'] = uid_xrefs['xref_id'].apply(lambda uid: f'MEDGEN:{uid}')

    # Combine, order and de-duplicate the data rows
    body = pd.concat([cui_cn_xrefs, uid_xrefs])
    body = body.sort_values(['xref_id', 'mondo_id'])
    body = body.drop_duplicates()

    # The first data row carries the ROBOT template strings; the column names become the TSV header line.
    template_header = pd.DataFrame([{'mondo_id': 'ID', 'xref_id': 'A oboInOwl:hasDbXref'}])
    pd.concat([template_header, body]).to_csv(output_file, index=False, sep='\t')

def cli():
    """Command line interface.

    Parses `-i/--input-file` and `-o/--output-file` (both default to the module-level paths) and passes them on
    to `run()`.
    """
    parser = ArgumentParser(
        # Fixed: the program name used to start with a stray double quote, which showed up in usage/help output.
        prog='Medgen->Mondo robot template',
        description='Create a robot template to be used by Mondo to add MedGen xrefs curated by MedGen.')
    parser.add_argument(
        '-i', '--input-file', default=INPUT_FILE, help='Mapping file sourced from MedGen')
    parser.add_argument(
        '-o', '--output-file', default=OUTPUT_FILE, help='ROBOT template to be used to add xrefs')
    # argparse stores the options as `input_file` / `output_file`, which match run()'s parameter names.
    run(**vars(parser.parse_args()))


if __name__ == '__main__':
    cli()

0 comments on commit 1621375

Please sign in to comment.