Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Add Finngen mappings version as a parameter to the disease LUT script #24

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Changes made (Jan 2019):

```bash
# Set parameters.
export INSTANCE_NAME=v2d_data
export INSTANCE_NAME=v2d-data
export INSTANCE_ZONE=europe-west1-d

# Create the instance and SSH.
Expand All @@ -50,8 +50,13 @@ sudo apt install -yf \
openjdk-13-jre-headless \
python3-pip
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh
bash ~/miniconda.sh -b
~/miniconda3/bin/conda init
exec bash

# Clone repo
git clone https://github.com/opentargets/genetics-v2d-data.git
cd genetics-v2d-data

# Install dependencies into isolated environment
conda env create -n v2d_data --file environment.yaml
Expand All @@ -68,7 +73,6 @@ rm -r www.ebi.ac.uk/gwas/
# (I've gotten snakemake problems on subsequent attempts when this happens too)
tmux


# May want to use a smaller machine for step 1, then scale up to more
# cores for step 2, and back down to a small machine for step 3
export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell"
Expand Down
7 changes: 4 additions & 3 deletions configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,20 @@ gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this pr

# Summary statistics finemapping pipeline output files
sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas'
toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz'
credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCSS file instead to trick snakemake
toploci: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/top_loci.json.gz'
credsets: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCESS file instead to trick snakemake

# UK Biobank (Neale V2 and SAIGE) sumstat manifest files
ukb_manifest: 'gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_manifest.190430.tsv'

# FINNGEN sumstat manifest file
FINNGEN_manifest: "https://r6.finngen.fi/api/phenos"
FINNGEN_version: 6

#Config files for disease mapping lut:
ukb_efo_original_curation: 'gs://genetics-portal-input/ukb_phenotypes/ukb_efo_annotation.190828.json'
ukb_efo_updated_curation: 'docs.google.com/spreadsheets/d/1PotmUEirkV36dh-vpZ3GgxQg_LcOefZKbyTq0PNQ6NY/edit?usp=sharing'
FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/e/2PACX-1vR4Dh1UVeLZ7TtmpU-QzRwO4GPzGt_3j9nMp5hn0R1Z_JGpwgAgU155UPsNwJcKdA1ra7nee-l7iBiz/pub?output=csv&gid=1853278839&single=true"
FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing"

# LD table
url_1000G: 'gs://genetics-portal-input/1000Genomes_phase3/plink_format_b38'
Expand Down
34 changes: 18 additions & 16 deletions scripts/make_FINNGEN_study_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,26 @@ def main(input_path: str, output_path: str) -> None:

# Read manifest
manifest = (
pd.read_json(input_path, orient='records')
.filter(items=['phenocode', 'phenosring', 'category', 'num_cases', 'num_controls'])

pd.read_json(input_path, orient='records').filter(
items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls']
)
# When phenostring is not provided, the phenotype is extracted from the phenocode
.assign(phenostring=lambda df: df.apply(
lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'],
axis=1)
.assign(
phenostring=lambda df: df.apply(
lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'],
axis=1,
)
)

# Renaming columns to accommodate OTG schema:
.rename(columns={
'phenocode': 'study_id',
'phenostring': 'trait',
'category': 'trait_category',
'num_cases': 'n_cases',
'num_controls': 'n_controls',
})
.rename(
columns={
'phenocode': 'study_id',
'phenostring': 'trait',
'category': 'trait_category',
'num_cases': 'n_cases',
'num_controls': 'n_controls',
}
)
)

logging.info(f"{input_path} has been loaded. Formatting...")
Expand Down Expand Up @@ -83,8 +86,7 @@ def parse_args():
parser.add_argument('--input', metavar="<str>", type=str, required=True)
parser.add_argument('--output', metavar="<str>", help=("Output"), type=str, required=True)

args = parser.parse_args()
return args
return parser.parse_args()


if __name__ == '__main__':
Expand Down
49 changes: 36 additions & 13 deletions scripts/make_disease_mapping_lut.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,18 @@


def main(
studies: str, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str
studies: str,
finngen_mappings: str,
finngen_version: int,
ukb_original_mappings: str,
ukb_updated_mappings: str,
output_path: str,
) -> None:

# 1. Extract mappings per data source GWAS catalog traits from study table (these do not require OT mapping)
gwas_catalog_mappings = get_gwas_catalog_mappings(studies)
valid_ukb = get_ukb_mappings(ukb_original_mappings, ukb_updated_mappings)
valid_finngen = get_finngen_mappings(finngen_mappings)
valid_finngen = get_finngen_mappings(finngen_version, finngen_mappings)
# Assert there are no studies with a null mapping
for source in [gwas_catalog_mappings, valid_ukb, valid_finngen]:
if 'proposed_efos' not in source.columns:
Expand Down Expand Up @@ -162,8 +167,22 @@ def get_ukb_original_mappings(ukb_original_mappings: str) -> pd.DataFrame:
)


def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame:
"""Extracts Finngen trait mappings from the curation spreadsheet."""
def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.DataFrame:
"""
Extracts Finngen trait mappings from the curation spreadsheet

Args:
finngen_version (int): The version of the Finngen data that you are using.
finngen_mappings (str): The path to the Finngen trait mappings spreadsheet.

Returns:
A dataframe with the following columns:
- study_id
- trait_reported
- proposed_efos
"""

version = f'FINNGEN_R{finngen_version}_'

return (
read_input_file(finngen_mappings)
Expand All @@ -177,23 +196,20 @@ def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame:
.reset_index()
.rename(columns={'NAME': 'study_name', 'LONGNAME': 'trait_reported', 'efo_cls': 'proposed_efos'})
.explode('trait_reported')
.assign(study_id=lambda x: 'FINNGEN_R5_' + x.study_name)
.assign(study_id=lambda x: version + x.study_name)
.drop('study_name', axis=1)
)


def build_therapeutic_areas(genetics_mappings: pd.DataFrame) -> pd.DataFrame:
"""Therapeutic areas per trait are built into the mappings table."""

efo_tas_df = extract_therapeutic_areas_from_owl()
genetics_mappings_w_trait = genetics_mappings.merge(
efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left'
).drop('efo_id', axis=1)

return genetics_mappings_w_trait
return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop(
'efo_id', axis=1
)


def flatten_array(arr: List) -> List:
def flatten_array(arr: List) -> List: # sourcery skip: use-contextlib-suppress
"""Flattens a bidimensional array."""
try:
return [i for sublist in arr for i in sublist]
Expand All @@ -208,7 +224,13 @@ def flatten_array(arr: List) -> List:
'--finngen_mappings',
help='URL of the spreadsheet that contains all Finngen disease mappings',
nargs='?',
default='https://docs.google.com/spreadsheets/d/1yrQPpsRi-mijs_BliKFZjeoxP6kGIs9Bz-02_0WDvAA/edit?usp=sharing',
default='https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing',
)
parser.add_argument(
'--finngen_version',
help='The version of the Finngen manifest the study table is based on.',
required=True,
type=int,
)
parser.add_argument(
'--ukb_original_mappings',
Expand Down Expand Up @@ -241,6 +263,7 @@ def flatten_array(arr: List) -> List:
main(
studies=args.studies,
finngen_mappings=args.finngen_mappings,
finngen_version=args.finngen_version,
ukb_original_mappings=args.ukb_original_mappings,
ukb_updated_mappings=args.ukb_updated_mappings,
output_path=args.output,
Expand Down
3 changes: 3 additions & 0 deletions snakefiles/study_and_top_loci_tables.Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ rule study_table_to_parquet:
rule make_disease_mappings_lut:
''' Build LUT that integrates all the disease mappings
studies: merged study table in parquet format
finngen_version: version of the Finngen manifest
finngen_mappings: curation recorded in Google Sheets
ukb_original_mappings: initial UK Biobank disease curation
ukb_updated_curation: updated mappings resulting from upgrading to EFO3
Expand All @@ -267,6 +268,7 @@ rule make_disease_mappings_lut:

params:
finngen_mappings = config['FINNGEN_efo_curation'],
finngen_version = config['FINNGEN_version']

output:
'output/{version}/trait_efo.parquet'
Expand All @@ -275,6 +277,7 @@ rule make_disease_mappings_lut:
wget -q -O {tmpdir}/finngen_mappings.csv {params.finngen_mappings}
python scripts/make_disease_mapping_lut.py \
--studies {input.study_table} \
--finngen_version {params.finngen_version} \
--finngen_mappings {tmpdir}/finngen_mappings.csv \
--ukb_original_mappings {input.ukb_original_mappings} \
--output {output}
Expand Down