From a564a873d5b1881b72440abca7822bb98f47d140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 16 Aug 2022 13:26:49 +0100 Subject: [PATCH 1/6] fix: Add Finngen mappings version as a parameter to the disease LUT script Finngen data is updated every so often to include new traits of study which we have to track. --- configs/config.yaml | 3 +- scripts/make_disease_mapping_lut.py | 41 +++++++++++++------ .../study_and_top_loci_tables.Snakefile | 3 ++ 3 files changed, 33 insertions(+), 14 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index cae05ba..0f09d29 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -18,11 +18,12 @@ ukb_manifest: 'gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_mani # FINNGEN sumstat manifest file FINNGEN_manifest: "https://r6.finngen.fi/api/phenos" +FINNGEN_version: 6 #Config files for disease mapping lut: ukb_efo_original_curation: 'gs://genetics-portal-input/ukb_phenotypes/ukb_efo_annotation.190828.json' ukb_efo_updated_curation: 'docs.google.com/spreadsheets/d/1PotmUEirkV36dh-vpZ3GgxQg_LcOefZKbyTq0PNQ6NY/edit?usp=sharing' -FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/e/2PACX-1vR4Dh1UVeLZ7TtmpU-QzRwO4GPzGt_3j9nMp5hn0R1Z_JGpwgAgU155UPsNwJcKdA1ra7nee-l7iBiz/pub?output=csv&gid=1853278839&single=true" +FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing" # LD table url_1000G: 'gs://genetics-portal-input/1000Genomes_phase3/plink_format_b38' diff --git a/scripts/make_disease_mapping_lut.py b/scripts/make_disease_mapping_lut.py index 8666b8c..cb47df0 100755 --- a/scripts/make_disease_mapping_lut.py +++ b/scripts/make_disease_mapping_lut.py @@ -11,13 +11,13 @@ def main( - studies: str, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str + studies: str, finngen_version: int, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, 
output_path: str ) -> None: # 1. Extract mappings per data source GWAS catalog traits from study table (these do not require OT mapping) gwas_catalog_mappings = get_gwas_catalog_mappings(studies) valid_ukb = get_ukb_mappings(ukb_original_mappings, ukb_updated_mappings) - valid_finngen = get_finngen_mappings(finngen_mappings) + valid_finngen = get_finngen_mappings(finngen_version, finngen_mappings) # Assert there are no studies with a null mapping for source in [gwas_catalog_mappings, valid_ukb, valid_finngen]: if 'proposed_efos' not in source.columns: @@ -162,8 +162,22 @@ def get_ukb_original_mappings(ukb_original_mappings: str) -> pd.DataFrame: ) -def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame: - """Extracts Finngen trait mappings from the curation spreadsheet.""" +def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.DataFrame: + """ + Extracts Finngen trait mappings from the curation spreadsheet + + Args: + finngen_version (int): The version of the Finngen data that you are using. + finngen_mappings (str): The path to the Finngen trait mappings spreadsheet. 
+ + Returns: + A dataframe with the following columns: + - study_id + - trait_reported + - proposed_efos + """ + + version = f'FINNGEN_R{finngen_version}_' return ( read_input_file(finngen_mappings) @@ -177,23 +191,18 @@ def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame: .reset_index() .rename(columns={'NAME': 'study_name', 'LONGNAME': 'trait_reported', 'efo_cls': 'proposed_efos'}) .explode('trait_reported') - .assign(study_id=lambda x: 'FINNGEN_R5_' + x.study_name) + .assign(study_id=lambda x: version + x.study_name) .drop('study_name', axis=1) ) def build_therapeutic_areas(genetics_mappings: pd.DataFrame) -> pd.DataFrame: """Therapeutic areas per trait are built into the mappings table.""" - efo_tas_df = extract_therapeutic_areas_from_owl() - genetics_mappings_w_trait = genetics_mappings.merge( - efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left' - ).drop('efo_id', axis=1) + return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop('efo_id', axis=1) - return genetics_mappings_w_trait - -def flatten_array(arr: List) -> List: +def flatten_array(arr: List) -> List: # sourcery skip: use-contextlib-suppress """Flattens a bidimensional array.""" try: return [i for sublist in arr for i in sublist] @@ -208,7 +217,12 @@ def flatten_array(arr: List) -> List: '--finngen_mappings', help='URL of the spreadsheet that contains all Finngen disease mappings', nargs='?', - default='https://docs.google.com/spreadsheets/d/1yrQPpsRi-mijs_BliKFZjeoxP6kGIs9Bz-02_0WDvAA/edit?usp=sharing', + default='https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing', + ) + parser.add_argument( + '--finngen_version', + help='The version of the Finngen manifest the study table is based on.', + required=True, type=int, ) parser.add_argument( '--ukb_original_mappings', @@ -241,6 +255,7 @@ def flatten_array(arr: List) -> List: main( studies=args.studies, 
finngen_mappings=args.finngen_mappings, + finngen_version=args.finngen_version, ukb_original_mappings=args.ukb_original_mappings, ukb_updated_mappings=args.ukb_updated_mappings, output_path=args.output, diff --git a/snakefiles/study_and_top_loci_tables.Snakefile b/snakefiles/study_and_top_loci_tables.Snakefile index 71b2e3e..4123c30 100644 --- a/snakefiles/study_and_top_loci_tables.Snakefile +++ b/snakefiles/study_and_top_loci_tables.Snakefile @@ -256,6 +256,7 @@ rule study_table_to_parquet: rule make_disease_mappings_lut: ''' Build LUT that integrates all the disease mappings studies: merged study table in parquet format + finngen_version: version of the Finngen manifest finngen_mappings: curation recorded in Google Sheets ukb_original_mappings: initial UK Biobank disease curation ukb_updated_curation: updated mappings resulting from upgrading to EFO3 @@ -267,6 +268,7 @@ rule make_disease_mappings_lut: params: finngen_mappings = config['FINNGEN_efo_curation'], + finngen_version = config['FINNGEN_version'] output: 'output/{version}/trait_efo.parquet' @@ -275,6 +277,7 @@ rule make_disease_mappings_lut: wget -q -O {tmpdir}/finngen_mappings.csv {params.finngen_mappings} python scripts/make_disease_mapping_lut.py \ --studies {input.study_table} \ + --finngen_version {params.finngen_version} \ --finngen_mappings {tmpdir}/finngen_mappings.csv \ --ukb_original_mappings {input.ukb_original_mappings} \ --output {output} From 1d85ce721bfeff6072e5998a732c661acba7107f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 16 Aug 2022 16:36:04 +0100 Subject: [PATCH 2/6] fix: Update set up instructions The instance name has been changed (the character `_` is not accepted). The conda installation instructions have been updated. 
--- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 018c7d9..2f43096 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Changes made (Jan 2019): ```bash # Set parameters. -export INSTANCE_NAME=v2d_data +export INSTANCE_NAME=v2d-data export INSTANCE_ZONE=europe-west1-d # Create the instance and SSH. @@ -50,8 +50,13 @@ sudo apt install -yf \ openjdk-13-jre-headless \ python3-pip wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh -chmod +x Miniconda3-latest-Linux-x86_64.sh -./Miniconda3-latest-Linux-x86_64.sh +bash ~/miniconda.sh -b +~/miniconda3/bin/conda init +exec bash + +# Clone repo +git clone https://github.com/opentargets/genetics-v2d-data.git +cd genetics-v2d-data # Install dependencies into isolated environment conda env create -n v2d_data --file environment.yaml @@ -68,7 +73,6 @@ rm -r www.ebi.ac.uk/gwas/ # (I've gotten snakemake problems on subsequent attempts when this happens too) tmux - # May want to use a smaller machine for step 1, then scale up to more # cores for step 2, and back down to a small machine for step 3 export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell" From c3d1969f4c2aeac86d68633a32ef18ec9f20dd39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 16 Aug 2022 16:40:45 +0100 Subject: [PATCH 3/6] fix: Fix typo The `phenostring` column was misreferenced, causing the script to crash. 
--- scripts/make_FINNGEN_study_table.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/make_FINNGEN_study_table.py b/scripts/make_FINNGEN_study_table.py index 2790a12..6b866f7 100644 --- a/scripts/make_FINNGEN_study_table.py +++ b/scripts/make_FINNGEN_study_table.py @@ -18,7 +18,7 @@ def main(input_path: str, output_path: str) -> None: # Read manifest manifest = ( pd.read_json(input_path, orient='records') - .filter(items=['phenocode', 'phenosring', 'category', 'num_cases', 'num_controls']) + .filter(items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls']) # When phenostring is not provided, phenotype extracted from the phenocode .assign(phenostring=lambda df: df.apply( @@ -83,8 +83,7 @@ def parse_args(): parser.add_argument('--input', metavar="", type=str, required=True) parser.add_argument('--output', metavar="", help=("Output"), type=str, required=True) - args = parser.parse_args() - return args + return parser.parse_args() if __name__ == '__main__': From 5cbbf7b4d3267c35341c781e67f46afccd14ebf1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 16 Aug 2022 16:54:57 +0100 Subject: [PATCH 4/6] style: Format `make_disease_mappings_lut` with Black --- scripts/make_disease_mapping_lut.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/make_disease_mapping_lut.py b/scripts/make_disease_mapping_lut.py index cb47df0..7b512c0 100755 --- a/scripts/make_disease_mapping_lut.py +++ b/scripts/make_disease_mapping_lut.py @@ -11,7 +11,12 @@ def main( - studies: str, finngen_version: int, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str + studies: str, + finngen_mappings: str, + finngen_version: int, + ukb_original_mappings: str, + ukb_updated_mappings: str, + output_path: str, ) -> None: # 1. 
Extract mappings per data source GWAS catalog traits from study table (these do not require OT mapping) @@ -165,11 +170,11 @@ def get_ukb_original_mappings(ukb_original_mappings: str) -> pd.DataFrame: def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.DataFrame: """ Extracts Finngen trait mappings from the curation spreadsheet - + Args: finngen_version (int): The version of the Finngen data that you are using. finngen_mappings (str): The path to the Finngen trait mappings spreadsheet. - + Returns: A dataframe with the following columns: - study_id @@ -199,7 +204,9 @@ def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.Data def build_therapeutic_areas(genetics_mappings: pd.DataFrame) -> pd.DataFrame: """Therapeutic areas per trait are built into the mappings table.""" efo_tas_df = extract_therapeutic_areas_from_owl() - return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop('efo_id', axis=1) + return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop( + 'efo_id', axis=1 + ) def flatten_array(arr: List) -> List: # sourcery skip: use-contextlib-suppress @@ -222,7 +229,8 @@ def flatten_array(arr: List) -> List: # sourcery skip: use-contextlib-suppress parser.add_argument( '--finngen_version', help='The version of the Finngen manifest the study table is based on.', - required=True, type=int, + required=True, + type=int, ) parser.add_argument( '--ukb_original_mappings', From f91ff1390557898daa31784ef3a6b8134e9f29aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 16 Aug 2022 16:55:55 +0100 Subject: [PATCH 5/6] style: Format `make_FINNGEN_study_table` with Black --- scripts/make_FINNGEN_study_table.py | 31 ++++++++++++++++------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/scripts/make_FINNGEN_study_table.py b/scripts/make_FINNGEN_study_table.py index 6b866f7..01f2afc 100644 --- 
a/scripts/make_FINNGEN_study_table.py +++ b/scripts/make_FINNGEN_study_table.py @@ -17,23 +17,26 @@ def main(input_path: str, output_path: str) -> None: # Read manifest manifest = ( - pd.read_json(input_path, orient='records') - .filter(items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls']) - + pd.read_json(input_path, orient='records').filter( + items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls'] + ) # When phenostring is not provided, phenotype extracted from the phenocode - .assign(phenostring=lambda df: df.apply( - lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'], - axis=1) + .assign( + phenostring=lambda df: df.apply( + lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'], + axis=1, + ) ) - # Renaming columns to accomodate OTG schema: - .rename(columns={ - 'phenocode': 'study_id', - 'phenostring': 'trait', - 'category': 'trait_category', - 'num_cases': 'n_cases', - 'num_controls': 'n_controls', - }) + .rename( + columns={ + 'phenocode': 'study_id', + 'phenostring': 'trait', + 'category': 'trait_category', + 'num_cases': 'n_cases', + 'num_controls': 'n_controls', + } + ) ) logging.info(f"{input_path} has been loaded. 
Formatting...") From dccdfd33623a36a6c24ecb4b0f246de475653212 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= Date: Tue, 16 Aug 2022 17:36:35 +0100 Subject: [PATCH 6/6] build: Changed finemapping output files to latest version `220224_merged` The top loci and credible sets have been updated to take the outputs of the latest fine mapping run --- configs/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/config.yaml b/configs/config.yaml index 0f09d29..45811b2 100644 --- a/configs/config.yaml +++ b/configs/config.yaml @@ -10,8 +10,8 @@ gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this pr # Summary statistics finemapping pipeline output files sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas' -toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz' -credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCSS file instead to trick snakemake +toploci: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/top_loci.json.gz' +credsets: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCESS file instead to trick snakemake # UK Biobank (Neale V2 and SAIGE) sumstat manifest files ukb_manifest: 'gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_manifest.190430.tsv'