From a564a873d5b1881b72440abca7822bb98f47d140 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 16 Aug 2022 13:26:49 +0100
Subject: [PATCH 1/6] fix: Add Finngen mappings version as a parameter to the
 disease LUT script

FinnGen data is periodically updated with new study traits, so the release version used to build study IDs must be tracked as a parameter rather than hard-coded.
---
 configs/config.yaml                           |  3 +-
 scripts/make_disease_mapping_lut.py           | 41 +++++++++++++------
 .../study_and_top_loci_tables.Snakefile       |  3 ++
 3 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/configs/config.yaml b/configs/config.yaml
index cae05ba..0f09d29 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -18,11 +18,12 @@ ukb_manifest: 'gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_mani
 
 # FINNGEN sumstat manifest file
 FINNGEN_manifest: "https://r6.finngen.fi/api/phenos"
+FINNGEN_version: 6
 
 #Config files for disease mapping lut:
 ukb_efo_original_curation: 'gs://genetics-portal-input/ukb_phenotypes/ukb_efo_annotation.190828.json'
 ukb_efo_updated_curation: 'docs.google.com/spreadsheets/d/1PotmUEirkV36dh-vpZ3GgxQg_LcOefZKbyTq0PNQ6NY/edit?usp=sharing'
-FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/e/2PACX-1vR4Dh1UVeLZ7TtmpU-QzRwO4GPzGt_3j9nMp5hn0R1Z_JGpwgAgU155UPsNwJcKdA1ra7nee-l7iBiz/pub?output=csv&gid=1853278839&single=true"
+FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing"
 
 # LD table
 url_1000G: 'gs://genetics-portal-input/1000Genomes_phase3/plink_format_b38'
diff --git a/scripts/make_disease_mapping_lut.py b/scripts/make_disease_mapping_lut.py
index 8666b8c..cb47df0 100755
--- a/scripts/make_disease_mapping_lut.py
+++ b/scripts/make_disease_mapping_lut.py
@@ -11,13 +11,13 @@
 
 
 def main(
-    studies: str, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str
+    studies: str, finngen_version: int, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str
 ) -> None:
 
     # 1. Extract mappings per data source GWAS catalog traits from study table (these do not require OT mapping)
     gwas_catalog_mappings = get_gwas_catalog_mappings(studies)
     valid_ukb = get_ukb_mappings(ukb_original_mappings, ukb_updated_mappings)
-    valid_finngen = get_finngen_mappings(finngen_mappings)
+    valid_finngen = get_finngen_mappings(finngen_version, finngen_mappings)
     # Assert there are no studies with a null mapping
     for source in [gwas_catalog_mappings, valid_ukb, valid_finngen]:
         if 'proposed_efos' not in source.columns:
@@ -162,8 +162,22 @@ def get_ukb_original_mappings(ukb_original_mappings: str) -> pd.DataFrame:
     )
 
 
-def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame:
-    """Extracts Finngen trait mappings from the curation spreadsheet."""
+def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.DataFrame:
+    """
+    Extracts Finngen trait mappings from the curation spreadsheet
+    
+    Args:
+      finngen_version (int): The version of the Finngen data that you are using.
+      finngen_mappings (str): The path to the Finngen trait mappings spreadsheet.
+    
+    Returns:
+      A dataframe with the following columns:
+        - study_id
+        - trait_reported
+        - proposed_efos
+    """
+
+    version = f'FINNGEN_R{finngen_version}_'
 
     return (
         read_input_file(finngen_mappings)
@@ -177,23 +191,18 @@ def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame:
         .reset_index()
         .rename(columns={'NAME': 'study_name', 'LONGNAME': 'trait_reported', 'efo_cls': 'proposed_efos'})
         .explode('trait_reported')
-        .assign(study_id=lambda x: 'FINNGEN_R5_' + x.study_name)
+        .assign(study_id=lambda x: version + x.study_name)
         .drop('study_name', axis=1)
     )
 
 
 def build_therapeutic_areas(genetics_mappings: pd.DataFrame) -> pd.DataFrame:
     """Therapeutic areas per trait are built into the mappings table."""
-
     efo_tas_df = extract_therapeutic_areas_from_owl()
-    genetics_mappings_w_trait = genetics_mappings.merge(
-        efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left'
-    ).drop('efo_id', axis=1)
+    return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop('efo_id', axis=1)
 
-    return genetics_mappings_w_trait
 
-
-def flatten_array(arr: List) -> List:
+def flatten_array(arr: List) -> List:  # sourcery skip: use-contextlib-suppress
     """Flattens a bidimensional array."""
     try:
         return [i for sublist in arr for i in sublist]
@@ -208,7 +217,12 @@ def flatten_array(arr: List) -> List:
         '--finngen_mappings',
         help='URL of the spreadsheet that contains all Finngen disease mappings',
         nargs='?',
-        default='https://docs.google.com/spreadsheets/d/1yrQPpsRi-mijs_BliKFZjeoxP6kGIs9Bz-02_0WDvAA/edit?usp=sharing',
+        default='https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing',
+    )
+    parser.add_argument(
+        '--finngen_version',
+        help='The version of the Finngen manifest the study table is based on.',
+        required=True, type=int,
     )
     parser.add_argument(
         '--ukb_original_mappings',
@@ -241,6 +255,7 @@ def flatten_array(arr: List) -> List:
     main(
         studies=args.studies,
         finngen_mappings=args.finngen_mappings,
+        finngen_version=args.finngen_version,
         ukb_original_mappings=args.ukb_original_mappings,
         ukb_updated_mappings=args.ukb_updated_mappings,
         output_path=args.output,
diff --git a/snakefiles/study_and_top_loci_tables.Snakefile b/snakefiles/study_and_top_loci_tables.Snakefile
index 71b2e3e..4123c30 100644
--- a/snakefiles/study_and_top_loci_tables.Snakefile
+++ b/snakefiles/study_and_top_loci_tables.Snakefile
@@ -256,6 +256,7 @@ rule study_table_to_parquet:
 rule make_disease_mappings_lut:
     ''' Build LUT that integrates all the disease mappings
         studies: merged study table in parquet format
+        finngen_version: version of the Finngen manifest
         finngen_mappings: curation recorded in Google Sheets
         ukb_original_mappings: initial UK Biobank disease curation
         ukb_updated_curation: updated mappings resulting from upgrading to EFO3
@@ -267,6 +268,7 @@ rule make_disease_mappings_lut:
     
     params:
         finngen_mappings = config['FINNGEN_efo_curation'],
+        finngen_version = config['FINNGEN_version']
 
     output:
         'output/{version}/trait_efo.parquet'
@@ -275,6 +277,7 @@ rule make_disease_mappings_lut:
         wget -q -O {tmpdir}/finngen_mappings.csv {params.finngen_mappings}
         python scripts/make_disease_mapping_lut.py \
             --studies {input.study_table} \
+            --finngen_version {params.finngen_version} \
             --finngen_mappings {tmpdir}/finngen_mappings.csv \
             --ukb_original_mappings {input.ukb_original_mappings} \
             --output {output}

From 1d85ce721bfeff6072e5998a732c661acba7107f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 16 Aug 2022 16:36:04 +0100
Subject: [PATCH 2/6] fix: Update set up instructions

The instance name has been changed (the character `_` is not accepted). The conda installation instructions have been updated.
---
 README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 018c7d9..2f43096 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Changes made (Jan 2019):
 
 ```bash
 # Set parameters.
-export INSTANCE_NAME=v2d_data
+export INSTANCE_NAME=v2d-data
 export INSTANCE_ZONE=europe-west1-d
 
 # Create the instance and SSH.
@@ -50,8 +50,13 @@ sudo apt install -yf \
   openjdk-13-jre-headless \
   python3-pip
 wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
-chmod +x Miniconda3-latest-Linux-x86_64.sh
-./Miniconda3-latest-Linux-x86_64.sh
+bash ~/miniconda.sh -b
+~/miniconda3/bin/conda init
+exec bash
+
+# Clone repo
+git clone https://github.com/opentargets/genetics-v2d-data.git
+cd genetics-v2d-data
 
 # Install dependencies into isolated environment
 conda env create -n v2d_data --file environment.yaml
@@ -68,7 +73,6 @@ rm -r www.ebi.ac.uk/gwas/
 # (I've gotten snakemake problems on subsequent attempts when this happens too)
 tmux
 
-
 # May want to use a smaller machine for step 1, then scale up to more
 # cores for step 2, and back down to a small machine for step 3
 export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell"

From c3d1969f4c2aeac86d68633a32ef18ec9f20dd39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 16 Aug 2022 16:40:45 +0100
Subject: [PATCH 3/6] fix: Fix typo

The `phenostring` column was misreferenced, causing the script to crash.
---
 scripts/make_FINNGEN_study_table.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/make_FINNGEN_study_table.py b/scripts/make_FINNGEN_study_table.py
index 2790a12..6b866f7 100644
--- a/scripts/make_FINNGEN_study_table.py
+++ b/scripts/make_FINNGEN_study_table.py
@@ -18,7 +18,7 @@ def main(input_path: str, output_path: str) -> None:
     # Read manifest
     manifest = (
         pd.read_json(input_path, orient='records')
-        .filter(items=['phenocode', 'phenosring', 'category', 'num_cases', 'num_controls'])
+        .filter(items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls'])
 
         # When phenostring is not provided, phenotype extracted from the phenocode
         .assign(phenostring=lambda df: df.apply(
@@ -83,8 +83,7 @@ def parse_args():
     parser.add_argument('--input', metavar="<str>", type=str, required=True)
     parser.add_argument('--output', metavar="<str>", help=("Output"), type=str, required=True)
 
-    args = parser.parse_args()
-    return args
+    return parser.parse_args()
 
 
 if __name__ == '__main__':

From 5cbbf7b4d3267c35341c781e67f46afccd14ebf1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 16 Aug 2022 16:54:57 +0100
Subject: [PATCH 4/6] style: Format `make_disease_mappings_lut` with Black

---
 scripts/make_disease_mapping_lut.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/scripts/make_disease_mapping_lut.py b/scripts/make_disease_mapping_lut.py
index cb47df0..7b512c0 100755
--- a/scripts/make_disease_mapping_lut.py
+++ b/scripts/make_disease_mapping_lut.py
@@ -11,7 +11,12 @@
 
 
 def main(
-    studies: str, finngen_version: int, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str
+    studies: str,
+    finngen_mappings: str,
+    finngen_version: int,
+    ukb_original_mappings: str,
+    ukb_updated_mappings: str,
+    output_path: str,
 ) -> None:
 
     # 1. Extract mappings per data source GWAS catalog traits from study table (these do not require OT mapping)
@@ -165,11 +170,11 @@ def get_ukb_original_mappings(ukb_original_mappings: str) -> pd.DataFrame:
 def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.DataFrame:
     """
     Extracts Finngen trait mappings from the curation spreadsheet
-    
+
     Args:
       finngen_version (int): The version of the Finngen data that you are using.
       finngen_mappings (str): The path to the Finngen trait mappings spreadsheet.
-    
+
     Returns:
       A dataframe with the following columns:
         - study_id
@@ -199,7 +204,9 @@ def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.Data
 def build_therapeutic_areas(genetics_mappings: pd.DataFrame) -> pd.DataFrame:
     """Therapeutic areas per trait are built into the mappings table."""
     efo_tas_df = extract_therapeutic_areas_from_owl()
-    return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop('efo_id', axis=1)
+    return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop(
+        'efo_id', axis=1
+    )
 
 
 def flatten_array(arr: List) -> List:  # sourcery skip: use-contextlib-suppress
@@ -222,7 +229,8 @@ def flatten_array(arr: List) -> List:  # sourcery skip: use-contextlib-suppress
     parser.add_argument(
         '--finngen_version',
         help='The version of the Finngen manifest the study table is based on.',
-        required=True, type=int,
+        required=True,
+        type=int,
     )
     parser.add_argument(
         '--ukb_original_mappings',

From f91ff1390557898daa31784ef3a6b8134e9f29aa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 16 Aug 2022 16:55:55 +0100
Subject: [PATCH 5/6] style: Format `make_FINNGEN_study_table` with Black

---
 scripts/make_FINNGEN_study_table.py | 31 ++++++++++++++++-------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/scripts/make_FINNGEN_study_table.py b/scripts/make_FINNGEN_study_table.py
index 6b866f7..01f2afc 100644
--- a/scripts/make_FINNGEN_study_table.py
+++ b/scripts/make_FINNGEN_study_table.py
@@ -17,23 +17,26 @@ def main(input_path: str, output_path: str) -> None:
 
     # Read manifest
     manifest = (
-        pd.read_json(input_path, orient='records')
-        .filter(items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls'])
-
+        pd.read_json(input_path, orient='records').filter(
+            items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls']
+        )
         # When phenostring is not provided, phenotype extracted from the phenocode
-        .assign(phenostring=lambda df: df.apply(
-            lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'], 
-            axis=1)
+        .assign(
+            phenostring=lambda df: df.apply(
+                lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'],
+                axis=1,
+            )
         )
-
         # Renaming columns to accomodate OTG schema:
-        .rename(columns={
-            'phenocode': 'study_id',
-            'phenostring': 'trait',
-            'category': 'trait_category',
-            'num_cases': 'n_cases',
-            'num_controls': 'n_controls',
-        })
+        .rename(
+            columns={
+                'phenocode': 'study_id',
+                'phenostring': 'trait',
+                'category': 'trait_category',
+                'num_cases': 'n_cases',
+                'num_controls': 'n_controls',
+            }
+        )
     )
 
     logging.info(f"{input_path} has been loaded. Formatting...")

From dccdfd33623a36a6c24ecb4b0f246de475653212 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Irene=20L=C3=B3pez?= <irene.lopezs@protonmail.com>
Date: Tue, 16 Aug 2022 17:36:35 +0100
Subject: [PATCH 6/6] build: Changed finemapping output files to latest version
 `220224_merged`

The top loci and credible sets have been updated to take the outputs of the latest fine mapping run
---
 configs/config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/config.yaml b/configs/config.yaml
index 0f09d29..45811b2 100644
--- a/configs/config.yaml
+++ b/configs/config.yaml
@@ -10,8 +10,8 @@ gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this pr
 
 # Summary statistics finemapping pipeline output files
 sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas'
-toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz'
-credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCSS file instead to trick snakemake
+toploci: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/top_loci.json.gz'
+credsets: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCESS file instead to trick snakemake
 
 # UK Biobank (Neale V2 and SAIGE) sumstat manifest files
 ukb_manifest: 'gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_manifest.190430.tsv'