Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Add Finngen mappings version as a parameter to the disease LUT script #24

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Changes made (Jan 2019):

```bash
# Set parameters.
export INSTANCE_NAME=v2d_data
export INSTANCE_NAME=v2d-data
export INSTANCE_ZONE=europe-west1-d

# Create the instance and SSH.
Expand All @@ -50,8 +50,13 @@ sudo apt install -yf \
openjdk-13-jre-headless \
python3-pip
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
chmod +x Miniconda3-latest-Linux-x86_64.sh
./Miniconda3-latest-Linux-x86_64.sh
bash ~/miniconda.sh -b
~/miniconda3/bin/conda init
exec bash

# Clone repo
git clone https://github.com/opentargets/genetics-v2d-data.git
cd genetics-v2d-data

# Install dependencies into isolated environment
conda env create -n v2d_data --file environment.yaml
Expand All @@ -68,7 +73,6 @@ rm -r www.ebi.ac.uk/gwas/
# (I've gotten snakemake problems on subsequent attempts when this happens too)
tmux


# May want to use a smaller machine for step 1, then scale up to more
# cores for step 2, and back down to a small machine for step 3
export PYSPARK_SUBMIT_ARGS="--driver-memory 100g pyspark-shell"
Expand Down
7 changes: 4 additions & 3 deletions configs/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,20 @@ gwas_cat_cluster_multi_proportion: 0.3 # For a given study, if more than this pr

# Summary statistics finemapping pipeline output files
sumstats_gcs_path: 'gs://genetics-portal-dev-sumstats/unfiltered/gwas'
toploci: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/top_loci.json.gz'
credsets: 'gs://genetics-portal-dev-staging/finemapping/220113_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCSS file instead to trick snakemake
toploci: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/top_loci.json.gz'
credsets: 'gs://genetics-portal-dev-staging/finemapping/220224_merged/credset/_SUCCESS' # Need the whole directory, so passing the _SUCCESS file instead to trick snakemake

# UK Biobank (Neale V2 and SAIGE) sumstat manifest files
ukb_manifest: 'gs://genetics-portal-input/ukb_phenotypes/neale2_saige_study_manifest.190430.tsv'

# FINNGEN sumstat manifest file
FINNGEN_manifest: "https://r6.finngen.fi/api/phenos"
FINNGEN_version: 6

#Config files for disease mapping lut:
ukb_efo_original_curation: 'gs://genetics-portal-input/ukb_phenotypes/ukb_efo_annotation.190828.json'
ukb_efo_updated_curation: 'docs.google.com/spreadsheets/d/1PotmUEirkV36dh-vpZ3GgxQg_LcOefZKbyTq0PNQ6NY/edit?usp=sharing'
FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/e/2PACX-1vR4Dh1UVeLZ7TtmpU-QzRwO4GPzGt_3j9nMp5hn0R1Z_JGpwgAgU155UPsNwJcKdA1ra7nee-l7iBiz/pub?output=csv&gid=1853278839&single=true"
FINNGEN_efo_curation: "https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing"

# LD table
url_1000G: 'gs://genetics-portal-input/1000Genomes_phase3/plink_format_b38'
Expand Down
34 changes: 18 additions & 16 deletions scripts/make_FINNGEN_study_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,26 @@ def main(input_path: str, output_path: str) -> None:

# Read manifest
manifest = (
pd.read_json(input_path, orient='records')
.filter(items=['phenocode', 'phenosring', 'category', 'num_cases', 'num_controls'])

pd.read_json(input_path, orient='records').filter(
items=['phenocode', 'phenostring', 'category', 'num_cases', 'num_controls']
)
# When phenostring is not provided, the phenotype is extracted from the phenocode
.assign(phenostring=lambda df: df.apply(
lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'],
axis=1)
.assign(
phenostring=lambda df: df.apply(
lambda row: row['phenostring'] if row['phenostring'] and row['phenostring'] != '' else row['phenocode'],
axis=1,
)
)

# Renaming columns to accommodate OTG schema:
.rename(columns={
'phenocode': 'study_id',
'phenostring': 'trait',
'category': 'trait_category',
'num_cases': 'n_cases',
'num_controls': 'n_controls',
})
.rename(
columns={
'phenocode': 'study_id',
'phenostring': 'trait',
'category': 'trait_category',
'num_cases': 'n_cases',
'num_controls': 'n_controls',
}
)
)

logging.info(f"{input_path} has been loaded. Formatting...")
Expand Down Expand Up @@ -83,8 +86,7 @@ def parse_args():
parser.add_argument('--input', metavar="<str>", type=str, required=True)
parser.add_argument('--output', metavar="<str>", help=("Output"), type=str, required=True)

args = parser.parse_args()
return args
return parser.parse_args()


if __name__ == '__main__':
Expand Down
49 changes: 36 additions & 13 deletions scripts/make_disease_mapping_lut.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,18 @@


def main(
studies: str, finngen_mappings: str, ukb_original_mappings: str, ukb_updated_mappings: str, output_path: str
studies: str,
finngen_mappings: str,
finngen_version: int,
ukb_original_mappings: str,
ukb_updated_mappings: str,
output_path: str,
) -> None:

# 1. Extract mappings per data source GWAS catalog traits from study table (these do not require OT mapping)
gwas_catalog_mappings = get_gwas_catalog_mappings(studies)
valid_ukb = get_ukb_mappings(ukb_original_mappings, ukb_updated_mappings)
valid_finngen = get_finngen_mappings(finngen_mappings)
valid_finngen = get_finngen_mappings(finngen_version, finngen_mappings)
# Assert there are no studies with a null mapping
for source in [gwas_catalog_mappings, valid_ukb, valid_finngen]:
if 'proposed_efos' not in source.columns:
Expand Down Expand Up @@ -162,8 +167,22 @@ def get_ukb_original_mappings(ukb_original_mappings: str) -> pd.DataFrame:
)


def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame:
"""Extracts Finngen trait mappings from the curation spreadsheet."""
def get_finngen_mappings(finngen_version: int, finngen_mappings: str) -> pd.DataFrame:
"""
Extracts Finngen trait mappings from the curation spreadsheet

Args:
finngen_version (int): The version of the Finngen data that you are using.
finngen_mappings (str): The path to the Finngen trait mappings spreadsheet.

Returns:
A dataframe with the following columns:
- study_id
- trait_reported
- proposed_efos
"""

version = f'FINNGEN_R{finngen_version}_'

return (
read_input_file(finngen_mappings)
Expand All @@ -177,23 +196,20 @@ def get_finngen_mappings(finngen_mappings: str) -> pd.DataFrame:
.reset_index()
.rename(columns={'NAME': 'study_name', 'LONGNAME': 'trait_reported', 'efo_cls': 'proposed_efos'})
.explode('trait_reported')
.assign(study_id=lambda x: 'FINNGEN_R5_' + x.study_name)
.assign(study_id=lambda x: version + x.study_name)
.drop('study_name', axis=1)
)


def build_therapeutic_areas(genetics_mappings: pd.DataFrame) -> pd.DataFrame:
"""Therapeutic areas per trait are built into the mappings table."""

efo_tas_df = extract_therapeutic_areas_from_owl()
genetics_mappings_w_trait = genetics_mappings.merge(
efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left'
).drop('efo_id', axis=1)

return genetics_mappings_w_trait
return genetics_mappings.merge(efo_tas_df, left_on='trait_efos', right_on='efo_id', how='left').drop(
'efo_id', axis=1
)


def flatten_array(arr: List) -> List:
def flatten_array(arr: List) -> List: # sourcery skip: use-contextlib-suppress
"""Flattens a bidimensional array."""
try:
return [i for sublist in arr for i in sublist]
Expand All @@ -208,7 +224,13 @@ def flatten_array(arr: List) -> List:
'--finngen_mappings',
help='URL of the spreadsheet that contains all Finngen disease mappings',
nargs='?',
default='https://docs.google.com/spreadsheets/d/1yrQPpsRi-mijs_BliKFZjeoxP6kGIs9Bz-02_0WDvAA/edit?usp=sharing',
default='https://docs.google.com/spreadsheets/d/1RRWfUTLy4TO9XmBzcbJ2wPRdda3qISjRS4PJmEdxE3k/edit?usp=sharing',
)
parser.add_argument(
'--finngen_version',
help='The version of the Finngen manifest the study table is based on.',
required=True,
type=int,
)
parser.add_argument(
'--ukb_original_mappings',
Expand Down Expand Up @@ -241,6 +263,7 @@ def flatten_array(arr: List) -> List:
main(
studies=args.studies,
finngen_mappings=args.finngen_mappings,
finngen_version=args.finngen_version,
ukb_original_mappings=args.ukb_original_mappings,
ukb_updated_mappings=args.ukb_updated_mappings,
output_path=args.output,
Expand Down
3 changes: 3 additions & 0 deletions snakefiles/study_and_top_loci_tables.Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ rule study_table_to_parquet:
rule make_disease_mappings_lut:
''' Build LUT that integrates all the disease mappings
studies: merged study table in parquet format
finngen_version: version of the Finngen manifest
finngen_mappings: curation recorded in Google Sheets
ukb_original_mappings: initial UK Biobank disease curation
ukb_updated_curation: updated mappings resulting from upgrading to EFO3
Expand All @@ -267,6 +268,7 @@ rule make_disease_mappings_lut:

params:
finngen_mappings = config['FINNGEN_efo_curation'],
finngen_version = config['FINNGEN_version']

output:
'output/{version}/trait_efo.parquet'
Expand All @@ -275,6 +277,7 @@ rule make_disease_mappings_lut:
wget -q -O {tmpdir}/finngen_mappings.csv {params.finngen_mappings}
python scripts/make_disease_mapping_lut.py \
--studies {input.study_table} \
--finngen_version {params.finngen_version} \
--finngen_mappings {tmpdir}/finngen_mappings.csv \
--ukb_original_mappings {input.ukb_original_mappings} \
--output {output}
Expand Down