diff --git a/dataset_processing/.DS_Store b/dataset_processing/.DS_Store index cb39dbd..fe78959 100644 Binary files a/dataset_processing/.DS_Store and b/dataset_processing/.DS_Store differ diff --git a/dataset_processing/notebooks/ShifrutMarson2018.ipynb b/dataset_processing/notebooks/ShifrutMarson2018.ipynb index db427e1..f93dd79 100644 --- a/dataset_processing/notebooks/ShifrutMarson2018.ipynb +++ b/dataset_processing/notebooks/ShifrutMarson2018.ipynb @@ -2,23 +2,10 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 26, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/homebrew/Caskroom/mambaforge/base/envs/pertpy/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" - ] - } - ], + "outputs": [], "source": [ - "import pertpy as pt\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", "import scanpy as sc\n", "import anndata as ad\n", "import pandas as pd " diff --git a/dataset_processing/scripts/DixitRegev2016.py b/dataset_processing/scripts/DixitRegev2016.py index 9b827e2..fc0425e 100644 --- a/dataset_processing/scripts/DixitRegev2016.py +++ b/dataset_processing/scripts/DixitRegev2016.py @@ -13,20 +13,20 @@ from process_supp import * sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../'))) -from utils import write_as_singles, read_from_singles, annotate_qc, assert_annotations import yaml -config = yaml.safe_load(open("../../config.yaml", "r")) +config = yaml.safe_load(open("config.yaml", "r")) DIR = config['DIR'] WDIR = config['WDIR'] -path = DIR+'GSE90063/supp/' +path = DIR+'GSE90063/' adatas = {} # GSM2396858_K562_TFs__7_days -path = DIR+'GSE90063/supp/' -folder = 'Supp_GSM2396858_K562_TFs__7_days' +path = DIR+'GSE90063/' +folder = 'GSM2396858_k562_tfs_7' + files = get_files(path+folder, full_path=False) name = os.path.commonprefix(files) @@ -58,7 +58,7 @@ for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','): if barcode in adata.obs_names: val = adata.obs.loc[barcode][key] - adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna + adata.obs.loc[barcode,key] = grna if val is None else val+' + '+grna adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna] adata.obs = adata.obs.rename({'grna': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1) @@ -67,15 +67,15 @@ adata.obs['cell_line'] = 'K562' adata.obs['celltype'] = 'lymphoblasts' adata.obs['perturbation_type'] = 'CRISPR' -adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control' +# adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control' adata.obs['cancer'] = True adata.obs['disease'] = 'myelogenous leukemia' adatas['K562_TFs__7_days'] = adata - +adata.write_h5ad("DixitRegev2016_K562_TFs_7days.h5ad",compression="gzip") # GSM2396859_K562_TFs__13_days -path = DIR+'GSE90063/supp/' -folder = 'Supp_GSM2396859_K562_TFs__13_days' +path = DIR+'GSE90063/' +folder = 'GSM2396859_k562_tfs_13' files = get_files(path+folder, full_path=False) name = os.path.commonprefix(files) @@ -107,7 +107,7 @@ for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','): if barcode in adata.obs_names: val = adata.obs.loc[barcode][key] - adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna + adata.obs.loc[barcode,key] = grna if val is None else val+' + '+grna adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna] adata.obs = adata.obs.rename({'grna': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1) adata.obs['moi'] = 'normal' @@ -115,15 +115,16 @@ adata.obs['cell_line'] = 'K562' adata.obs['celltype'] = 'lymphoblasts' adata.obs['perturbation_type'] = 'CRISPR' -adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control' +# adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control' adata.obs['cancer'] = True adata.obs['disease'] = 'myelogenous leukemia' adatas['K562_TFs__13_days'] = adata +adata.write_h5ad("DixitRegev2016_K562_TFs_13days.h5ad",compression="gzip") # GSM2396860_K562_TFs__High_MOI -path = DIR+'GSE90063/supp/' -folder = 'Supp_GSM2396860_K562_TFs__High_MOI' +path = DIR+'GSE90063/' +folder = 'GSM2396860_k562_tfs_highmoi' files = get_files(path+folder, full_path=False) name = os.path.commonprefix(files) @@ -155,44 +156,54 @@ for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','): if barcode in adata.obs_names: val = adata.obs.loc[barcode][key] - adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna + adata.obs.loc[barcode,key] = grna if val is None else val+' + '+grna adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna_strict] adata.obs = adata.obs.rename({'grna_strict': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1) adata.obs['moi'] = 'high' adata.obs['cell_line'] = 'K562' adata.obs['celltype'] = 'lymphoblasts' adata.obs['perturbation_type'] = 'CRISPR' -adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control' +# shift to treating non-targeting guides as control +# adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control' adata.obs['cancer'] = True adata.obs['disease'] = 'myelogenous leukemia' adatas['K562_TFs__High_MOI'] = adata +adata.write_h5ad("DixitRegev2016_K562_TFs_highMOI.h5ad",compression="gzip") + +for key in adatas: + print(key) + adata = adatas[key] + adata.obs['tissue_type']='cell_line' + adata.obs['organism'] = 'human' + obs = adata.obs + obs.to_csv('test_obs'+key+'.csv') + if 'grna_lenient' in adata.obs.columns: + adata.obs['grna_lenient']=adata.obs['grna_lenient'].str.replace(' + ',';', regex=False) + adata.obs['guide_id']= adata.obs['perturbation'] + adata.obs['perturbation'] = adata.obs['perturbation'].astype(str) + adata.obs.loc[adata.obs['perturbation'].str.contains('INTERGENIC') &~ adata.obs['perturbation'].str.contains('+', regex=False).astype('bool'),'perturbation'] = 'control' + + if 'grna_lenient' in adata.obs.columns: + adata.obs['grna_lenient']=adata.obs['grna_lenient'].str.replace('_','-', regex=False) + + adata.obs['perturbation']=adata.obs['perturbation'].str.replace('_','-', regex=False) + adata.obs['perturbation']=adata.obs['perturbation'].str.replace(' + ','_', regex=False) + adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt' + adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL")) + qc = sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False) + + adata.obs['ncounts'] = qc[0]['total_counts'] + adata.obs['ngenes'] = qc[0]['n_genes_by_counts'] + adata.obs['percent_mito'] = qc[0]['pct_counts_mt'] + adata.obs['percent_ribo'] = qc[0]['pct_counts_ribo'] + adata.var['ncounts'] = qc[1]['total_counts'] + adata.var['ncells'] = qc[1]['n_cells_by_counts'] + adata.obs['nperts'] = adata.obs['perturbation'].str.count('_') + 1 + adata.obs['tissue_type']='cell_line' + adata.obs['organism'] = 'human' + adata.write_h5ad('DixitRegev2016_'+key +'.h5ad', compression='gzip') + + + + -superdata = sc.concat(adatas, index_unique='-', label='library') -superdata.obs['tissue_type']='cell_line' -superdata.obs['organism'] = 'human' - -obs = adata.obs -obs['grna_lenient']=obs['grna_lenient'].str.replace(' + ',';', regex=False) -obs['guide_id']= obs['perturbation'] -## UNCOMMENT TO CHANGE TO HAVE INTERGENIC GUIDES BE TREATED AS CONTROL CONDITION - -#obs.loc[obs['perturbation'].str.contains('INTERGENIC') &~ obs['perturbation'].str.contains('+', regex=False),'perturbation'] = 'control' - -obs['grna_lenient']=obs['grna_lenient'].str.replace('_','-', regex=False) -obs['perturbation']=obs['perturbation'].str.replace('_','-', regex=False) -obs['perturbation']=obs['perturbation'].str.replace(' + ','_', regex=False) -adata.obs = obs -adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt' -adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL")) -qc = sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False) -# -# save file -adata.obs['ncounts'] = qc[0]['total_counts'] -adata.obs['ngenes'] = qc[0]['n_genes_by_counts'] -adata.obs['percent_mito'] = qc[0]['pct_counts_mt'] -adata.obs['percent_ribo'] = qc[0]['pct_counts_ribo'] -adata.var['ncounts'] = qc[1]['total_counts'] -adata.var['ncells'] = qc[1]['n_cells_by_counts'] -adata.obs['nperts'] = adata.obs['perturbation'].str.count('_') + 1 - -write_as_singles(adata, '/fast/work/users/peidlis_c/data/perturbation_resource_paper/', 'DixitRegev2016', add_h5=True)