Skip to content

Commit

Permalink
fixed dataset processing
Browse files Browse the repository at this point in the history
added fixed ShifrutMarson notebook and updated DixitRegev script
  • Loading branch information
tessadgreen committed Aug 21, 2024
1 parent 67db2b4 commit 0895562
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 60 deletions.
Binary file modified dataset_processing/.DS_Store
Binary file not shown.
17 changes: 2 additions & 15 deletions dataset_processing/notebooks/ShifrutMarson2018.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,10 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Caskroom/mambaforge/base/envs/pertpy/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n"
]
}
],
"outputs": [],
"source": [
"import pertpy as pt\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import scanpy as sc\n",
"import anndata as ad\n",
"import pandas as pd "
Expand Down
101 changes: 56 additions & 45 deletions dataset_processing/scripts/DixitRegev2016.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,20 @@

from process_supp import *
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../')))
from utils import write_as_singles, read_from_singles, annotate_qc, assert_annotations

import yaml
config = yaml.safe_load(open("../../config.yaml", "r"))
config = yaml.safe_load(open("config.yaml", "r"))
DIR = config['DIR']
WDIR = config['WDIR']

path = DIR+'GSE90063/supp/'
path = DIR+'GSE90063/'
adatas = {}


# GSM2396858_K562_TFs__7_days
path = DIR+'GSE90063/supp/'
folder = 'Supp_GSM2396858_K562_TFs__7_days'
path = DIR+'GSE90063/'
folder = 'GSM2396858_k562_tfs_7'


files = get_files(path+folder, full_path=False)
name = os.path.commonprefix(files)
Expand Down Expand Up @@ -58,7 +58,7 @@
for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','):
if barcode in adata.obs_names:
val = adata.obs.loc[barcode][key]
adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna
adata.obs.loc[barcode,key] = grna if val is None else val+' + '+grna

adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna]
adata.obs = adata.obs.rename({'grna': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1)
Expand All @@ -67,15 +67,15 @@
adata.obs['cell_line'] = 'K562'
adata.obs['celltype'] = 'lymphoblasts'
adata.obs['perturbation_type'] = 'CRISPR'
adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control'
# adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control'
adata.obs['cancer'] = True
adata.obs['disease'] = 'myelogenous leukemia'
adatas['K562_TFs__7_days'] = adata

adata.write_h5ad("DixitRegev2016_K562_TFs_7days.h5ad",compression="gzip")

# GSM2396859_K562_TFs__13_days
path = DIR+'GSE90063/supp/'
folder = 'Supp_GSM2396859_K562_TFs__13_days'
path = DIR+'GSE90063/'
folder = 'GSM2396859_k562_tfs_13'

files = get_files(path+folder, full_path=False)
name = os.path.commonprefix(files)
Expand Down Expand Up @@ -107,23 +107,24 @@
for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','):
if barcode in adata.obs_names:
val = adata.obs.loc[barcode][key]
adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna
adata.obs.loc[barcode,key] = grna if val is None else val+' + '+grna
adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna]
adata.obs = adata.obs.rename({'grna': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1)
adata.obs['moi'] = 'normal'
adata.obs['time'] = 13*24 # 13 days
adata.obs['cell_line'] = 'K562'
adata.obs['celltype'] = 'lymphoblasts'
adata.obs['perturbation_type'] = 'CRISPR'
adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control'
# adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control'
adata.obs['cancer'] = True
adata.obs['disease'] = 'myelogenous leukemia'
adatas['K562_TFs__13_days'] = adata
adata.write_h5ad("DixitRegev2016_K562_TFs_13days.h5ad",compression="gzip")


# GSM2396860_K562_TFs__High_MOI
path = DIR+'GSE90063/supp/'
folder = 'Supp_GSM2396860_K562_TFs__High_MOI'
path = DIR+'GSE90063/'
folder = 'GSM2396860_k562_tfs_highmoi'

files = get_files(path+folder, full_path=False)
name = os.path.commonprefix(files)
Expand Down Expand Up @@ -155,44 +156,54 @@
for barcode in cbc_gbc_dict.loc[grna][1].replace(' ','').split(','):
if barcode in adata.obs_names:
val = adata.obs.loc[barcode][key]
adata.obs.loc[barcode][key] = grna if val is None else val+' + '+grna
adata.obs.loc[barcode,key] = grna if val is None else val+' + '+grna
adata.obs['target'] = [x.replace('p_sg', '').replace('p_', '').split('_')[0] if type(x)==str else None for x in adata.obs.grna_strict]
adata.obs = adata.obs.rename({'grna_strict': 'perturbation'}, axis=1).drop(['identifier_1', 'identifier_0'], axis=1)
adata.obs['moi'] = 'high'
adata.obs['cell_line'] = 'K562'
adata.obs['celltype'] = 'lymphoblasts'
adata.obs['perturbation_type'] = 'CRISPR'
adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control'
# shift to treating non-targeting guides as control
# adata.obs.perturbation[pd.isna(adata.obs.perturbation)] = 'control'
adata.obs['cancer'] = True
adata.obs['disease'] = 'myelogenous leukemia'
adatas['K562_TFs__High_MOI'] = adata
adata.write_h5ad("DixitRegev2016_K562_TFs_highMOI.h5ad",compression="gzip")

for key in adatas:
print(key)
adata = adatas[key]
adata.obs['tissue_type']='cell_line'
adata.obs['organism'] = 'human'
obs = adata.obs
obs.to_csv('test_obs'+key+'.csv')
if 'grna_lenient' in adata.obs.columns:
adata.obs['grna_lenient']=adata.obs['grna_lenient'].str.replace(' + ',';', regex=False)
adata.obs['guide_id']= adata.obs['perturbation']
adata.obs['perturbation'] = adata.obs['perturbation'].astype(str)
adata.obs.loc[adata.obs['perturbation'].str.contains('INTERGENIC') &~ adata.obs['perturbation'].str.contains('+', regex=False).astype('bool'),'perturbation'] = 'control'

if 'grna_lenient' in adata.obs.columns:
adata.obs['grna_lenient']=adata.obs['grna_lenient'].str.replace('_','-', regex=False)

adata.obs['perturbation']=adata.obs['perturbation'].str.replace('_','-', regex=False)
adata.obs['perturbation']=adata.obs['perturbation'].str.replace(' + ','_', regex=False)
adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))
qc = sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False)

adata.obs['ncounts'] = qc[0]['total_counts']
adata.obs['ngenes'] = qc[0]['n_genes_by_counts']
adata.obs['percent_mito'] = qc[0]['pct_counts_mt']
adata.obs['percent_ribo'] = qc[0]['pct_counts_ribo']
adata.var['ncounts'] = qc[1]['total_counts']
adata.var['ncells'] = qc[1]['n_cells_by_counts']
adata.obs['nperts'] = adata.obs['perturbation'].str.count('_') + 1
adata.obs['tissue_type']='cell_line'
adata.obs['organism'] = 'human'
adata.write_h5ad('DixitRegev2016_'+key +'.h5ad', compression='gzip')





superdata = sc.concat(adatas, index_unique='-', label='library')
superdata.obs['tissue_type']='cell_line'
superdata.obs['organism'] = 'human'

obs = adata.obs
obs['grna_lenient']=obs['grna_lenient'].str.replace(' + ',';', regex=False)
obs['guide_id']= obs['perturbation']
## UNCOMMENT TO CHANGE TO HAVE INTERGENIC GUIDES BE TREATED AS CONTROL CONDITION

#obs.loc[obs['perturbation'].str.contains('INTERGENIC') &~ obs['perturbation'].str.contains('+', regex=False),'perturbation'] = 'control'

obs['grna_lenient']=obs['grna_lenient'].str.replace('_','-', regex=False)
obs['perturbation']=obs['perturbation'].str.replace('_','-', regex=False)
obs['perturbation']=obs['perturbation'].str.replace(' + ','_', regex=False)
adata.obs = obs
adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'
adata.var['ribo'] = adata.var_names.str.startswith(("RPS","RPL"))
qc = sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False)
#
# save file
adata.obs['ncounts'] = qc[0]['total_counts']
adata.obs['ngenes'] = qc[0]['n_genes_by_counts']
adata.obs['percent_mito'] = qc[0]['pct_counts_mt']
adata.obs['percent_ribo'] = qc[0]['pct_counts_ribo']
adata.var['ncounts'] = qc[1]['total_counts']
adata.var['ncells'] = qc[1]['n_cells_by_counts']
adata.obs['nperts'] = adata.obs['perturbation'].str.count('_') + 1

write_as_singles(adata, '/fast/work/users/peidlis_c/data/perturbation_resource_paper/', 'DixitRegev2016', add_h5=True)

0 comments on commit 0895562

Please sign in to comment.