diff --git a/dataset_processing/.DS_Store b/dataset_processing/.DS_Store index c8a89b7..0ea71d3 100644 Binary files a/dataset_processing/.DS_Store and b/dataset_processing/.DS_Store differ diff --git a/dataset_processing/notebooks/ShifrutMarson2018.ipynb b/dataset_processing/notebooks/ShifrutMarson2018.ipynb new file mode 100644 index 0000000..df1d568 --- /dev/null +++ b/dataset_processing/notebooks/ShifrutMarson2018.ipynb @@ -0,0 +1,836 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/mambaforge/base/envs/pertpy/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], + "source": [ + "import pertpy as pt\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import scanpy as sc\n", + "import anndata as ad\n", + "import pandas as pd " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "adata_d1n = sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375483_D1N_matrix')\n", + "adata_d1n.obs.index = [x.split('-')[0] for x in adata_d1n.obs.index]\n", + "adata_d1n.obs['sample'] = 'D1_nostim'\n", + "adata_d1n.obs['patient'] = 'D1'\n", + "\n", + "\n", + "adata_d2n = sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375485_D2N_matrix')\n", + "adata_d2n.obs.index = [x.split('-')[0] for x in adata_d2n.obs.index]\n", + "adata_d2n.obs['sample'] = 'D2_nostim'\n", + "adata_d2n.obs['patient'] = 'D2'\n", + "\n", + "adata_d1s = 
sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375484_D1S_matrix')\n", + "adata_d1s.obs.index = [x.split('-')[0] for x in adata_d1s.obs.index]\n", + "adata_d1s.obs['sample'] = 'D1_stim'\n", + "adata_d1s.obs['patient'] = 'D1'\n", + "\n", + "adata_d2s = sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375486_D2S_matrix')\n", + "adata_d2s.obs.index = [x.split('-')[0] for x in adata_d2s.obs.index]\n", + "adata_d2s.obs['sample'] = 'D2_stim'\n", + "adata_d2s.obs['patient'] = 'D2'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "obsmat_d1n = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375487_D1N_CellBC_sgRNA.csv', index_col=0)\n", + "obsmat_d2n = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375489_D2N_CellBC_sgRNA.csv', index_col=0)\n", + "obsmat_d1s = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375488_D1S_CellBC_sgRNA.csv', index_col=0)\n", + "obsmat_d2s = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375490_D2S_CellBC_sgRNA.csv', index_col=0)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def integrate_obsmat(adata, obsmat, copy=True):\n", + " \"\"\" set the corresponding columns of adata.obs to the values in obsmat.\n", + " copy boolean slows performance but prevents modification of original adata.\"\"\"\n", + " if copy:\n", + " adata = adata.copy() # make a copy so we don't modify the original\n", + " adata.obs['guide_id'] = 'NA'\n", + " adata.obs['guide_counts']= 0\n", + " for i in obsmat.index:\n", + " if i in adata.obs.index:\n", + " adata.obs.loc[i, 'guide_id'] = obsmat.loc[i]['gRNA.ID']\n", + " adata.obs.loc[i, 'guide_counts'] = obsmat.loc[i]['UMI.count']\n", + " return(adata)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "adata_d1n = integrate_obsmat(adata_d1n, obsmat_d1n, copy=False)\n", + "adata_d2n = integrate_obsmat(adata_d2n, obsmat_d2n, copy=False)\n", + "adata_d1s = integrate_obsmat(adata_d1s, obsmat_d1s, copy=False)\n", + "adata_d2s = integrate_obsmat(adata_d2s, obsmat_d2s, copy=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/mambaforge/base/envs/pertpy/lib/python3.11/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n", + " utils.warn_names_duplicates(\"obs\")\n" + ] + } + ], + "source": [ + "adata = ad.concat([adata_d1n, adata_d2n, adata_d1s, adata_d2s], join='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs_names_make_unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sample\n", + "D1_nostim 11105\n", + "D1_stim 15829\n", + "D2_nostim 11486\n", + "D2_stim 13816\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.groupby('sample').size()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AAACCTGAGACACTAA [NA]\n", + "AAACCTGAGAGACTTA [NA]\n", + "AAACCTGAGCATCATC [NA]\n", + "AAACCTGAGCGATTCT [ES, sg26, PDCD1]\n", + "AAACCTGAGGGCTTCC [NA]\n", + " ... 
\n", + "TTTGTCATCCTCAACC [NA]\n", + "TTTGTCATCTCGCATC [NA]\n", + "TTTGTCATCTTAGAGC [ES, sg34, TCEB2]\n", + "TTTGTCATCTTATCTG [ES, sg35, TCEB2]\n", + "TTTGTCATCTTGTCAT [NA]\n", + "Name: guide_id, Length: 52236, dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs['guide_id'].str.split('.')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# split the entries of column 'guide_id' on periods (e.g. 'ES.sg26.PDCD1') and\n", + "# store the third element (the target gene) in a new 'target' column\n", + "\n", + "adata.obs['target'] = adata.obs['guide_id'].str.split('.').str[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs['perturbation'] = adata.obs['target']\n", + "# set all NaN values to \"control\"\n", + "adata.obs['perturbation'] = adata.obs['perturbation'].fillna('control')\n", + "# set all NonTarget values to \"control\"\n", + "adata.obs['perturbation'] = adata.obs['perturbation'].replace('NonTarget', 'control')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs['perturbation_2'] = adata.obs['sample'].str.split('_').str[1]\n", + "\n", + "# set all \"nostim\" values to \"control\"\n", + "adata.obs['perturbation_2'] = adata.obs['perturbation_2'].replace('nostim', 'control')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "perturbation\n", + "control 30683\n", + "DGKA 2296\n", + "PDCD1 1484\n", + "TMEM222 1426\n", + "BTLA 1412\n", + "HAVCR2 1355\n", + "CBLB 1327\n", + "CD5 1080\n", + "C10orf54 1058\n", + "MEF2D 1026\n", + "DGKZ 1020\n", + "LCP2 981\n", + "TCEB2 929\n", + "RASA2 905\n", + "CD3D 856\n", + "LAG3 840\n", + "SOCS1 835\n", + "TNFRSF9 777\n", + "CDKN1B 749\n", + "ARID1A 625\n", + "STAT6 
572\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs['perturbation'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs['disease']= \"healthy\"\n", + "adata.obs['cancer']= False\n", + "adata.obs['tissue_type']=\"primary\"\n", + "adata.obs['organism']=\"human\"\n", + "adata.obs['perturbation_type']=\"CRISPR\"\n", + "adata.obs['perturbation_type_2']= \"TCR stimulation\"\n", + "adata.obs['nperts']=1\n", + "adata.obs['celltype']=\"T cells\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'\n", + "adata.var['ribo']= adata.var_names.str.startswith('RPS') | adata.var_names.str.startswith('RPL') # annotate the group of ribosomal genes as 'ribo'" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "qc = sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "adata.obs['ncounts'] = qc[0]['total_counts']\n", + "adata.obs['ngenes'] = qc[0]['n_genes_by_counts']\n", + "adata.obs['percent_mito'] = qc[0]['pct_counts_mt']\n", + "adata.obs['percent_ribo'] = qc[0]['pct_counts_ribo']\n", + "adata.var['ncounts'] = qc[1]['total_counts']\n", + "adata.var['ncells'] = qc[1]['n_cells_by_counts']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var.drop(columns=['mt', 'ribo'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | ncounts | \n", + "ncells | \n", + "
---|---|---|
RP11-34P13.3 | \n", + "0.0 | \n", + "0 | \n", + "
FAM138A | \n", + "0.0 | \n", + "0 | \n", + "
OR4F5 | \n", + "0.0 | \n", + "0 | \n", + "
RP11-34P13.7 | \n", + "73.0 | \n", + "73 | \n", + "
RP11-34P13.8 | \n", + "14.0 | \n", + "14 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
AC233755.2 | \n", + "0.0 | \n", + "0 | \n", + "
AC233755.1 | \n", + "4.0 | \n", + "4 | \n", + "
AC240274.1 | \n", + "296.0 | \n", + "287 | \n", + "
AC213203.1 | \n", + "0.0 | \n", + "0 | \n", + "
FAM231B | \n", + "0.0 | \n", + "0 | \n", + "
33694 rows × 2 columns
\n", + "\n", + " | sample | \n", + "patient | \n", + "guide_id | \n", + "guide_counts | \n", + "target | \n", + "perturbation | \n", + "perturbation_2 | \n", + "disease | \n", + "cancer | \n", + "tissue_type | \n", + "organism | \n", + "perturbation_type | \n", + "perturbation_type_2 | \n", + "nperts | \n", + "celltype | \n", + "ncounts | \n", + "ngenes | \n", + "percent_mito | \n", + "percent_ribo | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AAACCTGAGACACTAA | \n", + "D1_nostim | \n", + "D1 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "control | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "4367.0 | \n", + "1716 | \n", + "1.854820 | \n", + "26.402565 | \n", + "
AAACCTGAGAGACTTA | \n", + "D1_nostim | \n", + "D1 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "control | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "5846.0 | \n", + "1998 | \n", + "3.250086 | \n", + "30.294218 | \n", + "
AAACCTGAGCATCATC | \n", + "D1_nostim | \n", + "D1 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "control | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "3377.0 | \n", + "1438 | \n", + "2.931596 | \n", + "28.042641 | \n", + "
AAACCTGAGCGATTCT | \n", + "D1_nostim | \n", + "D1 | \n", + "ES.sg26.PDCD1 | \n", + "5 | \n", + "PDCD1 | \n", + "PDCD1 | \n", + "control | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "5710.0 | \n", + "1993 | \n", + "2.416813 | \n", + "33.047287 | \n", + "
AAACCTGAGGGCTTCC | \n", + "D1_nostim | \n", + "D1 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "control | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "3077.0 | \n", + "1266 | \n", + "0.747481 | \n", + "36.529087 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
TTTGTCATCCTCAACC | \n", + "D2_stim | \n", + "D2 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "stim | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "7121.0 | \n", + "2333 | \n", + "2.892852 | \n", + "25.530121 | \n", + "
TTTGTCATCTCGCATC | \n", + "D2_stim | \n", + "D2 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "stim | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "4100.0 | \n", + "1535 | \n", + "1.975610 | \n", + "34.439026 | \n", + "
TTTGTCATCTTAGAGC | \n", + "D2_stim | \n", + "D2 | \n", + "ES.sg34.TCEB2 | \n", + "2 | \n", + "TCEB2 | \n", + "TCEB2 | \n", + "stim | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "4293.0 | \n", + "1556 | \n", + "2.073142 | \n", + "35.616119 | \n", + "
TTTGTCATCTTATCTG | \n", + "D2_stim | \n", + "D2 | \n", + "ES.sg35.TCEB2 | \n", + "6 | \n", + "TCEB2 | \n", + "TCEB2 | \n", + "stim | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "7735.0 | \n", + "2532 | \n", + "2.301228 | \n", + "27.666452 | \n", + "
TTTGTCATCTTGTCAT | \n", + "D2_stim | \n", + "D2 | \n", + "NA | \n", + "0 | \n", + "NaN | \n", + "control | \n", + "stim | \n", + "healthy | \n", + "False | \n", + "primary | \n", + "human | \n", + "CRISPR | \n", + "TCR stimulation | \n", + "1 | \n", + "T cells | \n", + "3981.0 | \n", + "1651 | \n", + "3.365988 | \n", + "27.530771 | \n", + "
52236 rows × 19 columns
\n", + "