diff --git a/dataset_processing/.DS_Store b/dataset_processing/.DS_Store index c8a89b7..0ea71d3 100644 Binary files a/dataset_processing/.DS_Store and b/dataset_processing/.DS_Store differ diff --git a/dataset_processing/notebooks/ShifrutMarson2018.ipynb b/dataset_processing/notebooks/ShifrutMarson2018.ipynb new file mode 100644 index 0000000..df1d568 --- /dev/null +++ b/dataset_processing/notebooks/ShifrutMarson2018.ipynb @@ -0,0 +1,836 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/mambaforge/base/envs/pertpy/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.\n" + ] + } + ], + "source": [ + "import pertpy as pt\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import scanpy as sc\n", + "import anndata as ad\n", + "import pandas as pd " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "adata_d1n = sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375483_D1N_matrix')\n", + "adata_d1n.obs.index = [x.split('-')[0] for x in adata_d1n.obs.index]\n", + "adata_d1n.obs['sample'] = 'D1_nostim'\n", + "adata_d1n.obs['patient'] = 'D1'\n", + "\n", + "\n", + "adata_d2n = sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375485_D2N_matrix')\n", + "adata_d2n.obs.index = [x.split('-')[0] for x in adata_d2n.obs.index]\n", + "adata_d2n.obs['sample'] = 'D2_nostim'\n", + "adata_d2n.obs['patient'] = 'D2'\n", + "\n", + "adata_d1s = 
sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375484_D1S_matrix')\n", + "adata_d1s.obs.index = [x.split('-')[0] for x in adata_d1s.obs.index]\n", + "adata_d1s.obs['sample'] = 'D1_stim'\n", + "adata_d1s.obs['patient'] = 'D1'\n", + "\n", + "adata_d2s = sc.read_10x_mtx('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375486_D2S_matrix')\n", + "adata_d2s.obs.index = [x.split('-')[0] for x in adata_d2s.obs.index]\n", + "adata_d2s.obs['sample'] = 'D2_stim'\n", + "adata_d2s.obs['patient'] = 'D2'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "obsmat_d1n = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375487_D1N_CellBC_sgRNA.csv', index_col=0)\n", + "obsmat_d2n = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375489_D2N_CellBC_sgRNA.csv', index_col=0)\n", + "obsmat_d1s = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375488_D1S_CellBC_sgRNA.csv', index_col=0)\n", + "obsmat_d2s = pd.read_csv('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/GSE119450_RAW/GSM3375490_D2S_CellBC_sgRNA.csv', index_col=0)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def integrate_obsmat(adata, obsmat, copy=True):\n", + " \"\"\" set the corresponding columns of adata.obs to the values in obsmat.\n", + " copy boolean slows performance but prevents modification of original adata.\"\"\"\n", + " if copy:\n", + " adata = adata.copy() # make a copy so we don't modify the original\n", + " adata.obs['guide_id'] = 'NA'\n", + " adata.obs['guide_counts']= 0\n", + " for i in obsmat.index:\n", + " if i in adata.obs.index:\n", + " adata.obs.loc[i, 'guide_id'] = obsmat.loc[i]['gRNA.ID']\n", + " adata.obs.loc[i, 'guide_counts'] = obsmat.loc[i]['UMI.count']\n", + " return(adata)" + ] 
+ }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "adata_d1n = integrate_obsmat(adata_d1n, obsmat_d1n, copy=False)\n", + "adata_d2n = integrate_obsmat(adata_d2n, obsmat_d2n, copy=False)\n", + "adata_d1s = integrate_obsmat(adata_d1s, obsmat_d1s, copy=False)\n", + "adata_d2s = integrate_obsmat(adata_d2s, obsmat_d2s, copy=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/homebrew/Caskroom/mambaforge/base/envs/pertpy/lib/python3.11/site-packages/anndata/_core/anndata.py:1818: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.\n", + " utils.warn_names_duplicates(\"obs\")\n" + ] + } + ], + "source": [ + "adata = ad.concat([adata_d1n, adata_d2n, adata_d1s, adata_d2s], join='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs_names_make_unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sample\n", + "D1_nostim 11105\n", + "D1_stim 15829\n", + "D2_nostim 11486\n", + "D2_stim 13816\n", + "dtype: int64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs.groupby('sample').size()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AAACCTGAGACACTAA [NA]\n", + "AAACCTGAGAGACTTA [NA]\n", + "AAACCTGAGCATCATC [NA]\n", + "AAACCTGAGCGATTCT [ES, sg26, PDCD1]\n", + "AAACCTGAGGGCTTCC [NA]\n", + " ... 
\n", + "TTTGTCATCCTCAACC [NA]\n", + "TTTGTCATCTCGCATC [NA]\n", + "TTTGTCATCTTAGAGC [ES, sg34, TCEB2]\n", + "TTTGTCATCTTATCTG [ES, sg35, TCEB2]\n", + "TTTGTCATCTTGTCAT [NA]\n", + "Name: guide_id, Length: 52236, dtype: object" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs['guide_id'].str.split('.')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# split each 'guide_id' entry on periods (e.g. 'ES.sg26.PDCD1') and store the\n", + "# third element in a new 'target' column; IDs without periods (e.g. 'NA') become NaN\n", + "\n", + "adata.obs['target'] = adata.obs['guide_id'].str.split('.').str[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs['perturbation'] = adata.obs['target']\n", + "# set all NaN values to \"control\"\n", + "adata.obs['perturbation'] = adata.obs['perturbation'].fillna('control')\n", + "# set all NonTarget values to \"control\"\n", + "adata.obs['perturbation'] = adata.obs['perturbation'].replace('NonTarget', 'control')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs['perturbation_2'] = adata.obs['sample'].str.split('_').str[1]\n", + "\n", + "# set all \"nostim\" values to \"control\"\n", + "adata.obs['perturbation_2'] = adata.obs['perturbation_2'].replace('nostim', 'control')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "perturbation\n", + "control 30683\n", + "DGKA 2296\n", + "PDCD1 1484\n", + "TMEM222 1426\n", + "BTLA 1412\n", + "HAVCR2 1355\n", + "CBLB 1327\n", + "CD5 1080\n", + "C10orf54 1058\n", + "MEF2D 1026\n", + "DGKZ 1020\n", + "LCP2 981\n", + "TCEB2 929\n", + "RASA2 905\n", + "CD3D 856\n", + "LAG3 840\n", + "SOCS1 835\n", + "TNFRSF9 777\n", + "CDKN1B 749\n", + "ARID1A 625\n", + "STAT6 
572\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs['perturbation'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "adata.obs['disease']= \"healthy\"\n", + "adata.obs['cancer']= False\n", + "adata.obs['tissue_type']=\"primary\"\n", + "adata.obs['organism']=\"human\"\n", + "adata.obs['perturbation_type']=\"CRISPR\"\n", + "adata.obs['perturbation_type_2']= \"TCR stimulation\"\n", + "adata.obs['nperts']=1\n", + "adata.obs['celltype']=\"T cells\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var['mt'] = adata.var_names.str.startswith('MT-') # annotate the group of mitochondrial genes as 'mt'\n", + "adata.var['ribo']= adata.var_names.str.startswith('RPS') | adata.var_names.str.startswith('RPL') # annotate the group of ribosomal genes as 'ribo'" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "qc = sc.pp.calculate_qc_metrics(adata, qc_vars=['mt','ribo'], percent_top=None, log1p=False, inplace=False) " + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "adata.obs['ncounts'] = qc[0]['total_counts']\n", + "adata.obs['ngenes'] = qc[0]['n_genes_by_counts']\n", + "adata.obs['percent_mito'] = qc[0]['pct_counts_mt']\n", + "adata.obs['percent_ribo'] = qc[0]['pct_counts_ribo']\n", + "adata.var['ncounts'] = qc[1]['total_counts']\n", + "adata.var['ncells'] = qc[1]['n_cells_by_counts']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "adata.var.drop(columns=['mt', 'ribo'], inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ncountsncells
RP11-34P13.30.00
FAM138A0.00
OR4F50.00
RP11-34P13.773.073
RP11-34P13.814.014
.........
AC233755.20.00
AC233755.14.04
AC240274.1296.0287
AC213203.10.00
FAM231B0.00
\n", + "

33694 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " ncounts ncells\n", + "RP11-34P13.3 0.0 0\n", + "FAM138A 0.0 0\n", + "OR4F5 0.0 0\n", + "RP11-34P13.7 73.0 73\n", + "RP11-34P13.8 14.0 14\n", + "... ... ...\n", + "AC233755.2 0.0 0\n", + "AC233755.1 4.0 4\n", + "AC240274.1 296.0 287\n", + "AC213203.1 0.0 0\n", + "FAM231B 0.0 0\n", + "\n", + "[33694 rows x 2 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.var" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
samplepatientguide_idguide_countstargetperturbationperturbation_2diseasecancertissue_typeorganismperturbation_typeperturbation_type_2npertscelltypencountsngenespercent_mitopercent_ribo
AAACCTGAGACACTAAD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells4367.017161.85482026.402565
AAACCTGAGAGACTTAD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells5846.019983.25008630.294218
AAACCTGAGCATCATCD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells3377.014382.93159628.042641
AAACCTGAGCGATTCTD1_nostimD1ES.sg26.PDCD15PDCD1PDCD1controlhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells5710.019932.41681333.047287
AAACCTGAGGGCTTCCD1_nostimD1NA0NaNcontrolcontrolhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells3077.012660.74748136.529087
............................................................
TTTGTCATCCTCAACCD2_stimD2NA0NaNcontrolstimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells7121.023332.89285225.530121
TTTGTCATCTCGCATCD2_stimD2NA0NaNcontrolstimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells4100.015351.97561034.439026
TTTGTCATCTTAGAGCD2_stimD2ES.sg34.TCEB22TCEB2TCEB2stimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells4293.015562.07314235.616119
TTTGTCATCTTATCTGD2_stimD2ES.sg35.TCEB26TCEB2TCEB2stimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells7735.025322.30122827.666452
TTTGTCATCTTGTCATD2_stimD2NA0NaNcontrolstimhealthyFalseprimaryhumanCRISPRTCR stimulation1T cells3981.016513.36598827.530771
\n", + "

52236 rows × 19 columns

\n", + "
" + ], + "text/plain": [ + " sample patient guide_id guide_counts target \\\n", + "AAACCTGAGACACTAA D1_nostim D1 NA 0 NaN \n", + "AAACCTGAGAGACTTA D1_nostim D1 NA 0 NaN \n", + "AAACCTGAGCATCATC D1_nostim D1 NA 0 NaN \n", + "AAACCTGAGCGATTCT D1_nostim D1 ES.sg26.PDCD1 5 PDCD1 \n", + "AAACCTGAGGGCTTCC D1_nostim D1 NA 0 NaN \n", + "... ... ... ... ... ... \n", + "TTTGTCATCCTCAACC D2_stim D2 NA 0 NaN \n", + "TTTGTCATCTCGCATC D2_stim D2 NA 0 NaN \n", + "TTTGTCATCTTAGAGC D2_stim D2 ES.sg34.TCEB2 2 TCEB2 \n", + "TTTGTCATCTTATCTG D2_stim D2 ES.sg35.TCEB2 6 TCEB2 \n", + "TTTGTCATCTTGTCAT D2_stim D2 NA 0 NaN \n", + "\n", + " perturbation perturbation_2 disease cancer tissue_type \\\n", + "AAACCTGAGACACTAA control control healthy False primary \n", + "AAACCTGAGAGACTTA control control healthy False primary \n", + "AAACCTGAGCATCATC control control healthy False primary \n", + "AAACCTGAGCGATTCT PDCD1 control healthy False primary \n", + "AAACCTGAGGGCTTCC control control healthy False primary \n", + "... ... ... ... ... ... \n", + "TTTGTCATCCTCAACC control stim healthy False primary \n", + "TTTGTCATCTCGCATC control stim healthy False primary \n", + "TTTGTCATCTTAGAGC TCEB2 stim healthy False primary \n", + "TTTGTCATCTTATCTG TCEB2 stim healthy False primary \n", + "TTTGTCATCTTGTCAT control stim healthy False primary \n", + "\n", + " organism perturbation_type perturbation_type_2 nperts \\\n", + "AAACCTGAGACACTAA human CRISPR TCR stimulation 1 \n", + "AAACCTGAGAGACTTA human CRISPR TCR stimulation 1 \n", + "AAACCTGAGCATCATC human CRISPR TCR stimulation 1 \n", + "AAACCTGAGCGATTCT human CRISPR TCR stimulation 1 \n", + "AAACCTGAGGGCTTCC human CRISPR TCR stimulation 1 \n", + "... ... ... ... ... 
\n", + "TTTGTCATCCTCAACC human CRISPR TCR stimulation 1 \n", + "TTTGTCATCTCGCATC human CRISPR TCR stimulation 1 \n", + "TTTGTCATCTTAGAGC human CRISPR TCR stimulation 1 \n", + "TTTGTCATCTTATCTG human CRISPR TCR stimulation 1 \n", + "TTTGTCATCTTGTCAT human CRISPR TCR stimulation 1 \n", + "\n", + " celltype ncounts ngenes percent_mito percent_ribo \n", + "AAACCTGAGACACTAA T cells 4367.0 1716 1.854820 26.402565 \n", + "AAACCTGAGAGACTTA T cells 5846.0 1998 3.250086 30.294218 \n", + "AAACCTGAGCATCATC T cells 3377.0 1438 2.931596 28.042641 \n", + "AAACCTGAGCGATTCT T cells 5710.0 1993 2.416813 33.047287 \n", + "AAACCTGAGGGCTTCC T cells 3077.0 1266 0.747481 36.529087 \n", + "... ... ... ... ... ... \n", + "TTTGTCATCCTCAACC T cells 7121.0 2333 2.892852 25.530121 \n", + "TTTGTCATCTCGCATC T cells 4100.0 1535 1.975610 34.439026 \n", + "TTTGTCATCTTAGAGC T cells 4293.0 1556 2.073142 35.616119 \n", + "TTTGTCATCTTATCTG T cells 7735.0 2532 2.301228 27.666452 \n", + "TTTGTCATCTTGTCAT T cells 3981.0 1651 3.365988 27.530771 \n", + "\n", + "[52236 rows x 19 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adata.obs" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "adata.write_h5ad('/Users/tessagreen/Documents/datasets/ShifrutMarson2018/ShifrutMarson2018.h5ad')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pertpy", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/dataset_processing/scripts/ShifrutMarson2018.py 
b/dataset_processing/scripts/ShifrutMarson2018.py deleted file mode 100644 index fe5cfc9..0000000 --- a/dataset_processing/scripts/ShifrutMarson2018.py +++ /dev/null @@ -1,67 +0,0 @@ -import scanpy as sc -import pandas as pd -import os -import sys -import numpy as np -import gzip -import matplotlib.pyplot as pl -import re - -from scipy.io import mmread -from scipy.sparse import csr_matrix -from tqdm import tqdm - -from process_supp import * -sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../'))) -from utils import write_as_singles, read_from_singles, annotate_qc, assert_annotations - -import yaml -config = yaml.safe_load(open("../../config.yaml", "r")) -DIR = config['DIR'] -WDIR = config['WDIR'] - -import tarfile -adatas = {} -folders = get_subfolders(DIR+'GSE119450/supp/') -for d in ['D1', 'D2']: - for cond in ['Stim', 'NoStim']: - sample_name = d+'_'+cond - gex_path=[x for x in folders if sample_name+'_10x' in x][0] - obs_path=[x for x in folders if sample_name+'_ReAmp' in x][0] - - # extract GEX and read - file = [f for f in get_files(gex_path) if '.gz' in f][0] - tar = tarfile.open(file, "r:gz") - tar.extractall(path=gex_path) - tar.close() - adata = sc.read_10x_mtx(gex_path) - adata.obs.index = [x.split('-')[0] for x in adata.obs.index] - - # read and add metadata - tab=pd.read_csv(get_files(obs_path)[0], index_col=0) - tab=tab.rename({'UMI.count': 'gRNA_UMI_count', 'gRNA.ID': 'gRNA_ID'}, axis=1)[['gRNA_ID', 'gRNA_UMI_count']] - adata.obs = pd.merge(adata.obs, tab, how='left', left_index=True, right_index=True) - adata.obs['donor'] = d - adata.obs['condition'] = cond - adata.obs['sample_name'] = sample_name - - adata.write(DIR+'GSE119450/supp/GSE119450' + sample_name + '_processed_supp.h5') - adatas[sample_name] = adata -adata = sc.concat(adatas, index_unique='-') - -# reform var -adata.var.index = adata.var.index.rename('gene_symbol') -# reform obs -adata.obs['cancer'] = False -adata.obs.index=adata.obs.index.rename('cell_barcode') 
-adata.obs=adata.obs.rename({'gRNA_ID': 'perturbation', 'donor': 'replicate', 'condition': 'perturbation_2', 'sample_name': 'library'}, axis=1) -adata.obs['celltype'] = 'T cells' -adata.obs['organism'] = 'human' -adata.obs['disease'] = 'healthy' -adata.obs['tissue_type']='primary' -adata.obs['perturbation_type'] = 'CRISPR' -adata.obs['perturbation_type_2'] = 'TCR stimulation' -adata.obs.perturbation = adata.obs.perturbation.astype(str) -adata.obs.perturbation[adata.obs.perturbation=='nan'] = 'control' - -adata.write(WDIR+'/ShifrutMarson2018.h5')