diff --git a/workflow/notebooks/misc/expression-and-diversity.ipynb b/workflow/notebooks/misc/expression-and-diversity.ipynb index 4d51173..c30806c 100644 --- a/workflow/notebooks/misc/expression-and-diversity.ipynb +++ b/workflow/notebooks/misc/expression-and-diversity.ipynb @@ -1,457 +1,528 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "51713823", - "metadata": {}, - "outputs": [], - "source": [ - "import anoexpress as xpress\n", - "import pandas as pd\n", - "import malariagen_data\n", - "import numpy as np\n", - "import plotly.express as px" - ] + "cells": [ + { + "cell_type": "code", + "source": [ + "%pip install anoexpress malariagen_data kaleido -U -qq" + ], + "metadata": { + "id": "-WNB2mwp6Le7" + }, + "id": "-WNB2mwp6Le7", + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51713823", + "metadata": { + "id": "51713823" + }, + "outputs": [], + "source": [ + "import anoexpress as xpress\n", + "import pandas as pd\n", + "import malariagen_data\n", + "import numpy as np\n", + "import plotly.express as px\n", + "import kaleido" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7df9188", + "metadata": { + "id": "a7df9188" + }, + "outputs": [], + "source": [ + "ag3 = malariagen_data.Ag3()" + ] + }, + { + "cell_type": "markdown", + "id": "663aa613", + "metadata": { + "id": "663aa613" + }, + "source": [ + "## Gene expression x genetic diversity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e741c3c", + "metadata": { + "id": "8e741c3c" + }, + "outputs": [], + "source": [ + "gd_df = pd.read_csv(f\"pi_pn_ps_new.tsv\", sep=\"\\t\").query(\"gene_id != 'gene_id'\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1faa0f6", + "metadata": { + "id": "c1faa0f6" + }, + "outputs": [], + "source": [ + "for col in gd_df.columns[1:5]:\n", + " gd_df[col] = gd_df[col].astype(float)\n", + "gd_df = gd_df.assign(pn_ps_ratio=lambda x:x.pn/x.ps)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2915da56", + "metadata": { + "id": "2915da56" + }, + "outputs": [], + "source": [ + "counts_df = xpress.data(data_type=\"log2counts\", analysis='gamb_colu_arab_fun')\n", + "metadata = xpress.sample_metadata(analysis='gamb_colu_arab_fun')" + ] + }, + { + "cell_type": "markdown", + "id": "ae99d81c", + "metadata": { + "id": "ae99d81c" + }, + "source": [ + "Lets first look at gene expression correlations between gambiae, coluzzii, and arabiensis." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7a19fb6", + "metadata": { + "id": "c7a19fb6" + }, + "outputs": [], + "source": [ + "species = metadata.species.unique()\n", + "\n", + "sp_counts = []\n", + "for sp in species:\n", + " ids = metadata.query(\"species == @sp\").sampleID\n", + " sp_counts.append(counts_df.loc[:, ids].apply(np.median, axis=1).to_frame().rename(columns={0:sp}))\n", + "\n", + "df = pd.concat(sp_counts, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4312a9a9", + "metadata": { + "id": "4312a9a9" + }, + "outputs": [], + "source": [ + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04a60ee8", + "metadata": { + "scrolled": false, + "id": "04a60ee8" + }, + "outputs": [], + "source": [ + "import itertools\n", + "for x, y in itertools.combinations(species, 2):\n", + "\n", + " fig = px.scatter(df,\n", + " x=x,\n", + " y=y,\n", + " opacity=0.3,\n", + " template='simple_white',\n", + " width=425,\n", + " height=400,\n", + " labels={x:f\"An. {x}\",\n", + " y:f\"An. {y}\"})#,\n", + " # title=f\"An. {x} v An. {y} log2 counts\")\n", + " fig.update_layout(font=dict(size=16), xaxis=dict(mirror=True), yaxis=dict(mirror=True))\n", + " fig.update_traces(marker=dict(size=6,\n", + " line=dict(width=1,\n", + " color='DarkSlateGrey')),\n", + " selector=dict(mode='markers'))\n", + " fig.write_image(f\"corr_{x}_{y}.png\")\n", + " fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed44f0d8", + "metadata": { + "id": "ed44f0d8" + }, + "outputs": [], + "source": [ + "counts_df = xpress.data(data_type=\"log2counts\", analysis='gamb_colu_arab')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8357746f", + "metadata": { + "scrolled": true, + "id": "8357746f" + }, + "outputs": [], + "source": [ + "median_counts = counts_df.apply(np.median, axis=1)\n", + "median_counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d0ab999", + "metadata": { + "scrolled": false, + "id": "8d0ab999" + }, + "outputs": [], + "source": [ + "median_counts = median_counts.to_frame().rename(columns={0:'median_log2counts'})\n", + "\n", + "#.assign(expression_percentile=bins_with_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51751298", + "metadata": { + "id": "51751298" + }, + "outputs": [], + "source": [ + "gd_df = gd_df.rename(columns={'gene_id':'GeneID'}).merge(median_counts.reset_index())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70e0f2e1", + "metadata": { + "id": "70e0f2e1" + }, + "outputs": [], + "source": [ + "# Define the bin edges as percentiles\n", + "bin_edges = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]\n", + "# Use pd.cut to create the bins\n", + "bins = pd.cut(gd_df.median_log2counts, bins=20, labels=False)\n", + "# Since you want bins in the format \"0-5%, 5-10%, ...\", you can create labels accordingly\n", + "labels = [f\"{bin_edges[i]}-{bin_edges[i+1]}%\" for i in range(len(bin_edges) - 1)]\n", + "# Add labels to the bins\n", + "bins_with_labels = [labels[i] for i in bins]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e07135c9", + "metadata": { + "id": "e07135c9" + }, + "outputs": [], + "source": [ + "gd_df = gd_df.assign(expression_percentile=bins_with_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75aff266", + "metadata": { + "id": "75aff266" + }, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def natural_sort(l):\n", + " convert = lambda text: int(text) if text.isdigit() else text.lower()\n", + " alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]\n", + " return sorted(l, key=alphanum_key)\n", + "\n", + "labels_order = natural_sort(np.unique(bins_with_labels))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffd6b8be", + "metadata": { + "id": "ffd6b8be" + }, + "outputs": [], + "source": [ + "# Define the custom sorting order\n", + "custom_order = ['coluzzii', 'gambiae', 'arabiensis']\n", + "\n", + "# Convert the 'species' column to a categorical data type with the custom order\n", + "gd_df['sp'] = pd.Categorical(gd_df['sp'], categories=custom_order, ordered=True)\n", + "\n", + "# Sort the DataFrame based on the 'species' column\n", + "gd_df = gd_df.sort_values('sp')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1588cb1e", + "metadata": { + "id": "1588cb1e" + }, + "outputs": [], + "source": [ + "fig = px.box(gd_df,\n", + " x='expression_percentile',\n", + " y='pn_ps_ratio',\n", + " color='sp',\n", + " labels={'pn_ps_ratio': 'pN/pS','expression_percentile':'Expression rate (percentile)'},\n", + " template='simple_white',\n", + " height=400,\n", + " width=700,\n", + " # points='suspectedoutliers',\n", + " title='Purifying Selection')\n", + "\n", + "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", + "fig.update_yaxes(range=[-0.1, 10])\n", + "fig.update_layout(showlegend=False)\n", + "fig.write_image(\"pn_ps_expression.png\", scale=2)\n", + "fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "756bf90a", + "metadata": { + "id": "756bf90a" + }, + "outputs": [], + "source": [ + "fig = px.box(gd_df.dropna(),\n", + " x='expression_percentile',\n", + " y='theta',\n", + " color='sp',\n", + " labels={'theta': 'Wattersons Theta','expression_percentile':'Expression rate (percentile)'},\n", + " template='simple_white',\n", + " height=400,\n", + " width=700,\n", + " title='Diversity x Expression')\n", + "\n", + "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", + "fig.write_image(\"theta_expression.png\", scale=2)\n", + "fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afce53fc", + "metadata": { + "id": "afce53fc" + }, + "outputs": [], + "source": [ + "fig = px.box(gd_df.dropna(),\n", + " x='expression_percentile',\n", + " y='pi',\n", + " color='sp',\n", + " labels={'pi': 'Nucleotide diversity','expression_percentile':'Expression rate (percentile)'},\n", + " template='simple_white',\n", + " height=400,\n", + " width=700,\n", + " title='Diversity x Expression')\n", + "fig.update_layout(showlegend=False)\n", + "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", + "fig.write_image(\"pi_expression.png\", scale=2)\n", + "fig" + ] + }, + { + "cell_type": "markdown", + "id": "ee7303c7", + "metadata": { + "id": "ee7303c7" + }, + "source": [ + "Lets calculate for each gene the total CDS length" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce3a5c0d", + "metadata": { + "id": "ce3a5c0d" + }, + "outputs": [], + "source": [ + "ag3 = malariagen_data.Ag3()\n", + "gff = ag3.genome_features()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f818a9c3", + "metadata": { + "id": "f818a9c3" + }, + "outputs": [], + "source": [ + "# from tqdm.notebook import tqdm\n", + "\n", + "# cds_lengths = []\n", + "# gene_ids = []\n", + "# for gene_id in tqdm(gd_df.dropna().GeneID.unique()):\n", + "\n", + "# df = gff.query(f\"Parent == '{gene_id}-RA' and type == 'CDS'\")\n", + "# if df.empty:\n", + "# df = gff.query(f\"Parent == '{gene_id}-RB' and type == 'CDS'\")\n", + "\n", + "# df = df.assign(exon_size=lambda x: np.abs(x.end-x.start))\n", + "# cds_length = df.exon_size.sum()\n", + "# cds_lengths.append(cds_length)\n", + "# gene_ids.append(gene_id)\n", + "\n", + "# np.save(\"cds_lengths.npy\", cds_lengths)\n", + "# np.save(\"gene_ids.npy\", gene_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad991ad0", + "metadata": { + "id": "ad991ad0" + }, + "outputs": [], + "source": [ + "cds_df = pd.DataFrame({'GeneID':np.load(\"gene_ids.npy\"), 'cds_length':np.load(\"cds_lengths.npy\")})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09259e98", + "metadata": { + "id": "09259e98" + }, + "outputs": [], + "source": [ + "gd_df = gd_df.dropna().merge(cds_df)\n", + "gd_df = gd_df.assign(cds_ratio=lambda x:1000/x.cds_length)\n", + "gd_df = gd_df.assign(non_synon_count_cds_kb=lambda x:x.pn*x.cds_ratio,\n", + " synon_count_cds_kb=lambda x:x.ps*x.cds_ratio)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7926aff", + "metadata": { + "id": "e7926aff" + }, + "outputs": [], + "source": [ + "#np.save(\"gd_df.npy\", gd_df)\n", + "\n", + "fig = px.box(gd_df,\n", + " x='expression_percentile',\n", + " y='non_synon_count_cds_kb',\n", + " color='sp',\n", + " labels={'non_synon_count_cds_kb': 'count per CDS kb',\n", + " 'expression_percentile':'Expression rate (percentile)'},\n", + " template='simple_white',\n", + " width=700,\n", + " height=400,\n", + " title='Nonsynonymous')\n", + "fig.update_layout(showlegend=False)\n", + "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", + "fig.write_image(\"non_synon_expression.png\", scale=2)\n", + "fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "813db8b4", + "metadata": { + "id": "813db8b4" + }, + "outputs": [], + "source": [ + "fig = px.box(gd_df,\n", + " x='expression_percentile',\n", + " y='synon_count_cds_kb',\n", + " color='sp',\n", + " labels={'synon_count_cds_kb': 'count per CDS kb',\n", + " 'expression_percentile':'Expression rate (percentile)'},\n", + " template='simple_white',\n", + " width=700,\n", + " height=400,\n", + " title='Synonymous')\n", + "fig.update_layout(showlegend=False)\n", + "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", + "fig.write_image(\"synon_expression.png\", scale=2)\n", + "fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f22f067d", + "metadata": { + "id": "f22f067d" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "colab": { + "provenance": [] + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "a7df9188", - "metadata": {}, - "outputs": [], - "source": [ - "ag3 = malariagen_data.Ag3()" - ] - }, - { - "cell_type": "markdown", - "id": "663aa613", - "metadata": {}, - "source": [ - "## Gene expression x genetic diversity" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8e741c3c", - "metadata": {}, - "outputs": [], - "source": [ - "gd_df = pd.read_csv(f\"../../../results/genetic_diversity/pi_pn_ps.tsv\", sep=\"\\t\").query(\"gene_id != 'gene_id'\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c1faa0f6", - "metadata": {}, - "outputs": [], - "source": [ - "for col in gd_df.columns[1:5]:\n", - " gd_df[col] = gd_df[col].astype(float)\n", - "gd_df = gd_df.assign(pn_ps_ratio=lambda x:x.pn/x.ps)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2915da56", - "metadata": {}, - "outputs": [], - "source": [ - "counts_df = xpress.data(data_type=\"log2counts\", analysis='gamb_colu_arab_fun')\n", - "metadata = xpress.sample_metadata(analysis='gamb_colu_arab_fun')" - ] - }, - { - "cell_type": "markdown", - "id": "ae99d81c", - "metadata": {}, - "source": [ - "Lets first look at gene expression correlations between gambiae, coluzzii, and arabiensis. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7a19fb6", - "metadata": {}, - "outputs": [], - "source": [ - "species = metadata.species.unique()\n", - "\n", - "sp_counts = []\n", - "for sp in species:\n", - " ids = metadata.query(\"species == @sp\").sampleID\n", - " sp_counts.append(counts_df.loc[:, ids].apply(np.median, axis=1).to_frame().rename(columns={0:sp}))\n", - "\n", - "df = pd.concat(sp_counts, axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4312a9a9", - "metadata": {}, - "outputs": [], - "source": [ - "import seaborn as sns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "04a60ee8", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "import itertools\n", - "for x, y in itertools.combinations(species, 2):\n", - "\n", - " fig = px.scatter(df, \n", - " x=x,\n", - " y=y, \n", - " opacity=0.3, \n", - " template='simple_white', \n", - " width=425, \n", - " height=400,\n", - " labels={x:f\"An. {x}\",\n", - " y:f\"An. {y}\"})#,\n", - " # title=f\"An. {x} v An. {y} log2 counts\")\n", - " fig.update_layout(font=dict(size=16), xaxis=dict(mirror=True), yaxis=dict(mirror=True)) \n", - " fig.update_traces(marker=dict(size=6,\n", - " line=dict(width=1,\n", - " color='DarkSlateGrey')),\n", - " selector=dict(mode='markers'))\n", - " fig.write_image(f\"../../../results/correlations/corr_{x}_{y}.png\")\n", - " fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed44f0d8", - "metadata": {}, - "outputs": [], - "source": [ - "counts_df = xpress.data(data_type=\"log2counts\", analysis='gamb_colu_arab')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8357746f", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "median_counts = counts_df.apply(np.median, axis=1)\n", - "median_counts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d0ab999", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "median_counts = median_counts.to_frame().rename(columns={0:'median_log2counts'})\n", - "\n", - "#.assign(expression_percentile=bins_with_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51751298", - "metadata": {}, - "outputs": [], - "source": [ - "gd_df = gd_df.rename(columns={'gene_id':'GeneID'}).merge(median_counts.reset_index())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "70e0f2e1", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the bin edges as percentiles\n", - "bin_edges = [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 100]\n", - "# Use pd.cut to create the bins\n", - "bins = pd.cut(gd_df.median_log2counts, bins=20, labels=False)\n", - "# Since you want bins in the format \"0-5%, 5-10%, ...\", you can create labels accordingly\n", - "labels = [f\"{bin_edges[i]}-{bin_edges[i+1]}%\" for i in range(len(bin_edges) - 1)]\n", - "# Add labels to the bins\n", - "bins_with_labels = [labels[i] for i in bins]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e07135c9", - "metadata": {}, - "outputs": [], - "source": [ - "gd_df = gd_df.assign(expression_percentile=bins_with_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75aff266", - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "def natural_sort(l): \n", - " convert = lambda text: int(text) if text.isdigit() else text.lower()\n", - " alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]\n", - " return sorted(l, key=alphanum_key)\n", - "\n", - "labels_order = natural_sort(np.unique(bins_with_labels))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ffd6b8be", - "metadata": {}, - "outputs": [], - "source": [ - "# Define the custom sorting order\n", - "custom_order = ['coluzzii', 'gambiae', 'arabiensis']\n", - "\n", - "# Convert the 'species' column to a categorical data type with the custom order\n", - "gd_df['sp'] = pd.Categorical(gd_df['sp'], categories=custom_order, ordered=True)\n", - "\n", - "# Sort the DataFrame based on the 'species' column\n", - "gd_df = gd_df.sort_values('sp')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1588cb1e", - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.box(gd_df, \n", - " x='expression_percentile', \n", - " y='pn_ps_ratio', \n", - " color='sp',\n", - " labels={'pn_ps_ratio': 'pN/pS','expression_percentile':'Expression rate (percentile)'},\n", - " template='simple_white', \n", - " height=400,\n", - " width=700,\n", - " # points='suspectedoutliers',\n", - " title='Purifying Selection')\n", - "\n", - "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", - "fig.update_yaxes(range=[-0.1, 10])\n", - "fig.update_layout(showlegend=False)\n", - "fig.write_image(\"../../../results/genetic_diversity/pn_ps_expression.png\")\n", - "fig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "756bf90a", - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.box(gd_df.dropna(), \n", - " x='expression_percentile', \n", - " y='theta', \n", - " color='sp',\n", - " labels={'theta': 'Wattersons Theta','expression_percentile':'Expression rate (percentile)'},\n", - " template='simple_white', \n", - " height=400,\n", - " width=700,\n", - " title='Diversity x Expression')\n", - "\n", - "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", - "fig.write_image(\"../../../results/genetic_diversity/theta_expression.png\")\n", - "fig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "afce53fc", - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.box(gd_df.dropna(), \n", - " x='expression_percentile', \n", - " y='pi', \n", - " color='sp',\n", - " labels={'pi': 'Nucleotide diversity','expression_percentile':'Expression rate (percentile)'},\n", - " template='simple_white', \n", - " height=400,\n", - " width=700,\n", - " title='Diversity x Expression')\n", - "fig.update_layout(showlegend=False)\n", - "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", - "fig.write_image(\"../../../results/genetic_diversity/pi_expression.png\")\n", - "fig" - ] - }, - { - "cell_type": "markdown", - "id": "ee7303c7", - "metadata": {}, - "source": [ - "Lets calculate for each gene the total CDS length" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce3a5c0d", - "metadata": {}, - "outputs": [], - "source": [ - "ag3 = malariagen_data.Ag3()\n", - "gff = ag3.genome_features()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f818a9c3", - "metadata": {}, - "outputs": [], - "source": [ - "# from tqdm.notebook import tqdm \n", - "\n", - "# cds_lengths = []\n", - "# gene_ids = []\n", - "# for gene_id in tqdm(gd_df.dropna().GeneID.unique()):\n", - " \n", - "# df = gff.query(f\"Parent == '{gene_id}-RA' and type == 'CDS'\")\n", - "# if df.empty:\n", - "# df = gff.query(f\"Parent == '{gene_id}-RB' and type == 'CDS'\")\n", - " \n", - "# df = df.assign(exon_size=lambda x: np.abs(x.end-x.start))\n", - "# cds_length = df.exon_size.sum()\n", - "# cds_lengths.append(cds_length)\n", - "# gene_ids.append(gene_id)\n", - " \n", - "# np.save(\"cds_lengths.npy\", cds_lengths)\n", - "# np.save(\"gene_ids.npy\", gene_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ad991ad0", - "metadata": {}, - "outputs": [], - "source": [ - "cds_df = pd.DataFrame({'GeneID':np.load(\"gene_ids.npy\"), 'cds_length':np.load(\"cds_lengths.npy\")})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "09259e98", - "metadata": {}, - "outputs": [], - "source": [ - "gd_df = gd_df.dropna().merge(cds_df)\n", - "gd_df = gd_df.assign(cds_ratio=lambda x:1000/x.cds_length)\n", - "gd_df = gd_df.assign(non_synon_count_cds_kb=lambda x:x.pn*x.cds_ratio,\n", - " synon_count_cds_kb=lambda x:x.ps*x.cds_ratio)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7926aff", - "metadata": {}, - "outputs": [], - "source": [ - "#np.save(\"gd_df.npy\", gd_df)\n", - "\n", - "fig = px.box(gd_df, \n", - " x='expression_percentile', \n", - " y='non_synon_count_cds_kb',\n", - " color='sp',\n", - " labels={'non_synon_count_cds_kb': 'count per CDS kb',\n", - " 'expression_percentile':'Expression rate (percentile)'},\n", - " template='simple_white', \n", - " width=700,\n", - " height=400,\n", - " title='Nonsynonymous')\n", - "fig.update_layout(showlegend=False)\n", - "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", - "fig.write_image(\"../../../results/genetic_diversity/non_synon_expression.png\")\n", - "fig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "813db8b4", - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.box(gd_df, \n", - " x='expression_percentile', \n", - " y='synon_count_cds_kb', \n", - " color='sp',\n", - " labels={'synon_count_cds_kb': 'count per CDS kb',\n", - " 'expression_percentile':'Expression rate (percentile)'},\n", - " template='simple_white', \n", - " width=700,\n", - " height=400,\n", - " title='Synonymous')\n", - "fig.update_layout(showlegend=False)\n", - "fig.update_xaxes(categoryorder='array', categoryarray=labels_order)\n", - "fig.write_image(\"../../../results/genetic_diversity/synon_expression.png\")\n", - "fig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f22f067d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file