diff --git a/docs/ingest/gene-ensembl-plant-release-57.ipynb b/docs/ingest/gene-ensembl-plant-release-57.ipynb new file mode 100644 index 0000000..46a2f1d --- /dev/null +++ b/docs/ingest/gene-ensembl-plant-release-57.ipynb @@ -0,0 +1,631 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plant `Gene`: ensembl, release-57\n", + "\n", + "- https://www.ensembl.org/info/data/mysql.html\n", + "- https://www.ensembl.org/info/docs/api/core/core_schema.html\n", + "\n", + "Install mysqlclient: https://pypi.org/project/mysqlclient/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from bionty_base.entities._gene import EnsemblGene" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading arabidopsis thaliana...\n", + "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/arabidopsis_thaliana_core_57_110_11\n", + "\u001b[94m•\u001b[0m fetching records from the core DB...\n", + "\u001b[94m•\u001b[0m fetching records from the external DBs...\n", + "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 438\n", + "\u001b[93m!\u001b[0m no ensembl_gene_id found, writing to table_id column.\n", + "\u001b[92m✓\u001b[0m downloaded Gene table containing 33127 entries.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0AT1G01010NAC001839580protein_codingNAC domain containing protein 1T25K16.1|ANAC001|T25K16_1|NAC domain containin...
1AT1G01020ARV1839569protein_codingARV1 family proteinT25K16.2|T25K16_2
2AT1G01030NGA3839321protein_codingAP2/B3-like transcriptional factor family prot...T25K16.3|NGATHA3|T25K16_3
3AT1G01040DCL1839574protein_codingdicer-like 1T25K16_4|SUS1|SHORT INTEGUMENTS 1|T25K16.4|CAF...
4AT1G01046ath-MIR8386240410miRNAath-MIR838
\n", + "
" + ], + "text/plain": [ + " stable_id symbol ncbi_gene_id biotype \\\n", + "0 AT1G01010 NAC001 839580 protein_coding \n", + "1 AT1G01020 ARV1 839569 protein_coding \n", + "2 AT1G01030 NGA3 839321 protein_coding \n", + "3 AT1G01040 DCL1 839574 protein_coding \n", + "4 AT1G01046 ath-MIR838 6240410 miRNA \n", + "\n", + " description \\\n", + "0 NAC domain containing protein 1 \n", + "1 ARV1 family protein \n", + "2 AP2/B3-like transcriptional factor family prot... \n", + "3 dicer-like 1 \n", + "4 ath-MIR838 \n", + "\n", + " synonyms \n", + "0 T25K16.1|ANAC001|T25K16_1|NAC domain containin... \n", + "1 T25K16.2|T25K16_2 \n", + "2 T25K16.3|NGATHA3|T25K16_3 \n", + "3 T25K16_4|SUS1|SHORT INTEGUMENTS 1|T25K16.4|CAF... \n", + "4 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading medicago truncatula...\n", + "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/medicago_truncatula_core_57_110_2\n", + "\u001b[94m•\u001b[0m fetching records from the core DB...\n", + "\u001b[94m•\u001b[0m fetching records from the external DBs...\n", + "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 1589\n", + "\u001b[92m✓\u001b[0m downloaded Gene table containing 1328 entries.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ensembl_gene_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0ENSRNA049434913tRNA-AsnNaNtRNAtRNA-Asn for anticodon GUU
1ENSRNA049434956tRNA-GluNaNtRNAtRNA-Glu for anticodon UUC
2ENSRNA049434965tRNA-MetNaNtRNAtRNA-Met for anticodon CAU
3ENSRNA049435003tRNA-LeuNaNtRNAtRNA-Leu for anticodon AAG
4ENSRNA049435027tRNA-GlyNaNtRNAtRNA-Gly for anticodon UCC
\n", + "
" + ], + "text/plain": [ + " ensembl_gene_id symbol ncbi_gene_id biotype description \\\n", + "0 ENSRNA049434913 tRNA-Asn NaN tRNA tRNA-Asn for anticodon GUU \n", + "1 ENSRNA049434956 tRNA-Glu NaN tRNA tRNA-Glu for anticodon UUC \n", + "2 ENSRNA049434965 tRNA-Met NaN tRNA tRNA-Met for anticodon CAU \n", + "3 ENSRNA049435003 tRNA-Leu NaN tRNA tRNA-Leu for anticodon AAG \n", + "4 ENSRNA049435027 tRNA-Gly NaN tRNA tRNA-Gly for anticodon UCC \n", + "\n", + " synonyms \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading solanum lycopersicum...\n", + "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/solanum_lycopersicum_core_57_110_3\n", + "\u001b[94m•\u001b[0m fetching records from the core DB...\n", + "\u001b[94m•\u001b[0m fetching records from the external DBs...\n", + "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 243\n", + "\u001b[92m✓\u001b[0m downloaded Gene table containing 1167 entries.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ensembl_gene_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0ENSRNA049444660tRNA-ValNaNtRNAtRNA-Val for anticodon UAC
1ENSRNA049446579tRNA-IleNaNtRNAtRNA-Ile for anticodon AAU
2ENSRNA050028289tRNA-SerNaNtRNAtRNA-Ser for anticodon AGA
3ENSRNA050028290tRNA-LysNaNtRNAtRNA-Lys for anticodon CUU
4ENSRNA050028291tRNA-MetNaNtRNAtRNA-Met for anticodon CAU
\n", + "
" + ], + "text/plain": [ + " ensembl_gene_id symbol ncbi_gene_id biotype description \\\n", + "0 ENSRNA049444660 tRNA-Val NaN tRNA tRNA-Val for anticodon UAC \n", + "1 ENSRNA049446579 tRNA-Ile NaN tRNA tRNA-Ile for anticodon AAU \n", + "2 ENSRNA050028289 tRNA-Ser NaN tRNA tRNA-Ser for anticodon AGA \n", + "3 ENSRNA050028290 tRNA-Lys NaN tRNA tRNA-Lys for anticodon CUU \n", + "4 ENSRNA050028291 tRNA-Met NaN tRNA tRNA-Met for anticodon CAU \n", + "\n", + " synonyms \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading zea mays...\n", + "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/zea_mays_core_57_110_8\n", + "\u001b[94m•\u001b[0m fetching records from the core DB...\n", + "\u001b[94m•\u001b[0m fetching records from the external DBs...\n", + "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 641\n", + "\u001b[93m!\u001b[0m no ensembl_gene_id found, writing to table_id column.\n", + "\u001b[92m✓\u001b[0m downloaded Gene table containing 44735 entries.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
stable_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0Zm00001eb000010NoneNaNprotein_codingZm00001e000001
1Zm00001eb000020NoneNaNprotein_codingZm00001e000002
2Zm00001eb000030NoneNaNmisc_non_codingZm00001e000003
3Zm00001eb000040NoneNaNmisc_non_codingZm00001e000004
4Zm00001eb000050NoneNaNprotein_codingZm00001e100003
\n", + "
" + ], + "text/plain": [ + " stable_id symbol ncbi_gene_id biotype description \\\n", + "0 Zm00001eb000010 None NaN protein_coding Zm00001e000001 \n", + "1 Zm00001eb000020 None NaN protein_coding Zm00001e000002 \n", + "2 Zm00001eb000030 None NaN misc_non_coding Zm00001e000003 \n", + "3 Zm00001eb000040 None NaN misc_non_coding Zm00001e000004 \n", + "4 Zm00001eb000050 None NaN protein_coding Zm00001e100003 \n", + "\n", + " synonyms \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading oryza sativa japonica group...\n", + "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/oryza_sativa_core_57_110_7\n", + "\u001b[94m•\u001b[0m fetching records from the core DB...\n", + "\u001b[94m•\u001b[0m fetching records from the external DBs...\n", + "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 780\n", + "\u001b[92m✓\u001b[0m downloaded Gene table containing 949 entries.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ensembl_gene_idsymbolncbi_gene_idbiotypedescriptionsynonyms
0ENSRNA049440515tRNA-AsnNaNtRNAtRNA-Asn for anticodon GUU
1ENSRNA049440716tRNA-LeuNaNtRNAtRNA-Leu for anticodon AAG
2ENSRNA049441102tRNA-GlnNaNtRNAtRNA-Gln for anticodon UUG
3ENSRNA049441259tRNA-AlaNaNtRNAtRNA-Ala for anticodon AGC
4ENSRNA049441339tRNA-LeuNaNtRNAtRNA-Leu for anticodon AAG
\n", + "
" + ], + "text/plain": [ + " ensembl_gene_id symbol ncbi_gene_id biotype description \\\n", + "0 ENSRNA049440515 tRNA-Asn NaN tRNA tRNA-Asn for anticodon GUU \n", + "1 ENSRNA049440716 tRNA-Leu NaN tRNA tRNA-Leu for anticodon AAG \n", + "2 ENSRNA049441102 tRNA-Gln NaN tRNA tRNA-Gln for anticodon UUG \n", + "3 ENSRNA049441259 tRNA-Ala NaN tRNA tRNA-Ala for anticodon AGC \n", + "4 ENSRNA049441339 tRNA-Leu NaN tRNA tRNA-Leu for anticodon AAG \n", + "\n", + " synonyms \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "organisms = [\n", + " \"arabidopsis thaliana\",\n", + " \"medicago truncatula\",\n", + " \"solanum lycopersicum\",\n", + " \"zea mays\",\n", + " \"oryza sativa japonica group\",\n", + "]\n", + "version = \"release-57\"\n", + "for organism in organisms:\n", + " print(f\"Downloading {organism}...\")\n", + " ensembl_gene = EnsemblGene(organism=organism, version=version, kingdom=\"plants\")\n", + " print(\"URL:\", ensembl_gene._url)\n", + " df = ensembl_gene.download_df()\n", + " # https://github.com/laminlabs/bionty/issues/533\n", + " df[\"description\"] = df[\"description\"].str.replace(r\"\\[.*?\\]\", \"\", regex=True)\n", + " df.to_parquet(f\"df_{organism}__ensembl__{version}__Gene.parquet\")\n", + " display(df.head())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "scprint-Vjske0Mr-py3.10", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}