diff --git a/docs/ingest/gene-ensembl-plant-release-57.ipynb b/docs/ingest/gene-ensembl-plant-release-57.ipynb
new file mode 100644
index 0000000..46a2f1d
--- /dev/null
+++ b/docs/ingest/gene-ensembl-plant-release-57.ipynb
@@ -0,0 +1,631 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Plant `Gene`: ensembl, release-57\n",
+ "\n",
+ "- https://www.ensembl.org/info/data/mysql.html\n",
+ "- https://www.ensembl.org/info/docs/api/core/core_schema.html\n",
+ "\n",
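+ "Prerequisite: install `mysqlclient` (https://pypi.org/project/mysqlclient/); it provides the MySQL driver behind the `mysql+mysqldb://` connection used to query the public Ensembl database."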
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from bionty_base.entities._gene import EnsemblGene"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading arabidopsis thaliana...\n",
+ "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/arabidopsis_thaliana_core_57_110_11\n",
+ "\u001b[94m•\u001b[0m fetching records from the core DB...\n",
+ "\u001b[94m•\u001b[0m fetching records from the external DBs...\n",
+ "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 438\n",
+ "\u001b[93m!\u001b[0m no ensembl_gene_id found, writing to table_id column.\n",
+ "\u001b[92m✓\u001b[0m downloaded Gene table containing 33127 entries.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " AT1G01010 | \n",
+ " NAC001 | \n",
+ " 839580 | \n",
+ " protein_coding | \n",
+ " NAC domain containing protein 1 | \n",
+ " T25K16.1|ANAC001|T25K16_1|NAC domain containin... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " AT1G01020 | \n",
+ " ARV1 | \n",
+ " 839569 | \n",
+ " protein_coding | \n",
+ " ARV1 family protein | \n",
+ " T25K16.2|T25K16_2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " AT1G01030 | \n",
+ " NGA3 | \n",
+ " 839321 | \n",
+ " protein_coding | \n",
+ " AP2/B3-like transcriptional factor family prot... | \n",
+ " T25K16.3|NGATHA3|T25K16_3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " AT1G01040 | \n",
+ " DCL1 | \n",
+ " 839574 | \n",
+ " protein_coding | \n",
+ " dicer-like 1 | \n",
+ " T25K16_4|SUS1|SHORT INTEGUMENTS 1|T25K16.4|CAF... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " AT1G01046 | \n",
+ " ath-MIR838 | \n",
+ " 6240410 | \n",
+ " miRNA | \n",
+ " ath-MIR838 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id symbol ncbi_gene_id biotype \\\n",
+ "0 AT1G01010 NAC001 839580 protein_coding \n",
+ "1 AT1G01020 ARV1 839569 protein_coding \n",
+ "2 AT1G01030 NGA3 839321 protein_coding \n",
+ "3 AT1G01040 DCL1 839574 protein_coding \n",
+ "4 AT1G01046 ath-MIR838 6240410 miRNA \n",
+ "\n",
+ " description \\\n",
+ "0 NAC domain containing protein 1 \n",
+ "1 ARV1 family protein \n",
+ "2 AP2/B3-like transcriptional factor family prot... \n",
+ "3 dicer-like 1 \n",
+ "4 ath-MIR838 \n",
+ "\n",
+ " synonyms \n",
+ "0 T25K16.1|ANAC001|T25K16_1|NAC domain containin... \n",
+ "1 T25K16.2|T25K16_2 \n",
+ "2 T25K16.3|NGATHA3|T25K16_3 \n",
+ "3 T25K16_4|SUS1|SHORT INTEGUMENTS 1|T25K16.4|CAF... \n",
+ "4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading medicago truncatula...\n",
+ "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/medicago_truncatula_core_57_110_2\n",
+ "\u001b[94m•\u001b[0m fetching records from the core DB...\n",
+ "\u001b[94m•\u001b[0m fetching records from the external DBs...\n",
+ "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 1589\n",
+ "\u001b[92m✓\u001b[0m downloaded Gene table containing 1328 entries.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ensembl_gene_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSRNA049434913 | \n",
+ " tRNA-Asn | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Asn for anticodon GUU | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSRNA049434956 | \n",
+ " tRNA-Glu | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Glu for anticodon UUC | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSRNA049434965 | \n",
+ " tRNA-Met | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Met for anticodon CAU | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSRNA049435003 | \n",
+ " tRNA-Leu | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Leu for anticodon AAG | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSRNA049435027 | \n",
+ " tRNA-Gly | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Gly for anticodon UCC | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ensembl_gene_id symbol ncbi_gene_id biotype description \\\n",
+ "0 ENSRNA049434913 tRNA-Asn NaN tRNA tRNA-Asn for anticodon GUU \n",
+ "1 ENSRNA049434956 tRNA-Glu NaN tRNA tRNA-Glu for anticodon UUC \n",
+ "2 ENSRNA049434965 tRNA-Met NaN tRNA tRNA-Met for anticodon CAU \n",
+ "3 ENSRNA049435003 tRNA-Leu NaN tRNA tRNA-Leu for anticodon AAG \n",
+ "4 ENSRNA049435027 tRNA-Gly NaN tRNA tRNA-Gly for anticodon UCC \n",
+ "\n",
+ " synonyms \n",
+ "0 \n",
+ "1 \n",
+ "2 \n",
+ "3 \n",
+ "4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading solanum lycopersicum...\n",
+ "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/solanum_lycopersicum_core_57_110_3\n",
+ "\u001b[94m•\u001b[0m fetching records from the core DB...\n",
+ "\u001b[94m•\u001b[0m fetching records from the external DBs...\n",
+ "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 243\n",
+ "\u001b[92m✓\u001b[0m downloaded Gene table containing 1167 entries.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ensembl_gene_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSRNA049444660 | \n",
+ " tRNA-Val | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Val for anticodon UAC | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSRNA049446579 | \n",
+ " tRNA-Ile | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Ile for anticodon AAU | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSRNA050028289 | \n",
+ " tRNA-Ser | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Ser for anticodon AGA | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSRNA050028290 | \n",
+ " tRNA-Lys | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Lys for anticodon CUU | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSRNA050028291 | \n",
+ " tRNA-Met | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Met for anticodon CAU | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ensembl_gene_id symbol ncbi_gene_id biotype description \\\n",
+ "0 ENSRNA049444660 tRNA-Val NaN tRNA tRNA-Val for anticodon UAC \n",
+ "1 ENSRNA049446579 tRNA-Ile NaN tRNA tRNA-Ile for anticodon AAU \n",
+ "2 ENSRNA050028289 tRNA-Ser NaN tRNA tRNA-Ser for anticodon AGA \n",
+ "3 ENSRNA050028290 tRNA-Lys NaN tRNA tRNA-Lys for anticodon CUU \n",
+ "4 ENSRNA050028291 tRNA-Met NaN tRNA tRNA-Met for anticodon CAU \n",
+ "\n",
+ " synonyms \n",
+ "0 \n",
+ "1 \n",
+ "2 \n",
+ "3 \n",
+ "4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading zea mays...\n",
+ "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/zea_mays_core_57_110_8\n",
+ "\u001b[94m•\u001b[0m fetching records from the core DB...\n",
+ "\u001b[94m•\u001b[0m fetching records from the external DBs...\n",
+ "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 641\n",
+ "\u001b[93m!\u001b[0m no ensembl_gene_id found, writing to table_id column.\n",
+ "\u001b[92m✓\u001b[0m downloaded Gene table containing 44735 entries.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " stable_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Zm00001eb000010 | \n",
+ " None | \n",
+ " NaN | \n",
+ " protein_coding | \n",
+ " Zm00001e000001 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Zm00001eb000020 | \n",
+ " None | \n",
+ " NaN | \n",
+ " protein_coding | \n",
+ " Zm00001e000002 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Zm00001eb000030 | \n",
+ " None | \n",
+ " NaN | \n",
+ " misc_non_coding | \n",
+ " Zm00001e000003 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Zm00001eb000040 | \n",
+ " None | \n",
+ " NaN | \n",
+ " misc_non_coding | \n",
+ " Zm00001e000004 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Zm00001eb000050 | \n",
+ " None | \n",
+ " NaN | \n",
+ " protein_coding | \n",
+ " Zm00001e100003 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " stable_id symbol ncbi_gene_id biotype description \\\n",
+ "0 Zm00001eb000010 None NaN protein_coding Zm00001e000001 \n",
+ "1 Zm00001eb000020 None NaN protein_coding Zm00001e000002 \n",
+ "2 Zm00001eb000030 None NaN misc_non_coding Zm00001e000003 \n",
+ "3 Zm00001eb000040 None NaN misc_non_coding Zm00001e000004 \n",
+ "4 Zm00001eb000050 None NaN protein_coding Zm00001e100003 \n",
+ "\n",
+ " synonyms \n",
+ "0 \n",
+ "1 \n",
+ "2 \n",
+ "3 \n",
+ "4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading oryza sativa japonica group...\n",
+ "URL: mysql+mysqldb://anonymous:@ensembldb.ensembl.org:4157/oryza_sativa_core_57_110_7\n",
+ "\u001b[94m•\u001b[0m fetching records from the core DB...\n",
+ "\u001b[94m•\u001b[0m fetching records from the external DBs...\n",
+ "\u001b[93m!\u001b[0m duplicated #rows ensembl_gene_id with ncbi_gene_id: 780\n",
+ "\u001b[92m✓\u001b[0m downloaded Gene table containing 949 entries.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ensembl_gene_id | \n",
+ " symbol | \n",
+ " ncbi_gene_id | \n",
+ " biotype | \n",
+ " description | \n",
+ " synonyms | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ENSRNA049440515 | \n",
+ " tRNA-Asn | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Asn for anticodon GUU | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ENSRNA049440716 | \n",
+ " tRNA-Leu | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Leu for anticodon AAG | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ENSRNA049441102 | \n",
+ " tRNA-Gln | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Gln for anticodon UUG | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " ENSRNA049441259 | \n",
+ " tRNA-Ala | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Ala for anticodon AGC | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " ENSRNA049441339 | \n",
+ " tRNA-Leu | \n",
+ " NaN | \n",
+ " tRNA | \n",
+ " tRNA-Leu for anticodon AAG | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ensembl_gene_id symbol ncbi_gene_id biotype description \\\n",
+ "0 ENSRNA049440515 tRNA-Asn NaN tRNA tRNA-Asn for anticodon GUU \n",
+ "1 ENSRNA049440716 tRNA-Leu NaN tRNA tRNA-Leu for anticodon AAG \n",
+ "2 ENSRNA049441102 tRNA-Gln NaN tRNA tRNA-Gln for anticodon UUG \n",
+ "3 ENSRNA049441259 tRNA-Ala NaN tRNA tRNA-Ala for anticodon AGC \n",
+ "4 ENSRNA049441339 tRNA-Leu NaN tRNA tRNA-Leu for anticodon AAG \n",
+ "\n",
+ " synonyms \n",
+ "0 \n",
+ "1 \n",
+ "2 \n",
+ "3 \n",
+ "4 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Plant organisms whose Ensembl gene tables are downloaded in this run.\n",
+ "organisms = [\n",
+ "    \"arabidopsis thaliana\",\n",
+ "    \"medicago truncatula\",\n",
+ "    \"solanum lycopersicum\",\n",
+ "    \"zea mays\",\n",
+ "    \"oryza sativa japonica group\",\n",
+ "]\n",
+ "version = \"release-57\"\n",
+ "\n",
+ "for organism in organisms:\n",
+ "    print(f\"Downloading {organism}...\")\n",
+ "    ensembl_gene = EnsemblGene(organism=organism, version=version, kingdom=\"plants\")\n",
+ "    print(\"URL:\", ensembl_gene._url)\n",
+ "    df = ensembl_gene.download_df()\n",
+ "    # Remove bracketed annotations from the descriptions, then strip the\n",
+ "    # whitespace the removal leaves behind so no trailing blanks are stored.\n",
+ "    # https://github.com/laminlabs/bionty/issues/533\n",
+ "    df[\"description\"] = (\n",
+ "        df[\"description\"].str.replace(r\"\\[.*?\\]\", \"\", regex=True).str.strip()\n",
+ "    )\n",
+ "    # One parquet file per organism, written relative to the working directory.\n",
+ "    df.to_parquet(f\"df_{organism}__ensembl__{version}__Gene.parquet\")\n",
+ "    display(df.head())"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "scprint-Vjske0Mr-py3.10",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}