From 261732d06c62f732316200afd5b89be54d79b0d1 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 7 Dec 2023 17:30:13 +0100 Subject: [PATCH] Update GO standardization tutorial --- ... Annotation Database for Integration.ipynb | 518 ++++++------------ src/bioregistry/pandas.py | 19 +- 2 files changed, 189 insertions(+), 348 deletions(-) diff --git a/notebooks/Preparing the GO Annotation Database for Integration.ipynb b/notebooks/Preparing the GO Annotation Database for Integration.ipynb index ef371fef5..7b0cbf6ea 100644 --- a/notebooks/Preparing the GO Annotation Database for Integration.ipynb +++ b/notebooks/Preparing the GO Annotation Database for Integration.ipynb @@ -1,8 +1,22 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "d1c3fc4b-eaed-4fc0-b92d-2e2f4862d865", + "metadata": {}, + "source": [ + "# Preparing the Gene Ontology Annotations Database for Integration\n", + "\n", + "> A Gene Ontology (GO) annotation is a statement about the function of a particular gene. GO annotations are created by associating a gene or gene product with a GO term. Together, these statements comprise a “snapshot” of current biological knowledge. Hence, GO annotations capture statements about how a gene functions at the molecular level, where in the cell it functions, and what biological processes (pathways, programs) it helps to carry out. (quoted from https://geneontology.org/docs/go-annotations)\n", + "\n", + "This notebook downloads the Gene Ontology annotation database and walks through the steps of 1) validating the usages of prefixes, local unique identifiers, and CURIEs then 2) standardizing them. Many datasets require such standardization to make them readily interoperable with other datasets.\n", + "\n", + "In the first step, we load the most recent GO annotations database from http://geneontology.org/gene-associations/goa_human.gaf.gz. The format of this file is explained at https://geneontology.org/docs/go-annotation-file-gaf-format-2.2/, but we only look at a subset of columns." + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "4d5bae86", "metadata": {}, "outputs": [ @@ -27,12 +41,11 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 4\n", - " 5\n", - " 7\n", - " 12\n", + " subject_prefix\n", + " subject_identifier\n", + " object_curie\n", + " reference_curie\n", + " taxon_curie\n", " \n", " \n", " \n", @@ -42,7 +55,6 @@ " A0A024RBG1\n", " GO:0003723\n", " GO_REF:0000043\n", - " UniProtKB-KW:KW-0694\n", " taxon:9606\n", " \n", " \n", @@ -51,7 +63,6 @@ " A0A024RBG1\n", " GO:0046872\n", " GO_REF:0000043\n", - " UniProtKB-KW:KW-0479\n", " taxon:9606\n", " \n", " \n", @@ -60,7 +71,6 @@ " A0A024RBG1\n", " GO:0005829\n", " GO_REF:0000052\n", - " NaN\n", " taxon:9606\n", " \n", " \n", @@ -69,7 +79,6 @@ " A0A075B6H7\n", " GO:0002250\n", " GO_REF:0000043\n", - " UniProtKB-KW:KW-1064\n", " taxon:9606\n", " \n", " \n", @@ -78,7 +87,6 @@ " A0A075B6H7\n", " GO:0005886\n", " GO_REF:0000044\n", - " UniProtKB-SubCell:SL-0039\n", " taxon:9606\n", " \n", " \n", @@ -86,22 +94,15 @@ "" ], "text/plain": [ - " 0 1 4 5 \\\n", - "0 UniProtKB A0A024RBG1 GO:0003723 GO_REF:0000043 \n", - "1 UniProtKB A0A024RBG1 GO:0046872 GO_REF:0000043 \n", - "2 UniProtKB A0A024RBG1 GO:0005829 GO_REF:0000052 \n", - "3 UniProtKB A0A075B6H7 GO:0002250 GO_REF:0000043 \n", - "4 UniProtKB A0A075B6H7 GO:0005886 GO_REF:0000044 \n", - "\n", - " 7 12 \n", - "0 UniProtKB-KW:KW-0694 taxon:9606 \n", - "1 UniProtKB-KW:KW-0479 taxon:9606 \n", - "2 NaN taxon:9606 \n", - "3 UniProtKB-KW:KW-1064 taxon:9606 \n", - "4 UniProtKB-SubCell:SL-0039 taxon:9606 " + " subject_prefix subject_identifier object_curie reference_curie taxon_curie\n", + "0 UniProtKB A0A024RBG1 GO:0003723 GO_REF:0000043 taxon:9606\n", + "1 UniProtKB A0A024RBG1 GO:0046872 GO_REF:0000043 taxon:9606\n", + "2 UniProtKB A0A024RBG1 GO:0005829 GO_REF:0000052 taxon:9606\n", + "3 UniProtKB A0A075B6H7 GO:0002250 GO_REF:0000043 taxon:9606\n", + "4 UniProtKB A0A075B6H7 GO:0005886 GO_REF:0000044 taxon:9606" ] }, - "execution_count": 1, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -111,56 +112,80 @@ "import pandas as pd\n", "\n", "# Focus on these columns when displaying the data\n", - "columns = [0, 1, 4, 5, 7, 12]\n", + "columns = [0, 1, 4, 5, 12]\n", + "names = [\n", + " \"subject_prefix\", \"subject_identifier\", \"object_curie\", \n", + " \"reference_curie\", \"taxon_curie\",\n", + "]\n", "\n", "df = pd.read_csv(\n", " \"http://geneontology.org/gene-associations/goa_human.gaf.gz\",\n", " sep=\"\\t\",\n", " comment=\"!\",\n", " header=None,\n", + " usecols=columns,\n", + " names=names,\n", " dtype=str,\n", ").head(100)\n", "\n", "\n", - "df[columns].head()" + "df.head()" ] }, { "cell_type": "markdown", - "id": "c06bb2b2", + "id": "ead3473a-7cc8-4fe9-93ab-68f078e4bf44", "metadata": {}, "source": [ - "## Prefixes" + "## Validation" ] }, { "cell_type": "code", - "execution_count": 2, - "id": "c6da0e76", + "execution_count": 7, + "id": "acec9e21-e4ed-4821-9d7c-fba5af854173", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "100 of 100 (100%) rows with the following prefixes need to be fixed: ['UniProtKB']\n", - "The following prefixes could be normalized using normalize_curies():\n", - "\n", - "| raw | standardized |\n", - "|-----------|----------------|\n", - "| UniProtKB | uniprot |\n" + "ename": "KeyError", + "evalue": "'subject_prefix'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.virtualenvs/indra/lib/python3.11/site-packages/pandas/core/indexes/base.py:3790\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3789\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3790\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3791\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'subject_prefix'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m idx \u001b[38;5;241m=\u001b[39m \u001b[43mbrpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_prefixes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msubject_prefix\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m brpd\u001b[38;5;241m.\u001b[39msummarize_prefix_validation(df, idx, column\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msubject_prefix\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/dev/bioregistry/src/bioregistry/pandas.py:154\u001b[0m, in \u001b[0;36mvalidate_prefixes\u001b[0;34m(df, column, target_column)\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Validate prefixes in a given column.\u001b[39;00m\n\u001b[1;32m 130\u001b[0m \n\u001b[1;32m 131\u001b[0m \u001b[38;5;124;03m:param df: A DataFrame\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;124;03m invalid_prefix_df = df[~idx]\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 153\u001b[0m column \u001b[38;5;241m=\u001b[39m _norm_column(df, column)\n\u001b[0;32m--> 154\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mmap(\u001b[38;5;28;01mlambda\u001b[39;00m x: bioregistry\u001b[38;5;241m.\u001b[39mnormalize_prefix(x) \u001b[38;5;241m==\u001b[39m x, na_action\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m target_column:\n\u001b[1;32m 156\u001b[0m df[target_column] \u001b[38;5;241m=\u001b[39m results\n", + "File \u001b[0;32m~/.virtualenvs/indra/lib/python3.11/site-packages/pandas/core/frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3895\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/.virtualenvs/indra/lib/python3.11/site-packages/pandas/core/indexes/base.py:3797\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3792\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3795\u001b[0m ):\n\u001b[1;32m 3796\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3798\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'subject_prefix'" ] } ], "source": [ - "idx = brpd.validate_prefixes(df, column=0)\n", + "idx = brpd.validate_prefixes(df, column=\"subject_prefix\")\n", "\n", - "brpd.summarize_prefix_validation(df, idx)" + "brpd.summarize_prefix_validation(df, idx, column=\"subject_prefix\")" + ] + }, + { + "cell_type": "markdown", + "id": "74cdd8e3-9dce-4824-b769-ab6117b73339", + "metadata": {}, + "source": [ + "## Standardize" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "06b5680a", "metadata": {}, "outputs": [ @@ -185,111 +210,141 @@ " \n", " \n", " \n", - " 0\n", - " 1\n", - " 4\n", - " 5\n", - " 7\n", - " 12\n", + " subject_curie\n", + " object_curie\n", + " reference_curie\n", + " taxon_curie\n", " \n", " \n", " \n", " \n", " 0\n", - " uniprot\n", - " A0A024RBG1\n", - " GO:0003723\n", - " GO_REF:0000043\n", - " UniProtKB-KW:KW-0694\n", - " taxon:9606\n", + " uniprot:A0A024RBG1\n", + " go:0003723\n", + " go.ref:0000043\n", + " ncbitaxon:9606\n", " \n", " \n", " 1\n", - " uniprot\n", - " A0A024RBG1\n", - " GO:0046872\n", - " GO_REF:0000043\n", - " UniProtKB-KW:KW-0479\n", - " taxon:9606\n", + " uniprot:A0A024RBG1\n", + " go:0046872\n", + " go.ref:0000043\n", + " ncbitaxon:9606\n", " \n", " \n", " 2\n", - " uniprot\n", - " A0A024RBG1\n", - " GO:0005829\n", - " GO_REF:0000052\n", - " NaN\n", - " taxon:9606\n", + " uniprot:A0A024RBG1\n", + " go:0005829\n", + " go.ref:0000052\n", + " ncbitaxon:9606\n", " \n", " \n", " 3\n", - " uniprot\n", - " A0A075B6H7\n", - " GO:0002250\n", - " GO_REF:0000043\n", - " UniProtKB-KW:KW-1064\n", - " taxon:9606\n", + " uniprot:A0A075B6H7\n", + " go:0002250\n", + " go.ref:0000043\n", + " ncbitaxon:9606\n", " \n", " \n", " 4\n", - " uniprot\n", - " A0A075B6H7\n", - " GO:0005886\n", - " GO_REF:0000044\n", - " UniProtKB-SubCell:SL-0039\n", - " taxon:9606\n", + " uniprot:A0A075B6H7\n", + " go:0005886\n", + " go.ref:0000044\n", + " ncbitaxon:9606\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0 1 4 5 7 \\\n", - "0 uniprot A0A024RBG1 GO:0003723 GO_REF:0000043 UniProtKB-KW:KW-0694 \n", - "1 uniprot A0A024RBG1 GO:0046872 GO_REF:0000043 UniProtKB-KW:KW-0479 \n", - "2 uniprot A0A024RBG1 GO:0005829 GO_REF:0000052 NaN \n", - "3 uniprot A0A075B6H7 GO:0002250 GO_REF:0000043 UniProtKB-KW:KW-1064 \n", - "4 uniprot A0A075B6H7 GO:0005886 GO_REF:0000044 UniProtKB-SubCell:SL-0039 \n", - "\n", - " 12 \n", - "0 taxon:9606 \n", - "1 taxon:9606 \n", - "2 taxon:9606 \n", - "3 taxon:9606 \n", - "4 taxon:9606 " + " subject_curie object_curie reference_curie taxon_curie\n", + "0 uniprot:A0A024RBG1 go:0003723 go.ref:0000043 ncbitaxon:9606\n", + "1 uniprot:A0A024RBG1 go:0046872 go.ref:0000043 ncbitaxon:9606\n", + "2 uniprot:A0A024RBG1 go:0005829 go.ref:0000052 ncbitaxon:9606\n", + "3 uniprot:A0A075B6H7 go:0002250 go.ref:0000043 ncbitaxon:9606\n", + "4 uniprot:A0A075B6H7 go:0005886 go.ref:0000044 ncbitaxon:9606" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "brpd.normalize_prefixes(df, column=0)\n", + "brpd.normalize_prefixes(df, column=\"subject_prefix\")\n", "\n", - "df[columns].head()" + "# Collapse split prefix/identifier columns together into curies\n", + "brpd.pd_collapse_to_curies(\n", + " df, prefix_column=\"subject_prefix\", identifier_column=\"subject_identifier\", target_column=\"subject_curie\",\n", + ")\n", + "\n", + "brpd.normalize_curies(df, column=\"object_curie\")\n", + "brpd.normalize_curies(df, column=\"reference_curie\")\n", + "brpd.normalize_curies(df, column=\"taxon_curie\")\n", + "\n", + "df = df[[\"subject_curie\", \"object_curie\", \"reference_curie\", \"taxon_curie\"]]\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c06bb2b2", + "metadata": {}, + "source": [ + "## Prefixes" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "933d04cb", + "execution_count": 3, + "id": "c6da0e76", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 of 100 (0%) rows with the following prefixes need to be fixed: []\n" + "ename": "KeyError", + "evalue": "'subject_prefix'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/.virtualenvs/indra/lib/python3.11/site-packages/pandas/core/indexes/base.py:3790\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3789\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 3790\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcasted_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3791\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32mindex.pyx:152\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mindex.pyx:181\u001b[0m, in \u001b[0;36mpandas._libs.index.IndexEngine.get_loc\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7080\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "File \u001b[0;32mpandas/_libs/hashtable_class_helper.pxi:7088\u001b[0m, in \u001b[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'subject_prefix'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m idx \u001b[38;5;241m=\u001b[39m \u001b[43mbrpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_prefixes\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msubject_prefix\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m brpd\u001b[38;5;241m.\u001b[39msummarize_prefix_validation(df, idx, column\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msubject_prefix\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/dev/bioregistry/src/bioregistry/pandas.py:154\u001b[0m, in \u001b[0;36mvalidate_prefixes\u001b[0;34m(df, column, target_column)\u001b[0m\n\u001b[1;32m 129\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Validate prefixes in a given column.\u001b[39;00m\n\u001b[1;32m 130\u001b[0m \n\u001b[1;32m 131\u001b[0m \u001b[38;5;124;03m:param df: A DataFrame\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;124;03m invalid_prefix_df = df[~idx]\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 153\u001b[0m column \u001b[38;5;241m=\u001b[39m _norm_column(df, column)\n\u001b[0;32m--> 154\u001b[0m results \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[43mcolumn\u001b[49m\u001b[43m]\u001b[49m\u001b[38;5;241m.\u001b[39mmap(\u001b[38;5;28;01mlambda\u001b[39;00m x: bioregistry\u001b[38;5;241m.\u001b[39mnormalize_prefix(x) \u001b[38;5;241m==\u001b[39m x, na_action\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 155\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m target_column:\n\u001b[1;32m 156\u001b[0m df[target_column] \u001b[38;5;241m=\u001b[39m results\n", + "File \u001b[0;32m~/.virtualenvs/indra/lib/python3.11/site-packages/pandas/core/frame.py:3893\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3891\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcolumns\u001b[38;5;241m.\u001b[39mnlevels \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 3892\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_getitem_multilevel(key)\n\u001b[0;32m-> 3893\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_loc\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3894\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_integer(indexer):\n\u001b[1;32m 3895\u001b[0m indexer \u001b[38;5;241m=\u001b[39m [indexer]\n", + "File \u001b[0;32m~/.virtualenvs/indra/lib/python3.11/site-packages/pandas/core/indexes/base.py:3797\u001b[0m, in \u001b[0;36mIndex.get_loc\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 3792\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(casted_key, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;129;01mor\u001b[39;00m (\n\u001b[1;32m 3793\u001b[0m \u001b[38;5;28misinstance\u001b[39m(casted_key, abc\u001b[38;5;241m.\u001b[39mIterable)\n\u001b[1;32m 3794\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28many\u001b[39m(\u001b[38;5;28misinstance\u001b[39m(x, \u001b[38;5;28mslice\u001b[39m) \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m casted_key)\n\u001b[1;32m 3795\u001b[0m ):\n\u001b[1;32m 3796\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m InvalidIndexError(key)\n\u001b[0;32m-> 3797\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01merr\u001b[39;00m\n\u001b[1;32m 3798\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m:\n\u001b[1;32m 3799\u001b[0m \u001b[38;5;66;03m# If we have a listlike key, _check_indexing_error will raise\u001b[39;00m\n\u001b[1;32m 3800\u001b[0m \u001b[38;5;66;03m# InvalidIndexError. Otherwise we fall through and re-raise\u001b[39;00m\n\u001b[1;32m 3801\u001b[0m \u001b[38;5;66;03m# the TypeError.\u001b[39;00m\n\u001b[1;32m 3802\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_check_indexing_error(key)\n", + "\u001b[0;31mKeyError\u001b[0m: 'subject_prefix'" ] } ], "source": [ - "idx = brpd.validate_prefixes(df, column=0)\n", + "idx = brpd.validate_prefixes(df, column=\"subject_prefix\")\n", "\n", - "brpd.summarize_prefix_validation(df, idx)" + "brpd.summarize_prefix_validation(df, idx, column=\"subject_prefix\")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c186b41-d381-4106-9a88-8284b12afa62", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "933d04cb", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "id": "57db3a76", @@ -300,18 +355,10 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "a7a33717", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "100 of 100 (100%) rows with the following CURIEs need to be fixed: ['uniprot']\n" - ] - } - ], + "outputs": [], "source": [ "idx = brpd.validate_curies(df, column=4)\n", "\n", @@ -320,110 +367,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "64a1c5bc", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0145712
0uniprotA0A024RBG1go:0003723GO_REF:0000043UniProtKB-KW:KW-0694taxon:9606
1uniprotA0A024RBG1go:0046872GO_REF:0000043UniProtKB-KW:KW-0479taxon:9606
2uniprotA0A024RBG1go:0005829GO_REF:0000052NaNtaxon:9606
3uniprotA0A075B6H7go:0002250GO_REF:0000043UniProtKB-KW:KW-1064taxon:9606
4uniprotA0A075B6H7go:0005886GO_REF:0000044UniProtKB-SubCell:SL-0039taxon:9606
\n", - "
" - ], - "text/plain": [ - " 0 1 4 5 7 \\\n", - "0 uniprot A0A024RBG1 go:0003723 GO_REF:0000043 UniProtKB-KW:KW-0694 \n", - "1 uniprot A0A024RBG1 go:0046872 GO_REF:0000043 UniProtKB-KW:KW-0479 \n", - "2 uniprot A0A024RBG1 go:0005829 GO_REF:0000052 NaN \n", - "3 uniprot A0A075B6H7 go:0002250 GO_REF:0000043 UniProtKB-KW:KW-1064 \n", - "4 uniprot A0A075B6H7 go:0005886 GO_REF:0000044 UniProtKB-SubCell:SL-0039 \n", - "\n", - " 12 \n", - "0 taxon:9606 \n", - "1 taxon:9606 \n", - "2 taxon:9606 \n", - "3 taxon:9606 \n", - "4 taxon:9606 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "brpd.normalize_curies(df, column=4)\n", "\n", @@ -432,18 +379,10 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "5772c6c4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 of 100 (0%) rows with the following CURIEs need to be fixed: []\n" - ] - } - ], + "outputs": [], "source": [ "idx = brpd.validate_curies(df, column=4)\n", "\n", @@ -460,18 +399,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "0faf0720", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 rows have invalid identifiers\n" - ] - } - ], + "outputs": [], "source": [ "idx = brpd.validate_identifiers(df, column=1, prefix_column=0, use_tqdm=True)\n", "print(f\"{(~idx).sum():,} rows have invalid identifiers\")" @@ -479,125 +410,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "bc2d73f2", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "(~idx).sum()" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "id": "5691ff03", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
145712
0uniprot:A0A024RBG1go:0003723GO_REF:0000043UniProtKB-KW:KW-0694taxon:9606
1uniprot:A0A024RBG1go:0046872GO_REF:0000043UniProtKB-KW:KW-0479taxon:9606
2uniprot:A0A024RBG1go:0005829GO_REF:0000052NaNtaxon:9606
3uniprot:A0A075B6H7go:0002250GO_REF:0000043UniProtKB-KW:KW-1064taxon:9606
4uniprot:A0A075B6H7go:0005886GO_REF:0000044UniProtKB-SubCell:SL-0039taxon:9606
\n", - "
" - ], - "text/plain": [ - " 1 4 5 7 \\\n", - "0 uniprot:A0A024RBG1 go:0003723 GO_REF:0000043 UniProtKB-KW:KW-0694 \n", - "1 uniprot:A0A024RBG1 go:0046872 GO_REF:0000043 UniProtKB-KW:KW-0479 \n", - "2 uniprot:A0A024RBG1 go:0005829 GO_REF:0000052 NaN \n", - "3 uniprot:A0A075B6H7 go:0002250 GO_REF:0000043 UniProtKB-KW:KW-1064 \n", - "4 uniprot:A0A075B6H7 go:0005886 GO_REF:0000044 UniProtKB-SubCell:SL-0039 \n", - "\n", - " 12 \n", - "0 taxon:9606 \n", - "1 taxon:9606 \n", - "2 taxon:9606 \n", - "3 taxon:9606 \n", - "4 taxon:9606 " - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "brpd.identifiers_to_curies(df, column=1, prefix_column=0)\n", "\n", @@ -623,7 +449,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.6" } }, "nbformat": 4, diff --git a/src/bioregistry/pandas.py b/src/bioregistry/pandas.py index bcbade453..c86b4329d 100644 --- a/src/bioregistry/pandas.py +++ b/src/bioregistry/pandas.py @@ -33,6 +33,7 @@ "curies_to_iris", "curies_to_identifiers", "iris_to_curies", + "pd_collapse_to_curies", ] logger = logging.getLogger(__name__) @@ -156,14 +157,14 @@ def validate_prefixes( return results -def summarize_prefix_validation(df: pd.DataFrame, idx: pd.Series) -> None: +def summarize_prefix_validation(df: pd.DataFrame, idx: pd.Series, column) -> None: """Provide a summary of prefix validation.""" # TODO add suggestions on what to do next, e.g.:, # 1. can some be normalized? use normalization function # 2. slice out invalid content # 3. make new prefix request to Bioregistry count = (~idx).sum() - unique = sorted(df[~idx][0].unique()) + unique = sorted(df[~idx][column].unique()) print( # noqa:T201 f"{count:,} of {len(df.index):,} ({count / len(df.index):.0%})", @@ -553,3 +554,17 @@ def iris_to_curies( """ column = _norm_column(df, column) df[target_column or column] = df[column].map(bioregistry.curie_from_iri, na_action="ignore") + + +def pd_collapse_to_curies(df: pd.DataFrame, prefix_column: Union[int, str], identifier_column: Union[int, str], *, target_column: str) -> None: + prefix_column = _norm_column(df, prefix_column) + identifier_column = _norm_column(df, identifier_column) + df[target_column] = [ + f"{prefix}:{identifier}" + for prefix, identifier in df[[prefix_column, identifier_column]].values + ] + del df[prefix_column] + del df[identifier_column] + + +