From 34e6f71fc4ac46bbecdca5e6d4b17fd94ea44437 Mon Sep 17 00:00:00 2001 From: Cannon Date: Tue, 15 Oct 2024 13:54:55 -0400 Subject: [PATCH] in-progress: variation overlap across sources --- notebooks/overlap.ipynb | 3700 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 3700 insertions(+) create mode 100644 notebooks/overlap.ipynb diff --git a/notebooks/overlap.ipynb b/notebooks/overlap.ipynb new file mode 100644 index 00000000..b8220dba --- /dev/null +++ b/notebooks/overlap.ipynb @@ -0,0 +1,3700 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "from neo4j import GraphDatabase\n", + "import pandas as pd\n", + "import requests\n", + "from tqdm import tqdm\n", + "import re" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Variant" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1008\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/5t/sfw5tjx56m10xb861_pd3wfm0000gq/T/ipykernel_51887/488723324.py:10: DeprecationWarning: Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.\n", + " with driver.session() as session:\n" + ] + } + ], + "source": [ + "query_variant_nodes = \"\"\"MATCH (v:Variation)\n", + " RETURN properties(v) AS Variation\n", + "\"\"\"\n", + "\n", + "result = execute_query(driver, query_variant_nodes)\n", + "\n", + "# Close the connection\n", + "driver.close()\n", + "print(len(result))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_g
0ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]
1ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNoneNoneW6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNM_004333.4:c.1799T>AAlleleNone
2ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAeNoneNoneOtc5ovrw906Ack087o1fhegB4jDRqCAeNC_000007.13:g.140453136A>TAlleleNone
3ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...[NP_005219.2:p.Thr790Met]sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-T790MAllele[NC_000007.13:g.55249071C>T]
4ga4gh:VA.uldmTYEfqQ0PtALYw8aiE14mYGs5bzkSNoneNoneuldmTYEfqQ0PtALYw8aiE14mYGs5bzkSNM_005228.4:c.2369C>TAlleleNone
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "2 ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "3 ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "4 ga4gh:VA.uldmTYEfqQ0PtALYw8aiE14mYGs5bzkS \n", + "\n", + " expression_hgvs_c \\\n", + "0 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "1 None \n", + "2 None \n", + "3 [ENST00000275493.2:c.2369C>T, NM_005228.4:c.23... \n", + "4 None \n", + "\n", + " expression_hgvs_p digest \\\n", + "0 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 None W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "2 None Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "3 [NP_005219.2:p.Thr790Met] sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "4 None uldmTYEfqQ0PtALYw8aiE14mYGs5bzkS \n", + "\n", + " label type expression_hgvs_g \n", + "0 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "1 NM_004333.4:c.1799T>A Allele None \n", + "2 NC_000007.13:g.140453136A>T Allele None \n", + "3 T790M Allele [NC_000007.13:g.55249071C>T] \n", + "4 NM_005228.4:c.2369C>T Allele None " + ] + }, + "execution_count": 168, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = []\n", + "for record in result:\n", + " variation = record.get('Variation', {})\n", + " row = {\n", + " 'id': variation.get('id', None),\n", + " 'expression_hgvs_c': variation.get('expression_hgvs_c', None),\n", + " 'expression_hgvs_p': variation.get('expression_hgvs_p', None),\n", + " 'digest': variation.get('digest', None),\n", + " 'label': variation.get('label', None),\n", + " 'type': variation.get('type', None),\n", + " 'expression_hgvs_g': variation.get('expression_hgvs_g', None),\n", + " }\n", + " data.append(row)\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "df[0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Study" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "# Function to create a connection to the Neo4j database\n", + "def create_db_connection(uri, user, password):\n", + " driver = GraphDatabase.driver(uri, auth=(user, password))\n", + " return driver\n", + "\n", + "# Function to execute a Cypher query\n", + "def execute_query(driver, query):\n", + " with driver.session() as session:\n", + " result = session.run(query)\n", + " return [record for record in result]\n", + "\n", + "# Connect to the Neo4j database\n", + "uri = \"bolt://localhost:7687\"\n", + "user = \"neo4j\"\n", + "password = \"password\" # Replace 'your_password' with your actual password\n", + "driver = create_db_connection(uri, user, password)\n", + "\n", + "# Strict, Must have Combination Therapies\n", + "query = \"\"\"MATCH (s:Study)\n", + " RETURN properties(s) AS Study\n", + "\"\"\"\n", + "\n", + "# Execute the query\n", + "result = execute_query(driver, query)\n", + "\n", + "# Close the connection\n", + "driver.close()\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "alleleOrigin\n", + "id\n", + "description\n", + "direction\n", + "predicate\n", + "type\n" + ] + } + ], + "source": [ + "for field in result[0]['Study']:\n", + " print(field)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Categorical Variation\n", + "First, basic at evidence level. Each variant evidence item will be normalized and the variation_id will be used between sources to demonstrate overlap of evidence across CIViC and Moalmanac. Later, will look across studies " + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/5t/sfw5tjx56m10xb861_pd3wfm0000gq/T/ipykernel_51887/488723324.py:10: DeprecationWarning: Using a driver after it has been closed is deprecated. Future versions of the driver will raise an error.\n", + " with driver.session() as session:\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1008\n", + "1048\n" + ] + } + ], + "source": [ + "query_variant_categorical = \"\"\"MATCH (v:Variation)\n", + " OPTIONAL MATCH (v)-[:HAS_DEFINING_CONTEXT]-(c:CategoricalVariation)\n", + " OPTIONAL MATCH (c)-[:HAS_VARIANT]-(s:Study)\n", + " RETURN properties(v) AS Variation,\n", + " properties(c) AS Category,\n", + " COUNT(s) as Count\n", + "\"\"\"\n", + "\n", + "# Execute the query\n", + "result = execute_query(driver, query_variant_categorical)\n", + "\n", + "# Close the connection\n", + "driver.close()\n", + "print(len(result))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "31" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result[0]['Count']" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_gcategory_idcategory_desccategory_labelapplied_study_count
0ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]moa.variant:144NoneBRAF p.V600E (Missense)31
1ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]civic.mpid:12BRAF V600E has been shown to be recurrent in m...BRAF V600E70
2ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNoneNoneW6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNM_004333.4:c.1799T>AAlleleNoneNoneNoneNone0
3ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAeNoneNoneOtc5ovrw906Ack087o1fhegB4jDRqCAeNC_000007.13:g.140453136A>TAlleleNoneNoneNoneNone0
4ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...[NP_005219.2:p.Thr790Met]sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-T790MAllele[NC_000007.13:g.55249071C>T]moa.variant:242NoneEGFR p.T790M (Missense)11
\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "2 ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "3 ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "4 ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "\n", + " expression_hgvs_c \\\n", + "0 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "1 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "2 None \n", + "3 None \n", + "4 [ENST00000275493.2:c.2369C>T, NM_005228.4:c.23... \n", + "\n", + " expression_hgvs_p digest \\\n", + "0 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "2 None W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "3 None Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "4 [NP_005219.2:p.Thr790Met] sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "\n", + " label type expression_hgvs_g \\\n", + "0 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "1 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "2 NM_004333.4:c.1799T>A Allele None \n", + "3 NC_000007.13:g.140453136A>T Allele None \n", + "4 T790M Allele [NC_000007.13:g.55249071C>T] \n", + "\n", + " category_id category_desc \\\n", + "0 moa.variant:144 None \n", + "1 civic.mpid:12 BRAF V600E has been shown to be recurrent in m... \n", + "2 None None \n", + "3 None None \n", + "4 moa.variant:242 None \n", + "\n", + " category_label applied_study_count \n", + "0 BRAF p.V600E (Missense) 31 \n", + "1 BRAF V600E 70 \n", + "2 None 0 \n", + "3 None 0 \n", + "4 EGFR p.T790M (Missense) 11 " + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = []\n", + "for record in result:\n", + " variation = record.get('Variation', {})\n", + " category = record.get('Category', {}) # Safe check if 'Category' key is missing\n", + " count = record.get('Count', {})\n", + " # print(record)\n", + " row = {\n", + " 'id': variation.get('id', None),\n", + " 'expression_hgvs_c': variation.get('expression_hgvs_c', None),\n", + " 'expression_hgvs_p': variation.get('expression_hgvs_p', None),\n", + " 'digest': variation.get('digest', None),\n", + " 'label': variation.get('label', None),\n", + " 'type': variation.get('type', None),\n", + " 'expression_hgvs_g': variation.get('expression_hgvs_g', None),\n", + " 'category_id': category.get('id', None) if isinstance(category, dict) else None,\n", + " 'category_desc': category.get('description', None) if isinstance(category, dict) else None,\n", + " 'category_label': category.get('label', None) if isinstance(category, dict) else None,\n", + " 'applied_study_count': count\n", + " }\n", + " data.append(row)\n", + "\n", + "df = pd.DataFrame(data)\n", + "\n", + "df[0:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1048it [00:38, 27.51it/s]\n" + ] + } + ], + "source": [ + "def normalize_variant(variant):\n", + " url = f'https://normalize.cancervariants.org/variation/normalize?q={variant}&hgvs_dup_del_mode=default'\n", + " r = requests.get(url) \n", + " if r.status_code == 200: \n", + " return(r.json())\n", + " else:\n", + " return None\n", + "\n", + "df['var_id'] = None\n", + "df['var_type'] = None\n", + "df['var_loc_id'] = None\n", + "df['var_loc_start'] = None\n", + "df['var_loc_end'] = None\n", + "\n", + "for idx, row in tqdm(df.iterrows()):\n", + " # print(idx)\n", + " if row['expression_hgvs_g'] == None:\n", + " continue\n", + "\n", + " result = normalize_variant(row['expression_hgvs_g'][0])\n", + "\n", + " if result == None:\n", + " continue\n", + " else:\n", + " # print(result)\n", + " if any(\"Unable to find classification for:\" in warning for warning in result['warnings']):\n", + " continue\n", + " if any(\"Unable to translate\" in warning for warning in result['warnings']):\n", + " continue\n", + "\n", + " df.at[idx, 'var_id'] = result['variation']['id']\n", + " df.at[idx, 'var_type'] = result['variation']['type']\n", + " df.at[idx, 'var_loc_id'] = result['variation']['location']['id']\n", + " df.at[idx, 'var_loc_start'] = result['variation']['location']['start']\n", + " df.at[idx, 'var_loc_end'] = result['variation']['location']['end']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_gcategory_idcategory_desccategory_labelapplied_study_countvar_idvar_typevar_loc_idvar_loc_startvar_loc_end
0ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]moa.variant:144NoneBRAF p.V600E (Missense)31ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MAllelega4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi140753335140753336
1ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]civic.mpid:12BRAF V600E has been shown to be recurrent in m...BRAF V600E70ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MAllelega4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi140753335140753336
2ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNoneNoneW6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNM_004333.4:c.1799T>AAlleleNoneNoneNoneNone0NoneNoneNoneNoneNone
3ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAeNoneNoneOtc5ovrw906Ack087o1fhegB4jDRqCAeNC_000007.13:g.140453136A>TAlleleNoneNoneNoneNone0NoneNoneNoneNoneNone
4ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...[NP_005219.2:p.Thr790Met]sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-T790MAllele[NC_000007.13:g.55249071C>T]moa.variant:242NoneEGFR p.T790M (Missense)11ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjypAllelega4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY5518137755181378
...................................................
1043ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqDNoneNoneuaKTab81sgTH6v26fsDJkGcKidJa_GqD2-209113113-G-CAlleleNoneNoneNoneNone0NoneNoneNoneNoneNone
1044ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9HNoneNoney-ioX4e_ySwP_LlplLNp0cz04a8BBr9HNoneAlleleNonemoa.variant:860NoneIDH1 p.R132S (Missense)1NoneNoneNoneNoneNone
1045ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjhNoneNoneVbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh2-209113113-G-TAlleleNoneNoneNoneNone0NoneNoneNoneNoneNone
1046ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9NNoneNone2zh_S3hn7AWJd00rhirLhhDW8VcVBo9NNoneAlleleNonemoa.variant:861NoneIDH1 p.R132L (Missense)1NoneNoneNoneNoneNone
1047ga4gh:VA.5lDLi65BeuLyMcfhmTprIk1PmmE3mF63NoneNone5lDLi65BeuLyMcfhmTprIk1PmmE3mF632-209113112-C-AAlleleNoneNoneNoneNone0NoneNoneNoneNoneNone
\n", + "

1048 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "2 ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "3 ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "4 ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "... ... \n", + "1043 ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD \n", + "1044 ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H \n", + "1045 ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh \n", + "1046 ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N \n", + "1047 ga4gh:VA.5lDLi65BeuLyMcfhmTprIk1PmmE3mF63 \n", + "\n", + " expression_hgvs_c \\\n", + "0 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "1 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "2 None \n", + "3 None \n", + "4 [ENST00000275493.2:c.2369C>T, NM_005228.4:c.23... \n", + "... ... \n", + "1043 None \n", + "1044 None \n", + "1045 None \n", + "1046 None \n", + "1047 None \n", + "\n", + " expression_hgvs_p digest \\\n", + "0 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "2 None W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "3 None Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "4 [NP_005219.2:p.Thr790Met] sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "... ... ... \n", + "1043 None uaKTab81sgTH6v26fsDJkGcKidJa_GqD \n", + "1044 None y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H \n", + "1045 None VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh \n", + "1046 None 2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N \n", + "1047 None 5lDLi65BeuLyMcfhmTprIk1PmmE3mF63 \n", + "\n", + " label type expression_hgvs_g \\\n", + "0 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "1 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "2 NM_004333.4:c.1799T>A Allele None \n", + "3 NC_000007.13:g.140453136A>T Allele None \n", + "4 T790M Allele [NC_000007.13:g.55249071C>T] \n", + "... ... ... ... \n", + "1043 2-209113113-G-C Allele None \n", + "1044 None Allele None \n", + "1045 2-209113113-G-T Allele None \n", + "1046 None Allele None \n", + "1047 2-209113112-C-A Allele None \n", + "\n", + " category_id category_desc \\\n", + "0 moa.variant:144 None \n", + "1 civic.mpid:12 BRAF V600E has been shown to be recurrent in m... \n", + "2 None None \n", + "3 None None \n", + "4 moa.variant:242 None \n", + "... ... ... \n", + "1043 None None \n", + "1044 moa.variant:860 None \n", + "1045 None None \n", + "1046 moa.variant:861 None \n", + "1047 None None \n", + "\n", + " category_label applied_study_count \\\n", + "0 BRAF p.V600E (Missense) 31 \n", + "1 BRAF V600E 70 \n", + "2 None 0 \n", + "3 None 0 \n", + "4 EGFR p.T790M (Missense) 11 \n", + "... ... ... \n", + "1043 None 0 \n", + "1044 IDH1 p.R132S (Missense) 1 \n", + "1045 None 0 \n", + "1046 IDH1 p.R132L (Missense) 1 \n", + "1047 None 0 \n", + "\n", + " var_id var_type \\\n", + "0 ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M Allele \n", + "1 ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M Allele \n", + "2 None None \n", + "3 None None \n", + "4 ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp Allele \n", + "... ... ... \n", + "1043 None None \n", + "1044 None None \n", + "1045 None None \n", + "1046 None None \n", + "1047 None None \n", + "\n", + " var_loc_id var_loc_start var_loc_end \n", + "0 ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi 140753335 140753336 \n", + "1 ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi 140753335 140753336 \n", + "2 None None None \n", + "3 None None None \n", + "4 ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY 55181377 55181378 \n", + "... ... ... ... \n", + "1043 None None None \n", + "1044 None None None \n", + "1045 None None None \n", + "1046 None None None \n", + "1047 None None None \n", + "\n", + "[1048 rows x 16 columns]" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "var_id label \n", + "ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7 D816V 3\n", + "ga4gh:VA.JX_FH6W6rts4VAX6GWlurlqheqrgHGoT E384* 2\n", + "ga4gh:VA.TAARa2cxRHmOiij9UBwvW-noMDoOq2x9 L858R 2\n", + "ga4gh:VA.Ol69g1SmOdYaopX-zIp42cHsWZCWrCj7 H1047L 2\n", + "ga4gh:VA.VzsVyqlcWS87LveLKdzeYwvmm7lz9ie1 K642E 2\n", + " ..\n", + "ga4gh:VA.NbIeg9oY7URUtee74IS69PsAgTVwqmMS E81K 1\n", + "ga4gh:VA.O7TpCfowJTi-tHr3uIA3ZU_Cg9XUDITD Y220C 1\n", + "ga4gh:VA.OdkVLBI2BYn4rmrjkqjEh6v_9RKfzswA D770_N771insGT 1\n", + "ga4gh:VA.POEl_3_26UPoaUTf3nqH0s77PWQJEGPD L597R 1\n", + "ga4gh:VA.zvOLR_KJgwNfZVpYbUA6IPmR86rWKp-5 D1228N 1\n", + "Name: count, Length: 261, dtype: int64" + ] + }, + "execution_count": 144, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['var_id','label']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
expression_hgvs_gvar_idlabelcategory_labelcategory_idapplied_study_count
0[NC_000007.13:g.140453136A>T]ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MV600EBRAF p.V600E (Missense)moa.variant:14431
1[NC_000007.13:g.140453136A>T]ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MV600EBRAF V600Ecivic.mpid:1270
2NoneNoneNM_004333.4:c.1799T>ANoneNone0
3NoneNoneNC_000007.13:g.140453136A>TNoneNone0
4[NC_000007.13:g.55249071C>T]ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjypT790MEGFR p.T790M (Missense)moa.variant:24211
.....................
1043NoneNone2-209113113-G-CNoneNone0
1044NoneNoneNoneIDH1 p.R132S (Missense)moa.variant:8601
1045NoneNone2-209113113-G-TNoneNone0
1046NoneNoneNoneIDH1 p.R132L (Missense)moa.variant:8611
1047NoneNone2-209113112-C-ANoneNone0
\n", + "

1048 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " expression_hgvs_g \\\n", + "0 [NC_000007.13:g.140453136A>T] \n", + "1 [NC_000007.13:g.140453136A>T] \n", + "2 None \n", + "3 None \n", + "4 [NC_000007.13:g.55249071C>T] \n", + "... ... \n", + "1043 None \n", + "1044 None \n", + "1045 None \n", + "1046 None \n", + "1047 None \n", + "\n", + " var_id label \\\n", + "0 ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M V600E \n", + "1 ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M V600E \n", + "2 None NM_004333.4:c.1799T>A \n", + "3 None NC_000007.13:g.140453136A>T \n", + "4 ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp T790M \n", + "... ... ... \n", + "1043 None 2-209113113-G-C \n", + "1044 None None \n", + "1045 None 2-209113113-G-T \n", + "1046 None None \n", + "1047 None 2-209113112-C-A \n", + "\n", + " category_label category_id applied_study_count \n", + "0 BRAF p.V600E (Missense) moa.variant:144 31 \n", + "1 BRAF V600E civic.mpid:12 70 \n", + "2 None None 0 \n", + "3 None None 0 \n", + "4 EGFR p.T790M (Missense) moa.variant:242 11 \n", + "... ... ... ... \n", + "1043 None None 0 \n", + "1044 IDH1 p.R132S (Missense) moa.variant:860 1 \n", + "1045 None None 0 \n", + "1046 IDH1 p.R132L (Missense) moa.variant:861 1 \n", + "1047 None None 0 \n", + "\n", + "[1048 rows x 6 columns]" + ] + }, + "execution_count": 151, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = df[['expression_hgvs_g','var_id','label','category_label','category_id','applied_study_count']]\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
expression_hgvs_gvar_idlabelcategory_labelcategory_idapplied_study_count
545[NC_000007.13:g.116423407G>A, NC_000007.14:g.1...ga4gh:VA.zvOLR_KJgwNfZVpYbUA6IPmR86rWKp-5D1228NMET D1228Ncivic.mpid:6451
758[NC_000011.9:g.108201096G>A]ga4gh:VA.zqaGzhlafiGlcmXVqwfdZ2EwAKM3xG4bC2488YATM C2488Ycivic.mpid:11461
253[NC_000004.11:g.55593612_55593614del]ga4gh:VA.zmmhHogAiHozB4_pcU_6L6ISfnTtoIpeV560DELKIT V560DELcivic.mpid:2021
353[NC_000006.11:g.152419920_152419921delinsAG]ga4gh:VA.zbjFtas3cyxI78Ph-xrWHNGlJdUE-Hm8L536QESR1 L536Qcivic.mpid:462
219[NC_000007.13:g.140453145A>T]ga4gh:VA.zATpR7iDy-_AbeeQ93IgDZwYYZwApallL597QBRAF L597Qcivic.mpid:5793
213[NC_000012.11:g.56478854G>A]ga4gh:VA.z7tojh3NvzxeTqX5seNcHlmCuB-s_-cFV104MERBB3 V104Mcivic.mpid:6821
162[NC_000007.13:g.55249005G>T]ga4gh:VA.yr3duXsAtLA9Sd79rm8szW7ILvFJAYWvS768IEGFR S768Icivic.mpid:5585
310[NC_000017.10:g.7577548C>T]ga4gh:VA.yr-4Fnb8Q_RBQD-JtGGQemDK3Mby1BXeG245STP53 G245Scivic.mpid:8531
425[NC_000003.11:g.178936091G>C]ga4gh:VA.yldbvaOLm7SnNhqBQM9XiY3BxKrtNcMNE545QPIK3CA E545Qcivic.mpid:8551
424[NC_000003.11:g.178936091G>C]ga4gh:VA.yldbvaOLm7SnNhqBQM9XiY3BxKrtNcMNE545QPIK3CA p.E545Q (Missense)moa.variant:4591
340[NC_000002.11:g.209113112C>T]ga4gh:VA.y3AcrkM1rtuFjcKurTbKEBQGYa40qvZ2R132HIDH1 p.R132H (Missense)moa.variant:3106
341[NC_000002.11:g.209113112C>T]ga4gh:VA.y3AcrkM1rtuFjcKurTbKEBQGYa40qvZ2R132HIDH1 R132Hcivic.mpid:4162
764[NC_000007.14:g.55191852A>G, NC_000007.13:g.55...ga4gh:VA.xzmDAi2gLCL4nFWP8LyRoe7O-sbghi2IE868GEGFR E868Gcivic.mpid:13901
832[NC_000003.11:g.178936092A>G]ga4gh:VA.xx5iQAtlm2hRJams6rLacyOblG2DvQCWE545GPIK3CA E545Gcivic.mpid:8571
831[NC_000003.11:g.178936092A>G]ga4gh:VA.xx5iQAtlm2hRJams6rLacyOblG2DvQCWE545GPIK3CA p.E545G (Missense)moa.variant:4661
460[NC_000007.14:g.55174015G>A, NC_000007.13:g.55...ga4gh:VA.wyD-G_ggt_ETBzCG2YsGFW1fFFHU7npaG719DEGFR G719Dcivic.mpid:13281
788[NC_000004.11:g.55961059G>A]ga4gh:VA.wxuwNDS7ihRW_or8TKB-iRjadhyx-JUiR961WKDR R961Wcivic.mpid:4981
101[NC_000001.10:g.161479745A>G]ga4gh:VA.wWLhkZ3ryEQUXyybfItMdtIAnQkLZtRNH167RFCGR2A H167Rcivic.mpid:4521
333[NC_000013.10:g.28592642C>G]ga4gh:VA.wV0gjNrVs2cIsNLtqJZUSkKM4zqcSp7eD835HFLT3 D835Hcivic.mpid:6082
23[NC_000012.11:g.25398281C>T]ga4gh:VA.wQro48c_oFxB35UNb2O1Z7Vj6rXnJywlG13DKRAS G13Dcivic.mpid:8114
44[NC_000003.11:g.41266137C>T]ga4gh:VA.wPaeM1o9r1fSzqTO2BUv8ahkT5-xhLARS45FCTNNB1 S45Fcivic.mpid:12601
805[NC_000016.9:g.69745145G>A]ga4gh:VA.wLURDNCy6u6aGEppsKH8cbrn1nZl5ZYXP187SNQO1 P187Scivic.mpid:3981
808[NC_000007.13:g.55227884C>T]ga4gh:VA.vpEbsePV5eX6035RuTKyul6NGvo-KMudR451CEGFR R451Ccivic.mpid:4501
715[NC_000017.10:g.37881392A>G]ga4gh:VA.vhq5ifNdKqsndDf53CX0kWxgWggPBRXnT862AERBB2 T862Acivic.mpid:8451
454[NC_000004.11:g.55599333A>G]ga4gh:VA.vYd5YGENoBuUWSoXOvNZtH6TIs_oP-TND820GKIT D820Gcivic.mpid:12401
811[NC_000001.10:g.11184573G>T]ga4gh:VA.vLy655kG5H6doX1bP9ymaivrQpL8JWyGS2215YMTOR S2215Ycivic.mpid:5381
87[NC_000017.10:g.7577120C>T]ga4gh:VA.vCOV0bY9s6tDpYIN2VWtUe10Njr-Ai7SR273HTP53 R273Hcivic.mpid:1222
48[NC_000003.11:g.178936091G>A]ga4gh:VA.uqD1zF6cMOYhBp8ur5oLTMqV0jxiC2a7E545KPIK3CA E545Kcivic.mpid:10418
47[NC_000003.11:g.178936091G>A]ga4gh:VA.uqD1zF6cMOYhBp8ur5oLTMqV0jxiC2a7E545KPIK3CA p.E545K (Missense)moa.variant:4671
153[NC_000010.10:g.104852955C>T]ga4gh:VA.ugrh8GrTjT-VBp-nbf7GaesWACeYjWZ0R367QNT5C2 R367Qcivic.mpid:2344
682[NC_000004.11:g.55141036T>C]ga4gh:VA.uVADAvdFFVKPUoYjXWtcZk-7eEB20U14V561APDGFRA V561Acivic.mpid:2431
644[NC_000001.10:g.162745497A>T]ga4gh:VA.uQujclzyHLBIg9aP-IJLieWPM9T5M51YI638FDDR2 I638Fcivic.mpid:1431
706[NC_000004.11:g.55152090G>A]ga4gh:VA.uA2r6EwLxX-hzJ8ugmZIqqRffc0oAE4pR841KPDGFRA R841Kcivic.mpid:8371
451[NC_000004.11:g.55599333A>C]ga4gh:VA.u8O0LC07v1S3Lq1TsRNBQV8vFuDJVWujD820AKIT D820Acivic.mpid:12391
862[NC_000017.10:g.37881378A>G]ga4gh:VA.u7fmXZc2P1xlkCEZ4CeMR8Ag0UJjXX4-N857SERBB2 N857Scivic.mpid:8471
370[NC_000008.10:g.38274849G>T]ga4gh:VA.u2sHe5gOQvbCZRZZMFEQ2Sl0hYbNwygfN546KFGFR1 N546Kcivic.mpid:5112
795[NC_000003.11:g.178921553T>A]ga4gh:VA.tfgawi1GKnwuikAqBVZGFxX12380TJcVN345KPIK3CA N345Kcivic.mpid:9041
746[NC_000011.9:g.108199938T>C]ga4gh:VA.tYRtC-EJVMtiZIrMcrIf6_wAc5zeUshAL2427PATM L2427Pcivic.mpid:11371
125[NC_000017.10:g.7578406C>T]ga4gh:VA.tKbKBAAo6GbAm7D1Mj2R3F8I1VB1GGlMR175HTP53 R175Hcivic.mpid:1162
137[NC_000002.11:g.29432664C>T]ga4gh:VA.t3vU-1OKU2n8Hp6J5zIgf-fdOngWPcJIR1275QALK R1275Qcivic.mpid:96
104[NC_000014.8:g.105246551C>T]ga4gh:VA.swSBQXaa6uybP38HHxbddf9Cvl8JzGEWE17KAKT1 p.E17K (Missense)moa.variant:1071
105[NC_000014.8:g.105246551C>T]ga4gh:VA.swSBQXaa6uybP38HHxbddf9Cvl8JzGEWE17KAKT1 E17Kcivic.mpid:45
694[NC_000004.11:g.55968180T>A]ga4gh:VA.svLVRwCf-NY8TtCsrz409vLrGSlNIhACD717VKDR D717Vcivic.mpid:4641
688[NC_000001.10:g.11184580G>A]ga4gh:VA.sSh_-iUZCGTj0jNilVZBbo-ozXOIn7_vP2213SMTOR P2213Scivic.mpid:2801
871[NC_000007.13:g.140453154T>A]ga4gh:VA.sPKyzLqx4hThPeOaNqHkIINpLwpbn2sDD594VBRAF D594Vcivic.mpid:5761
122[NC_000004.11:g.55593464A>C]ga4gh:VA.sOzRU-tQ3ZFWYJSe26eag0maq4n_2mzQM541LKIT M541Lcivic.mpid:2012
358[NC_000006.11:g.152419922T>A]ga4gh:VA.sD5jnSppiGf7A7Pc_4oiuBTPRFO9yDRdY537NESR1 Y537Ncivic.mpid:492
331[NC_000007.13:g.140453145_140453146delinsGA]ga4gh:VA.rUz4rYx_rneGPOX1aAGU5etxHoDlH6aKL597SBRAF L597Scivic.mpid:5782
233[NC_000007.13:g.140481411C>A]ga4gh:VA.rTU_Q-4E6SYZRuGw6uLmg0sekqgcyAB6G466VBRAF G466Vcivic.mpid:20984
634[NC_000001.10:g.162724415C>G]ga4gh:VA.rC1Ksowb7DJ4T0lL7t8w_5YBBXtCdN9wL63VDDR2 L63Vcivic.mpid:1391
\n", + "
" + ], + "text/plain": [ + " expression_hgvs_g \\\n", + "545 [NC_000007.13:g.116423407G>A, NC_000007.14:g.1... \n", + "758 [NC_000011.9:g.108201096G>A] \n", + "253 [NC_000004.11:g.55593612_55593614del] \n", + "353 [NC_000006.11:g.152419920_152419921delinsAG] \n", + "219 [NC_000007.13:g.140453145A>T] \n", + "213 [NC_000012.11:g.56478854G>A] \n", + "162 [NC_000007.13:g.55249005G>T] \n", + "310 [NC_000017.10:g.7577548C>T] \n", + "425 [NC_000003.11:g.178936091G>C] \n", + "424 [NC_000003.11:g.178936091G>C] \n", + "340 [NC_000002.11:g.209113112C>T] \n", + "341 [NC_000002.11:g.209113112C>T] \n", + "764 [NC_000007.14:g.55191852A>G, NC_000007.13:g.55... \n", + "832 [NC_000003.11:g.178936092A>G] \n", + "831 [NC_000003.11:g.178936092A>G] \n", + "460 [NC_000007.14:g.55174015G>A, NC_000007.13:g.55... \n", + "788 [NC_000004.11:g.55961059G>A] \n", + "101 [NC_000001.10:g.161479745A>G] \n", + "333 [NC_000013.10:g.28592642C>G] \n", + "23 [NC_000012.11:g.25398281C>T] \n", + "44 [NC_000003.11:g.41266137C>T] \n", + "805 [NC_000016.9:g.69745145G>A] \n", + "808 [NC_000007.13:g.55227884C>T] \n", + "715 [NC_000017.10:g.37881392A>G] \n", + "454 [NC_000004.11:g.55599333A>G] \n", + "811 [NC_000001.10:g.11184573G>T] \n", + "87 [NC_000017.10:g.7577120C>T] \n", + "48 [NC_000003.11:g.178936091G>A] \n", + "47 [NC_000003.11:g.178936091G>A] \n", + "153 [NC_000010.10:g.104852955C>T] \n", + "682 [NC_000004.11:g.55141036T>C] \n", + "644 [NC_000001.10:g.162745497A>T] \n", + "706 [NC_000004.11:g.55152090G>A] \n", + "451 [NC_000004.11:g.55599333A>C] \n", + "862 [NC_000017.10:g.37881378A>G] \n", + "370 [NC_000008.10:g.38274849G>T] \n", + "795 [NC_000003.11:g.178921553T>A] \n", + "746 [NC_000011.9:g.108199938T>C] \n", + "125 [NC_000017.10:g.7578406C>T] \n", + "137 [NC_000002.11:g.29432664C>T] \n", + "104 [NC_000014.8:g.105246551C>T] \n", + "105 [NC_000014.8:g.105246551C>T] \n", + "694 [NC_000004.11:g.55968180T>A] \n", + "688 [NC_000001.10:g.11184580G>A] \n", + "871 [NC_000007.13:g.140453154T>A] \n", + "122 [NC_000004.11:g.55593464A>C] \n", + "358 [NC_000006.11:g.152419922T>A] \n", + "331 [NC_000007.13:g.140453145_140453146delinsGA] \n", + "233 [NC_000007.13:g.140481411C>A] \n", + "634 [NC_000001.10:g.162724415C>G] \n", + "\n", + " var_id label \\\n", + "545 ga4gh:VA.zvOLR_KJgwNfZVpYbUA6IPmR86rWKp-5 D1228N \n", + "758 ga4gh:VA.zqaGzhlafiGlcmXVqwfdZ2EwAKM3xG4b C2488Y \n", + "253 ga4gh:VA.zmmhHogAiHozB4_pcU_6L6ISfnTtoIpe V560DEL \n", + "353 ga4gh:VA.zbjFtas3cyxI78Ph-xrWHNGlJdUE-Hm8 L536Q \n", + "219 ga4gh:VA.zATpR7iDy-_AbeeQ93IgDZwYYZwApall L597Q \n", + "213 ga4gh:VA.z7tojh3NvzxeTqX5seNcHlmCuB-s_-cF V104M \n", + "162 ga4gh:VA.yr3duXsAtLA9Sd79rm8szW7ILvFJAYWv S768I \n", + "310 ga4gh:VA.yr-4Fnb8Q_RBQD-JtGGQemDK3Mby1BXe G245S \n", + "425 ga4gh:VA.yldbvaOLm7SnNhqBQM9XiY3BxKrtNcMN E545Q \n", + "424 ga4gh:VA.yldbvaOLm7SnNhqBQM9XiY3BxKrtNcMN E545Q \n", + "340 ga4gh:VA.y3AcrkM1rtuFjcKurTbKEBQGYa40qvZ2 R132H \n", + "341 ga4gh:VA.y3AcrkM1rtuFjcKurTbKEBQGYa40qvZ2 R132H \n", + "764 ga4gh:VA.xzmDAi2gLCL4nFWP8LyRoe7O-sbghi2I E868G \n", + "832 ga4gh:VA.xx5iQAtlm2hRJams6rLacyOblG2DvQCW E545G \n", + "831 ga4gh:VA.xx5iQAtlm2hRJams6rLacyOblG2DvQCW E545G \n", + "460 ga4gh:VA.wyD-G_ggt_ETBzCG2YsGFW1fFFHU7npa G719D \n", + "788 ga4gh:VA.wxuwNDS7ihRW_or8TKB-iRjadhyx-JUi R961W \n", + "101 ga4gh:VA.wWLhkZ3ryEQUXyybfItMdtIAnQkLZtRN H167R \n", + "333 ga4gh:VA.wV0gjNrVs2cIsNLtqJZUSkKM4zqcSp7e D835H \n", + "23 ga4gh:VA.wQro48c_oFxB35UNb2O1Z7Vj6rXnJywl G13D \n", + "44 ga4gh:VA.wPaeM1o9r1fSzqTO2BUv8ahkT5-xhLAR S45F \n", + "805 ga4gh:VA.wLURDNCy6u6aGEppsKH8cbrn1nZl5ZYX P187S \n", + "808 ga4gh:VA.vpEbsePV5eX6035RuTKyul6NGvo-KMud R451C \n", + "715 ga4gh:VA.vhq5ifNdKqsndDf53CX0kWxgWggPBRXn T862A \n", + "454 ga4gh:VA.vYd5YGENoBuUWSoXOvNZtH6TIs_oP-TN D820G \n", + "811 ga4gh:VA.vLy655kG5H6doX1bP9ymaivrQpL8JWyG S2215Y \n", + "87 ga4gh:VA.vCOV0bY9s6tDpYIN2VWtUe10Njr-Ai7S R273H \n", + "48 ga4gh:VA.uqD1zF6cMOYhBp8ur5oLTMqV0jxiC2a7 E545K \n", + "47 ga4gh:VA.uqD1zF6cMOYhBp8ur5oLTMqV0jxiC2a7 E545K \n", + "153 ga4gh:VA.ugrh8GrTjT-VBp-nbf7GaesWACeYjWZ0 R367Q \n", + "682 ga4gh:VA.uVADAvdFFVKPUoYjXWtcZk-7eEB20U14 V561A \n", + "644 ga4gh:VA.uQujclzyHLBIg9aP-IJLieWPM9T5M51Y I638F \n", + "706 ga4gh:VA.uA2r6EwLxX-hzJ8ugmZIqqRffc0oAE4p R841K \n", + "451 ga4gh:VA.u8O0LC07v1S3Lq1TsRNBQV8vFuDJVWuj D820A \n", + "862 ga4gh:VA.u7fmXZc2P1xlkCEZ4CeMR8Ag0UJjXX4- N857S \n", + "370 ga4gh:VA.u2sHe5gOQvbCZRZZMFEQ2Sl0hYbNwygf N546K \n", + "795 ga4gh:VA.tfgawi1GKnwuikAqBVZGFxX12380TJcV N345K \n", + "746 ga4gh:VA.tYRtC-EJVMtiZIrMcrIf6_wAc5zeUshA L2427P \n", + "125 ga4gh:VA.tKbKBAAo6GbAm7D1Mj2R3F8I1VB1GGlM R175H \n", + "137 ga4gh:VA.t3vU-1OKU2n8Hp6J5zIgf-fdOngWPcJI R1275Q \n", + "104 ga4gh:VA.swSBQXaa6uybP38HHxbddf9Cvl8JzGEW E17K \n", + "105 ga4gh:VA.swSBQXaa6uybP38HHxbddf9Cvl8JzGEW E17K \n", + "694 ga4gh:VA.svLVRwCf-NY8TtCsrz409vLrGSlNIhAC D717V \n", + "688 ga4gh:VA.sSh_-iUZCGTj0jNilVZBbo-ozXOIn7_v P2213S \n", + "871 ga4gh:VA.sPKyzLqx4hThPeOaNqHkIINpLwpbn2sD D594V \n", + "122 ga4gh:VA.sOzRU-tQ3ZFWYJSe26eag0maq4n_2mzQ M541L \n", + "358 ga4gh:VA.sD5jnSppiGf7A7Pc_4oiuBTPRFO9yDRd Y537N \n", + "331 ga4gh:VA.rUz4rYx_rneGPOX1aAGU5etxHoDlH6aK L597S \n", + "233 ga4gh:VA.rTU_Q-4E6SYZRuGw6uLmg0sekqgcyAB6 G466V \n", + "634 ga4gh:VA.rC1Ksowb7DJ4T0lL7t8w_5YBBXtCdN9w L63V \n", + "\n", + " category_label category_id applied_study_count \n", + "545 MET D1228N civic.mpid:645 1 \n", + "758 ATM C2488Y civic.mpid:1146 1 \n", + "253 KIT V560DEL civic.mpid:202 1 \n", + "353 ESR1 L536Q civic.mpid:46 2 \n", + "219 BRAF L597Q civic.mpid:579 3 \n", + "213 ERBB3 V104M civic.mpid:682 1 \n", + "162 EGFR S768I civic.mpid:558 5 \n", + "310 TP53 G245S civic.mpid:853 1 \n", + "425 PIK3CA E545Q civic.mpid:855 1 \n", + "424 PIK3CA p.E545Q (Missense) moa.variant:459 1 \n", + "340 IDH1 p.R132H (Missense) moa.variant:310 6 \n", + "341 IDH1 R132H civic.mpid:416 2 \n", + "764 EGFR E868G civic.mpid:1390 1 \n", + "832 PIK3CA E545G civic.mpid:857 1 \n", + "831 PIK3CA p.E545G (Missense) moa.variant:466 1 \n", + "460 EGFR G719D civic.mpid:1328 1 \n", + "788 KDR R961W civic.mpid:498 1 \n", + "101 FCGR2A H167R civic.mpid:452 1 \n", + "333 FLT3 D835H civic.mpid:608 2 \n", + "23 KRAS G13D civic.mpid:81 14 \n", + "44 CTNNB1 S45F civic.mpid:1260 1 \n", + "805 NQO1 P187S civic.mpid:398 1 \n", + "808 EGFR R451C civic.mpid:450 1 \n", + "715 ERBB2 T862A civic.mpid:845 1 \n", + "454 KIT D820G civic.mpid:1240 1 \n", + "811 MTOR S2215Y civic.mpid:538 1 \n", + "87 TP53 R273H civic.mpid:122 2 \n", + "48 PIK3CA E545K civic.mpid:104 18 \n", + "47 PIK3CA p.E545K (Missense) moa.variant:467 1 \n", + "153 NT5C2 R367Q civic.mpid:234 4 \n", + "682 PDGFRA V561A civic.mpid:243 1 \n", + "644 DDR2 I638F civic.mpid:143 1 \n", + "706 PDGFRA R841K civic.mpid:837 1 \n", + "451 KIT D820A civic.mpid:1239 1 \n", + "862 ERBB2 N857S civic.mpid:847 1 \n", + "370 FGFR1 N546K civic.mpid:511 2 \n", + "795 PIK3CA N345K civic.mpid:904 1 \n", + "746 ATM L2427P civic.mpid:1137 1 \n", + "125 TP53 R175H civic.mpid:116 2 \n", + "137 ALK R1275Q civic.mpid:9 6 \n", + "104 AKT1 p.E17K (Missense) moa.variant:107 1 \n", + "105 AKT1 E17K civic.mpid:4 5 \n", + "694 KDR D717V civic.mpid:464 1 \n", + "688 MTOR P2213S civic.mpid:280 1 \n", + "871 BRAF D594V civic.mpid:576 1 \n", + "122 KIT M541L civic.mpid:201 2 \n", + "358 ESR1 Y537N civic.mpid:49 2 \n", + "331 BRAF L597S civic.mpid:578 2 \n", + "233 BRAF G466V civic.mpid:2098 4 \n", + "634 DDR2 L63V civic.mpid:139 1 " + ] + }, + "execution_count": 164, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.sort_values(by='var_id',ascending=False)[0:50]" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
expression_hgvs_gvar_idlabelcategory_labelcategory_idapplied_study_count
58[NC_000004.11:g.55599321A>T]ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7D816VKIT p.D816V (Missense)moa.variant:3601
59[NC_000004.11:g.55599321A>T]ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7D816VNOT KIT D816Vcivic.mpid:43531
60[NC_000004.11:g.55599321A>T]ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7D816VKIT D816Vcivic.mpid:653
\n", + "
" + ], + "text/plain": [ + " expression_hgvs_g var_id \\\n", + "58 [NC_000004.11:g.55599321A>T] ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7 \n", + "59 [NC_000004.11:g.55599321A>T] ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7 \n", + "60 [NC_000004.11:g.55599321A>T] ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7 \n", + "\n", + " label category_label category_id applied_study_count \n", + "58 D816V KIT p.D816V (Missense) moa.variant:360 1 \n", + "59 D816V NOT KIT D816V civic.mpid:4353 1 \n", + "60 D816V KIT D816V civic.mpid:65 3 " + ] + }, + "execution_count": 152, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[data['var_id']=='ga4gh:VA.gdbp5quI5YscqYY01qlHfuJkT7R41cR7']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Inspects" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_gcategory_idcategory_desccategory_labelapplied_study_countvar_idvar_typevar_loc_idvar_loc_startvar_loc_end
912ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQNoneNoneD6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQNoneAlleleNonemoa.variant:66NoneABL1 p.T315I (Missense)6NoneNoneNoneNoneNone
914ga4gh:VA.37YVc2HpRgXOq3HtsjcL1eiyLhDXLmYyNoneNone37YVc2HpRgXOq3HtsjcL1eiyLhDXLmYyNoneAlleleNonemoa.variant:68NoneABL1 p.T315A (Missense)3NoneNoneNoneNoneNone
916ga4gh:VA.ZJZc_8PkTSu-twmaJvj6yQXvPJHElPZcNoneNoneZJZc_8PkTSu-twmaJvj6yQXvPJHElPZcNoneAlleleNonemoa.variant:70NoneABL1 p.F317L (Missense)3NoneNoneNoneNoneNone
918ga4gh:VA.SnGz3wUT2JaIid12PoI6OHc4t7LgHVj1NoneNoneSnGz3wUT2JaIid12PoI6OHc4t7LgHVj1NoneAlleleNonemoa.variant:71NoneABL1 p.F317V (Missense)3NoneNoneNoneNoneNone
920ga4gh:VA.wDDVWfpuxnuYkLj5_0OrnaBvrJAXYcJANoneNonewDDVWfpuxnuYkLj5_0OrnaBvrJAXYcJANoneAlleleNonemoa.variant:72NoneABL1 p.F317I (Missense)3NoneNoneNoneNoneNone
...................................................
1036ga4gh:VA.zS_-FFo-cPjizcBEraRCMJ-wfCLNXM9FNoneNonezS_-FFo-cPjizcBEraRCMJ-wfCLNXM9FNoneAlleleNonemoa.variant:476NonePIK3CA p.P539R (Missense)1NoneNoneNoneNoneNone
1039ga4gh:VA.8aRynLgwo0OYPIuCyiw6BGNd8oLxoGXxNoneNone8aRynLgwo0OYPIuCyiw6BGNd8oLxoGXxNoneAlleleNonemoa.variant:478NonePIK3CA p.Y1021C (Missense)1NoneNoneNoneNoneNone
1042ga4gh:VA.aAXNeFGBgeJUGbun-bKvgoW204tC1xdpNoneNoneaAXNeFGBgeJUGbun-bKvgoW204tC1xdpNoneAlleleNonemoa.variant:859NoneIDH1 p.R132G (Missense)1NoneNoneNoneNoneNone
1044ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9HNoneNoney-ioX4e_ySwP_LlplLNp0cz04a8BBr9HNoneAlleleNonemoa.variant:860NoneIDH1 p.R132S (Missense)1NoneNoneNoneNoneNone
1046ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9NNoneNone2zh_S3hn7AWJd00rhirLhhDW8VcVBo9NNoneAlleleNonemoa.variant:861NoneIDH1 p.R132L (Missense)1NoneNoneNoneNoneNone
\n", + "

62 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " id expression_hgvs_c \\\n", + "912 ga4gh:VA.D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ None \n", + "914 ga4gh:VA.37YVc2HpRgXOq3HtsjcL1eiyLhDXLmYy None \n", + "916 ga4gh:VA.ZJZc_8PkTSu-twmaJvj6yQXvPJHElPZc None \n", + "918 ga4gh:VA.SnGz3wUT2JaIid12PoI6OHc4t7LgHVj1 None \n", + "920 ga4gh:VA.wDDVWfpuxnuYkLj5_0OrnaBvrJAXYcJA None \n", + "... ... ... \n", + "1036 ga4gh:VA.zS_-FFo-cPjizcBEraRCMJ-wfCLNXM9F None \n", + "1039 ga4gh:VA.8aRynLgwo0OYPIuCyiw6BGNd8oLxoGXx None \n", + "1042 ga4gh:VA.aAXNeFGBgeJUGbun-bKvgoW204tC1xdp None \n", + "1044 ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H None \n", + "1046 ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N None \n", + "\n", + " expression_hgvs_p digest label type \\\n", + "912 None D6NzpWXKqBnbcZZrXNSXj4tMUwROKbsQ None Allele \n", + "914 None 37YVc2HpRgXOq3HtsjcL1eiyLhDXLmYy None Allele \n", + "916 None ZJZc_8PkTSu-twmaJvj6yQXvPJHElPZc None Allele \n", + "918 None SnGz3wUT2JaIid12PoI6OHc4t7LgHVj1 None Allele \n", + "920 None wDDVWfpuxnuYkLj5_0OrnaBvrJAXYcJA None Allele \n", + "... ... ... ... ... \n", + "1036 None zS_-FFo-cPjizcBEraRCMJ-wfCLNXM9F None Allele \n", + "1039 None 8aRynLgwo0OYPIuCyiw6BGNd8oLxoGXx None Allele \n", + "1042 None aAXNeFGBgeJUGbun-bKvgoW204tC1xdp None Allele \n", + "1044 None y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H None Allele \n", + "1046 None 2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N None Allele \n", + "\n", + " expression_hgvs_g category_id category_desc \\\n", + "912 None moa.variant:66 None \n", + "914 None moa.variant:68 None \n", + "916 None moa.variant:70 None \n", + "918 None moa.variant:71 None \n", + "920 None moa.variant:72 None \n", + "... ... ... ... \n", + "1036 None moa.variant:476 None \n", + "1039 None moa.variant:478 None \n", + "1042 None moa.variant:859 None \n", + "1044 None moa.variant:860 None \n", + "1046 None moa.variant:861 None \n", + "\n", + " category_label applied_study_count var_id var_type \\\n", + "912 ABL1 p.T315I (Missense) 6 None None \n", + "914 ABL1 p.T315A (Missense) 3 None None \n", + "916 ABL1 p.F317L (Missense) 3 None None \n", + "918 ABL1 p.F317V (Missense) 3 None None \n", + "920 ABL1 p.F317I (Missense) 3 None None \n", + "... ... ... ... ... \n", + "1036 PIK3CA p.P539R (Missense) 1 None None \n", + "1039 PIK3CA p.Y1021C (Missense) 1 None None \n", + "1042 IDH1 p.R132G (Missense) 1 None None \n", + "1044 IDH1 p.R132S (Missense) 1 None None \n", + "1046 IDH1 p.R132L (Missense) 1 None None \n", + "\n", + " var_loc_id var_loc_start var_loc_end \n", + "912 None None None \n", + "914 None None None \n", + "916 None None None \n", + "918 None None None \n", + "920 None None None \n", + "... ... ... ... \n", + "1036 None None None \n", + "1039 None None None \n", + "1042 None None None \n", + "1044 None None None \n", + "1046 None None None \n", + "\n", + "[62 rows x 16 columns]" + ] + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['label'].isna()==True] # MATCH (c:CategoricalVariation) WHERE c.id = 'moa.variant:66' RETURN c LIMIT 25 || shows that some Variation nodes can have no label on it\n" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
expression_hgvs_gvar_idlabelcategory_labelcategory_idapplied_study_count
2NoneNoneNM_004333.4:c.1799T>ANoneNone0
\n", + "
" + ], + "text/plain": [ + " expression_hgvs_g var_id label category_label category_id \\\n", + "2 None None NM_004333.4:c.1799T>A None None \n", + "\n", + " applied_study_count \n", + "2 0 " + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tdf = data[data['label'].isna()==False]\n", + "tdf[tdf['label']=='NM_004333.4:c.1799T>A']" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
expression_hgvs_gvar_idlabelcategory_labelcategory_idapplied_study_count
1[NC_000007.13:g.140453136A>T]ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MV600EBRAF V600Ecivic.mpid:1270
\n", + "
" + ], + "text/plain": [ + " expression_hgvs_g var_id \\\n", + "1 [NC_000007.13:g.140453136A>T] ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M \n", + "\n", + " label category_label category_id applied_study_count \n", + "1 V600E BRAF V600E civic.mpid:12 70 " + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tdf[tdf['category_id']=='civic.mpid:12']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Weird Regex Stuff" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "pattern1 = r'^[A-Za-z]\\d+[A-Za-z]{1,3}$' # V600E\n", + "pattern2 = r'^([A-Z]+_\\d+\\.\\d+):(g|c|p)\\.((\\d+(_\\d+)?[A-Z]+>[A-Z]+)|(\\d+|\\d+_\\d+)del(?!ins)|(\\d+|\\d+_\\d+)ins[A-Z]+|(\\d+|\\d+_\\d+)delins[A-Z]+|[A-Z][a-z]{2}\\d+[A-Z][a-z]{2}|[A-Z][a-z]{2}\\d+del|\\d+=)$' # NM_004333.4:c.1799T>A\n", + "pattern3 = r'^\\d+-\\d+-[A-Za-z]-[A-Za-z]$' # 2-209113113-G-C\n", + "pattern4 = r'^c\\.\\d+[A-Z]>[A-Z]$' # c.393T>C\n", + "pattern5 = r'ENST'\n", + "\n", + "# Function to classify based on patterns\n", + "def classify_by_pattern(value):\n", + " if pd.isna(value): # Check for NaN or NA values\n", + " return 'No Match'\n", + " value = str(value) # Convert value to string\n", + " if re.match(pattern1, value):\n", + " return 'Pattern 1'\n", + " elif re.match(pattern2, value):\n", + " return 'Pattern 2'\n", + " elif re.match(pattern3, value):\n", + " return 'Pattern 3'\n", + " elif re.match(pattern4, value):\n", + " return 'Pattern 4'\n", + " elif re.search(pattern5, value): \n", + " return 'Pattern 5'\n", + " else:\n", + " return 'No Match'\n", + "\n", + "# Apply the function to create a new column\n", + "df['label_regex'] = df['label'].apply(classify_by_pattern)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "label_regex\n", + "Pattern 2 471\n", + "Pattern 1 356\n", + "No Match 121\n", + "Pattern 3 74\n", + "Pattern 5 25\n", + "Pattern 4 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 128, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['label_regex'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_gcategory_idcategory_desccategory_labelvar_idvar_typevar_loc_idvar_loc_startvar_loc_endlabel_regex
0ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]moa.variant:144NoneBRAF p.V600E (Missense)ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MAllelega4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi140753335140753336Pattern 1
1ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]civic.mpid:12BRAF V600E has been shown to be recurrent in m...BRAF V600Ega4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MAllelega4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi140753335140753336Pattern 1
4ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...[NP_005219.2:p.Thr790Met]sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-T790MAllele[NC_000007.13:g.55249071C>T]moa.variant:242NoneEGFR p.T790M (Missense)ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjypAllelega4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY5518137755181378Pattern 1
5ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...[NP_005219.2:p.Thr790Met]sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-T790MAllele[NC_000007.13:g.55249071C>T]civic.mpid:34EGFR T790M was one of the very first mutations...EGFR T790Mga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjypAllelega4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY5518137755181378Pattern 1
8ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ[NM_005228.4:c.2573T>G, ENST00000275493.2:c.25...[NP_005219.2:p.Leu858Arg]S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZL858RAllele[NC_000007.13:g.55259515T>G]moa.variant:254NoneEGFR p.L858R (Missense)ga4gh:VA.TAARa2cxRHmOiij9UBwvW-noMDoOq2x9Allelega4gh:SL.ulUNwZvajob7nzyrlpOd6uUWZIYCsoWb5519182155191822Pattern 1
...................................................
895ga4gh:VA.RynDzpGjpLKfmAOrN0yrjRyAeIPqV52Q[NM_005631.4:c.1234C>T, ENST00000249373.3:c.12...[NP_005622.1:p.Leu412Phe]RynDzpGjpLKfmAOrN0yrjRyAeIPqV52QL412FAllele[NC_000007.13:g.128846398C>T]civic.mpid:1478NoneSMO L412Fga4gh:VA.6kcO_sqNNSHOo6fR6cZJ6_o992202o97Allelega4gh:SL.xuaPDRPulmJSAqoCc1WSSSBhcVvY0fhY129206556129206557Pattern 1
898ga4gh:VA.rIlkyhIg01Me8yT2_Q2woVzTaTcK-Dz1NoneNonerIlkyhIg01Me8yT2_Q2woVzTaTcK-Dz1C284YAlleleNonecivic.mpid:1555NonePOLD1 C284YNoneNoneNoneNoneNonePattern 1
899ga4gh:VA.tAvB46rxfRKnXF1pWq1iRJAzyu-pNEz6NoneNonetAvB46rxfRKnXF1pWq1iRJAzyu-pNEz6E374KAlleleNonecivic.mpid:1556NonePOLD1 E374KNoneNoneNoneNoneNonePattern 1
900ga4gh:VA.ie88C_NJ9fuZjOO1ZgGVoGb6ZU1yYuObNoneNoneie88C_NJ9fuZjOO1ZgGVoGb6ZU1yYuObQ179XAlleleNonecivic.mpid:1562NoneNRAS Q179XNoneNoneNoneNoneNonePattern 1
904ga4gh:VA.QT4fR_w5vpZVBTlVFS434T0Hpzrq010bNoneNoneQT4fR_w5vpZVBTlVFS434T0Hpzrq010bS310FAlleleNonecivic.mpid:4470NoneERBB2 S310FNoneNoneNoneNoneNonePattern 1
\n", + "

356 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "4 ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "5 ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "8 ga4gh:VA.S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ \n", + ".. ... \n", + "895 ga4gh:VA.RynDzpGjpLKfmAOrN0yrjRyAeIPqV52Q \n", + "898 ga4gh:VA.rIlkyhIg01Me8yT2_Q2woVzTaTcK-Dz1 \n", + "899 ga4gh:VA.tAvB46rxfRKnXF1pWq1iRJAzyu-pNEz6 \n", + "900 ga4gh:VA.ie88C_NJ9fuZjOO1ZgGVoGb6ZU1yYuOb \n", + "904 ga4gh:VA.QT4fR_w5vpZVBTlVFS434T0Hpzrq010b \n", + "\n", + " expression_hgvs_c \\\n", + "0 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "1 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "4 [ENST00000275493.2:c.2369C>T, NM_005228.4:c.23... \n", + "5 [ENST00000275493.2:c.2369C>T, NM_005228.4:c.23... \n", + "8 [NM_005228.4:c.2573T>G, ENST00000275493.2:c.25... \n", + ".. ... \n", + "895 [NM_005631.4:c.1234C>T, ENST00000249373.3:c.12... \n", + "898 None \n", + "899 None \n", + "900 None \n", + "904 None \n", + "\n", + " expression_hgvs_p digest label \\\n", + "0 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L V600E \n", + "1 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L V600E \n", + "4 [NP_005219.2:p.Thr790Met] sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- T790M \n", + "5 [NP_005219.2:p.Thr790Met] sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- T790M \n", + "8 [NP_005219.2:p.Leu858Arg] S41CcMJT2bcd8R4-qXZWH1PoHWNtG2PZ L858R \n", + ".. ... ... ... \n", + "895 [NP_005622.1:p.Leu412Phe] RynDzpGjpLKfmAOrN0yrjRyAeIPqV52Q L412F \n", + "898 None rIlkyhIg01Me8yT2_Q2woVzTaTcK-Dz1 C284Y \n", + "899 None tAvB46rxfRKnXF1pWq1iRJAzyu-pNEz6 E374K \n", + "900 None ie88C_NJ9fuZjOO1ZgGVoGb6ZU1yYuOb Q179X \n", + "904 None QT4fR_w5vpZVBTlVFS434T0Hpzrq010b S310F \n", + "\n", + " type expression_hgvs_g category_id \\\n", + "0 Allele [NC_000007.13:g.140453136A>T] moa.variant:144 \n", + "1 Allele [NC_000007.13:g.140453136A>T] civic.mpid:12 \n", + "4 Allele [NC_000007.13:g.55249071C>T] moa.variant:242 \n", + "5 Allele [NC_000007.13:g.55249071C>T] civic.mpid:34 \n", + "8 Allele [NC_000007.13:g.55259515T>G] moa.variant:254 \n", + ".. ... ... ... \n", + "895 Allele [NC_000007.13:g.128846398C>T] civic.mpid:1478 \n", + "898 Allele None civic.mpid:1555 \n", + "899 Allele None civic.mpid:1556 \n", + "900 Allele None civic.mpid:1562 \n", + "904 Allele None civic.mpid:4470 \n", + "\n", + " category_desc \\\n", + "0 None \n", + "1 BRAF V600E has been shown to be recurrent in m... \n", + "4 None \n", + "5 EGFR T790M was one of the very first mutations... \n", + "8 None \n", + ".. ... \n", + "895 None \n", + "898 None \n", + "899 None \n", + "900 None \n", + "904 None \n", + "\n", + " category_label var_id \\\n", + "0 BRAF p.V600E (Missense) ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M \n", + "1 BRAF V600E ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M \n", + "4 EGFR p.T790M (Missense) ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp \n", + "5 EGFR T790M ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp \n", + "8 EGFR p.L858R (Missense) ga4gh:VA.TAARa2cxRHmOiij9UBwvW-noMDoOq2x9 \n", + ".. ... ... \n", + "895 SMO L412F ga4gh:VA.6kcO_sqNNSHOo6fR6cZJ6_o992202o97 \n", + "898 POLD1 C284Y None \n", + "899 POLD1 E374K None \n", + "900 NRAS Q179X None \n", + "904 ERBB2 S310F None \n", + "\n", + " var_type var_loc_id var_loc_start \\\n", + "0 Allele ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi 140753335 \n", + "1 Allele ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi 140753335 \n", + "4 Allele ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY 55181377 \n", + "5 Allele ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY 55181377 \n", + "8 Allele ga4gh:SL.ulUNwZvajob7nzyrlpOd6uUWZIYCsoWb 55191821 \n", + ".. ... ... ... \n", + "895 Allele ga4gh:SL.xuaPDRPulmJSAqoCc1WSSSBhcVvY0fhY 129206556 \n", + "898 None None None \n", + "899 None None None \n", + "900 None None None \n", + "904 None None None \n", + "\n", + " var_loc_end label_regex \n", + "0 140753336 Pattern 1 \n", + "1 140753336 Pattern 1 \n", + "4 55181378 Pattern 1 \n", + "5 55181378 Pattern 1 \n", + "8 55191822 Pattern 1 \n", + ".. ... ... \n", + "895 129206557 Pattern 1 \n", + "898 None Pattern 1 \n", + "899 None Pattern 1 \n", + "900 None Pattern 1 \n", + "904 None Pattern 1 \n", + "\n", + "[356 rows x 16 columns]" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['label_regex']=='Pattern 1']" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2 NM_004333.4:c.1799T>A\n", + "3 NC_000007.13:g.140453136A>T\n", + "6 NM_005228.4:c.2369C>T\n", + "7 NC_000007.13:g.55249071C>T\n", + "10 NC_000007.13:g.55259515T>G\n", + " ... \n", + "886 NM_005228.3:c.2590G>A\n", + "893 NC_000007.13:g.55249012_55249013insGGCACA\n", + "894 NM_005228.3:c.2310_2311insGGCACA\n", + "896 NM_005631.4:c.1234C>T\n", + "897 NC_000007.13:g.128846398C>T\n", + "Name: label, Length: 471, dtype: object" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['label_regex']=='Pattern 2']['label']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_gcategory_idcategory_desccategory_labelvar_idvar_typevar_loc_idvar_loc_startvar_loc_endlabel_regex
913ga4gh:VA.HUJOQCml0LngKmUf5IJIYQk9CfKmagbfNoneNoneHUJOQCml0LngKmUf5IJIYQk9CfKmagbf9-133748283-C-TAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
915ga4gh:VA.R7udthNB0ErCSOrSgHNUKB1uCLXE5BZ5NoneNoneR7udthNB0ErCSOrSgHNUKB1uCLXE5BZ59-133747582-A-GAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
917ga4gh:VA.UTV6lwIVIZgs38dBRg1TU7HYgG5cObP0NoneNoneUTV6lwIVIZgs38dBRg1TU7HYgG5cObP09-133748290-C-GAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
919ga4gh:VA.gVx_jtWuo12r_n-3PxXKH5eV3L8MDt0yNoneNonegVx_jtWuo12r_n-3PxXKH5eV3L8MDt0y9-133748288-T-GAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
921ga4gh:VA.a8TcUEmtsJGEaOHdkcqe-TGj2z19iLhONoneNonea8TcUEmtsJGEaOHdkcqe-TGj2z19iLhO9-133748288-T-AAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
...................................................
1040ga4gh:VA.TCoGnTPgu4-nkn9VkajNGKLRSTN7ei6sNoneNoneTCoGnTPgu4-nkn9VkajNGKLRSTN7ei6s3-178952007-A-GAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
1041ga4gh:VA.Fw5XPRvCcynVkUKffAWKjwPTuF7R1FO3NoneNoneFw5XPRvCcynVkUKffAWKjwPTuF7R1FO310-43614996-G-AAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
1043ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqDNoneNoneuaKTab81sgTH6v26fsDJkGcKidJa_GqD2-209113113-G-CAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
1045ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjhNoneNoneVbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh2-209113113-G-TAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
1047ga4gh:VA.5lDLi65BeuLyMcfhmTprIk1PmmE3mF63NoneNone5lDLi65BeuLyMcfhmTprIk1PmmE3mF632-209113112-C-AAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
\n", + "

74 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " id expression_hgvs_c \\\n", + "913 ga4gh:VA.HUJOQCml0LngKmUf5IJIYQk9CfKmagbf None \n", + "915 ga4gh:VA.R7udthNB0ErCSOrSgHNUKB1uCLXE5BZ5 None \n", + "917 ga4gh:VA.UTV6lwIVIZgs38dBRg1TU7HYgG5cObP0 None \n", + "919 ga4gh:VA.gVx_jtWuo12r_n-3PxXKH5eV3L8MDt0y None \n", + "921 ga4gh:VA.a8TcUEmtsJGEaOHdkcqe-TGj2z19iLhO None \n", + "... ... ... \n", + "1040 ga4gh:VA.TCoGnTPgu4-nkn9VkajNGKLRSTN7ei6s None \n", + "1041 ga4gh:VA.Fw5XPRvCcynVkUKffAWKjwPTuF7R1FO3 None \n", + "1043 ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD None \n", + "1045 ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh None \n", + "1047 ga4gh:VA.5lDLi65BeuLyMcfhmTprIk1PmmE3mF63 None \n", + "\n", + " expression_hgvs_p digest label \\\n", + "913 None HUJOQCml0LngKmUf5IJIYQk9CfKmagbf 9-133748283-C-T \n", + "915 None R7udthNB0ErCSOrSgHNUKB1uCLXE5BZ5 9-133747582-A-G \n", + "917 None UTV6lwIVIZgs38dBRg1TU7HYgG5cObP0 9-133748290-C-G \n", + "919 None gVx_jtWuo12r_n-3PxXKH5eV3L8MDt0y 9-133748288-T-G \n", + "921 None a8TcUEmtsJGEaOHdkcqe-TGj2z19iLhO 9-133748288-T-A \n", + "... ... ... ... \n", + "1040 None TCoGnTPgu4-nkn9VkajNGKLRSTN7ei6s 3-178952007-A-G \n", + "1041 None Fw5XPRvCcynVkUKffAWKjwPTuF7R1FO3 10-43614996-G-A \n", + "1043 None uaKTab81sgTH6v26fsDJkGcKidJa_GqD 2-209113113-G-C \n", + "1045 None VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh 2-209113113-G-T \n", + "1047 None 5lDLi65BeuLyMcfhmTprIk1PmmE3mF63 2-209113112-C-A \n", + "\n", + " type expression_hgvs_g category_id category_desc category_label \\\n", + "913 Allele None None None None \n", + "915 Allele None None None None \n", + "917 Allele None None None None \n", + "919 Allele None None None None \n", + "921 Allele None None None None \n", + "... ... ... ... ... ... \n", + "1040 Allele None None None None \n", + "1041 Allele None None None None \n", + "1043 Allele None None None None \n", + "1045 Allele None None None None \n", + "1047 Allele None None None None \n", + "\n", + " var_id var_type var_loc_id var_loc_start var_loc_end label_regex \n", + "913 None None None None None Pattern 3 \n", + "915 None None None None None Pattern 3 \n", + "917 None None None None None Pattern 3 \n", + "919 None None None None None Pattern 3 \n", + "921 None None None None None Pattern 3 \n", + "... ... ... ... ... ... ... \n", + "1040 None None None None None Pattern 3 \n", + "1041 None None None None None Pattern 3 \n", + "1043 None None None None None Pattern 3 \n", + "1045 None None None None None Pattern 3 \n", + "1047 None None None None None Pattern 3 \n", + "\n", + "[74 rows x 16 columns]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['label_regex']=='Pattern 3']\n" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "19 NP_000507.1:p.Ile131=\n", + "174 E746_A750del\n", + "175 R233*\n", + "222 W557_K558del\n", + "257 Q1178*\n", + "293 Q503*\n", + "300 R200W (c.598C>T)\n", + "303 Q56_V60del\n", + "317 D770_N771insGL\n", + "321 D770delinsGY\n", + "395 S2289*\n", + "414 V769_D770insASV\n", + "415 V769_D770insASV\n", + "419 M774DELINSWLV\n", + "447 L747_P753delinsS\n", + "472 D770_N771insG\n", + "475 H773_V774insNPH\n", + "476 L747_S752delinsQ\n", + "479 P772_H773insYNP\n", + "480 P772_V774insPHV\n", + "488 N486_P490del\n", + "490 K3326*\n", + "492 L938*\n", + "496 A502_Y503insAY\n", + "506 P551_E554delPMYE\n", + "Name: label, dtype: object" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['label_regex']=='No Match']['label'][0:25]" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idexpression_hgvs_cexpression_hgvs_pdigestlabeltypeexpression_hgvs_gcategory_idcategory_desccategory_labelvar_idvar_typevar_loc_idvar_loc_startvar_loc_endlabel_regex
0ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]moa.variant:144NoneBRAF p.V600E (Missense)ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MAllelega4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi140753335140753336Pattern 1
1ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L[NM_004333.4:c.1799T>A, ENST00000288602.6:c.17...[NP_004324.2:p.Val600Glu]j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0LV600EAllele[NC_000007.13:g.140453136A>T]civic.mpid:12BRAF V600E has been shown to be recurrent in m...BRAF V600Ega4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_MAllelega4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi140753335140753336Pattern 1
2ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNoneNoneW6xsV-aFm9yT2Bic5cFAV2j0rll6KK5RNM_004333.4:c.1799T>AAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 2
3ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAeNoneNoneOtc5ovrw906Ack087o1fhegB4jDRqCAeNC_000007.13:g.140453136A>TAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 2
4ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-[ENST00000275493.2:c.2369C>T, NM_005228.4:c.23...[NP_005219.2:p.Thr790Met]sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V-T790MAllele[NC_000007.13:g.55249071C>T]moa.variant:242NoneEGFR p.T790M (Missense)ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjypAllelega4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY5518137755181378Pattern 1
...................................................
1043ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqDNoneNoneuaKTab81sgTH6v26fsDJkGcKidJa_GqD2-209113113-G-CAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
1044ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9HNoneNoney-ioX4e_ySwP_LlplLNp0cz04a8BBr9HNoneAlleleNonemoa.variant:860NoneIDH1 p.R132S (Missense)NoneNoneNoneNoneNoneNo Match
1045ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjhNoneNoneVbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh2-209113113-G-TAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
1046ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9NNoneNone2zh_S3hn7AWJd00rhirLhhDW8VcVBo9NNoneAlleleNonemoa.variant:861NoneIDH1 p.R132L (Missense)NoneNoneNoneNoneNoneNo Match
1047ga4gh:VA.5lDLi65BeuLyMcfhmTprIk1PmmE3mF63NoneNone5lDLi65BeuLyMcfhmTprIk1PmmE3mF632-209113112-C-AAlleleNoneNoneNoneNoneNoneNoneNoneNoneNonePattern 3
\n", + "

1048 rows × 16 columns

\n", + "
" + ], + "text/plain": [ + " id \\\n", + "0 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 ga4gh:VA.j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "2 ga4gh:VA.W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "3 ga4gh:VA.Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "4 ga4gh:VA.sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "... ... \n", + "1043 ga4gh:VA.uaKTab81sgTH6v26fsDJkGcKidJa_GqD \n", + "1044 ga4gh:VA.y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H \n", + "1045 ga4gh:VA.VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh \n", + "1046 ga4gh:VA.2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N \n", + "1047 ga4gh:VA.5lDLi65BeuLyMcfhmTprIk1PmmE3mF63 \n", + "\n", + " expression_hgvs_c \\\n", + "0 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "1 [NM_004333.4:c.1799T>A, ENST00000288602.6:c.17... \n", + "2 None \n", + "3 None \n", + "4 [ENST00000275493.2:c.2369C>T, NM_005228.4:c.23... \n", + "... ... \n", + "1043 None \n", + "1044 None \n", + "1045 None \n", + "1046 None \n", + "1047 None \n", + "\n", + " expression_hgvs_p digest \\\n", + "0 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "1 [NP_004324.2:p.Val600Glu] j4XnsLZcdzDIYa5pvvXM7t1wn9OITr0L \n", + "2 None W6xsV-aFm9yT2Bic5cFAV2j0rll6KK5R \n", + "3 None Otc5ovrw906Ack087o1fhegB4jDRqCAe \n", + "4 [NP_005219.2:p.Thr790Met] sMA9h8fzDi0RvweMlxtD0_Oi8B-JZ1V- \n", + "... ... ... \n", + "1043 None uaKTab81sgTH6v26fsDJkGcKidJa_GqD \n", + "1044 None y-ioX4e_ySwP_LlplLNp0cz04a8BBr9H \n", + "1045 None VbDmOJp91MyXUGNKuKtJfm5WvwgjnLjh \n", + "1046 None 2zh_S3hn7AWJd00rhirLhhDW8VcVBo9N \n", + "1047 None 5lDLi65BeuLyMcfhmTprIk1PmmE3mF63 \n", + "\n", + " label type expression_hgvs_g \\\n", + "0 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "1 V600E Allele [NC_000007.13:g.140453136A>T] \n", + "2 NM_004333.4:c.1799T>A Allele None \n", + "3 NC_000007.13:g.140453136A>T Allele None \n", + "4 T790M Allele [NC_000007.13:g.55249071C>T] \n", + "... ... ... ... \n", + "1043 2-209113113-G-C Allele None \n", + "1044 None Allele None \n", + "1045 2-209113113-G-T Allele None \n", + "1046 None Allele None \n", + "1047 2-209113112-C-A Allele None \n", + "\n", + " category_id category_desc \\\n", + "0 moa.variant:144 None \n", + "1 civic.mpid:12 BRAF V600E has been shown to be recurrent in m... \n", + "2 None None \n", + "3 None None \n", + "4 moa.variant:242 None \n", + "... ... ... \n", + "1043 None None \n", + "1044 moa.variant:860 None \n", + "1045 None None \n", + "1046 moa.variant:861 None \n", + "1047 None None \n", + "\n", + " category_label var_id \\\n", + "0 BRAF p.V600E (Missense) ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M \n", + "1 BRAF V600E ga4gh:VA.LX3ooHBAiZdKY4RfTXcliUmkj48mnD_M \n", + "2 None None \n", + "3 None None \n", + "4 EGFR p.T790M (Missense) ga4gh:VA.OvEfBRaS34JkfM0_ZHJVDQEjqtwzyjyp \n", + "... ... ... \n", + "1043 None None \n", + "1044 IDH1 p.R132S (Missense) None \n", + "1045 None None \n", + "1046 IDH1 p.R132L (Missense) None \n", + "1047 None None \n", + "\n", + " var_type var_loc_id var_loc_start \\\n", + "0 Allele ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi 140753335 \n", + "1 Allele ga4gh:SL.XutGzMvqbzN-vnxmPt2MJf7ehxmB0opi 140753335 \n", + "2 None None None \n", + "3 None None None \n", + "4 Allele ga4gh:SL.ZCgOjF-_T0EOBXGc-6yICYui-jgFzJfY 55181377 \n", + "... ... ... ... \n", + "1043 None None None \n", + "1044 None None None \n", + "1045 None None None \n", + "1046 None None None \n", + "1047 None None None \n", + "\n", + " var_loc_end label_regex \n", + "0 140753336 Pattern 1 \n", + "1 140753336 Pattern 1 \n", + "2 None Pattern 2 \n", + "3 None Pattern 2 \n", + "4 55181378 Pattern 1 \n", + "... ... ... \n", + "1043 None Pattern 3 \n", + "1044 None No Match \n", + "1045 None Pattern 3 \n", + "1046 None No Match \n", + "1047 None Pattern 3 \n", + "\n", + "[1048 rows x 16 columns]" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Disease" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Therapy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Gene" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}