From 66a884151feccdfc82a9652ad8a2f83cfb146e9e Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Wed, 11 Dec 2024 16:48:07 +0100 Subject: [PATCH 01/31] started creating parse function for GraphQL query from schema metadata --- tools/pyclient/dev/graphql_data.py | 69 ++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tools/pyclient/dev/graphql_data.py diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py new file mode 100644 index 0000000000..468149d743 --- /dev/null +++ b/tools/pyclient/dev/graphql_data.py @@ -0,0 +1,69 @@ +""" +Development script for fetching data from a schema using the GraphQL API. +""" + +from tools.pyclient.src.molgenis_emx2_pyclient import Client +from tools.pyclient.src.molgenis_emx2_pyclient.metadata import Schema, Table, Column + +URL = "https://emx2.dev.molgenis.org" +SCHEMA = "testCatalogue" + +HEADING = "HEADING" +LOGO = "LOGO" +STRING = "STRING" +TEXT = "TEXT" +INT = "INT" +FLOAT = "FLOAT" +BOOL = "BOOL" +HYPERLINK = "HYPERLINK" +DATE = "DATE" +DATETIME = "DATETIME" + +NONREFS = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME] + + + +def get_data() -> list: + """Fetches data.""" + + with Client(url=URL, schema=SCHEMA) as client: + + schema_data: Schema = client.get_schema_metadata() + table_metadata = schema_data.get_table(by='id', value='Resources') + table_dict = table_metadata.to_dict() + + # resource_data = client.get('Resources', as_df=False) + api_url = f"{URL}/{SCHEMA}/graphql" + query = {"Resources": ["name", {"type": ["name"]}]} + parsed_query = parse_query(table_dict) + response = client.session.post(url=api_url, + json={"query": parsed_query}) + + resource_data = response.json().get('data') + + + return resource_data + +def parse_query(raw_query: dict) -> str: + accolade_count = 0 + table_id = raw_query.get('id') + columns = raw_query.get('columns') + meta_columns = ["id", "columnType", "refSchemaId", "refTableId"] + columns = [{key: val for (key, val) in 
col.items() if key in meta_columns} for col in columns] + + query = f"{{\n {table_id} {{\n" + for col in columns: + if col['columnType'] in [HEADING, LOGO]: + continue + if col['columnType'] in NONREFS: + query += f" {col['id']}\n" + query += " }\n" + query += "}" + + return query + + + +if __name__ == '__main__': + data = get_data() + print(data) From 337b378af9c339f6259d84c3059be4aaa2dd2fc4 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 17 Dec 2024 17:12:29 +0100 Subject: [PATCH 02/31] * moved 'parse_query' to Client class * added column types to new constants.py * started creating 'get_pkeys' for metadata.py --- tools/pyclient/dev/graphql_data.py | 24 +++----- .../src/molgenis_emx2_pyclient/client.py | 61 ++++++++++++++++--- .../src/molgenis_emx2_pyclient/constants.py | 12 ++++ .../src/molgenis_emx2_pyclient/metadata.py | 18 ++++++ 4 files changed, 89 insertions(+), 26 deletions(-) create mode 100644 tools/pyclient/src/molgenis_emx2_pyclient/constants.py diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index 468149d743..505260afd6 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -6,7 +6,7 @@ from tools.pyclient.src.molgenis_emx2_pyclient.metadata import Schema, Table, Column URL = "https://emx2.dev.molgenis.org" -SCHEMA = "testCatalogue" +SCHEMA = "catalogue" HEADING = "HEADING" LOGO = "LOGO" @@ -27,24 +27,12 @@ def get_data() -> list: """Fetches data.""" with Client(url=URL, schema=SCHEMA) as client: + resources = client.get(table='Resources') - schema_data: Schema = client.get_schema_metadata() - table_metadata = schema_data.get_table(by='id', value='Resources') - table_dict = table_metadata.to_dict() - # resource_data = client.get('Resources', as_df=False) - api_url = f"{URL}/{SCHEMA}/graphql" - query = {"Resources": ["name", {"type": ["name"]}]} - parsed_query = parse_query(table_dict) - response = client.session.post(url=api_url, - json={"query": parsed_query}) + return 
resources - resource_data = response.json().get('data') - - - return resource_data - -def parse_query(raw_query: dict) -> str: +def parse_query(raw_query: dict, metadata: Schema) -> str: accolade_count = 0 table_id = raw_query.get('id') columns = raw_query.get('columns') @@ -57,6 +45,10 @@ def parse_query(raw_query: dict) -> str: continue if col['columnType'] in NONREFS: query += f" {col['id']}\n" + if col['columnType'].startswith('ONTOLOGY'): + query += f" {col['id']} {{name}}\n" + if col['columnType'].startswith('REF'): + pkeys = metadata.get_pkeys(table_id) query += " }\n" query += "}" diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index f92b5bb1c4..899d682299 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -1,12 +1,11 @@ import csv import json import logging -import sys import pathlib +import sys import time from functools import cache from io import BytesIO -from typing import Literal import pandas as pd import requests @@ -14,11 +13,12 @@ from . import graphql_queries as queries from . 
import utils +from .constants import HEADING, LOGO, NONREFS from .exceptions import (NoSuchSchemaException, ServiceUnavailableError, SigninError, ServerNotFoundError, PyclientException, NoSuchTableException, NoContextManagerException, GraphQLException, InvalidTokenException, PermissionDeniedException, TokenSigninException, NonExistentTemplateException) -from .metadata import Schema +from .metadata import Schema, Table logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) log = logging.getLogger("Molgenis EMX2 Pyclient") @@ -403,17 +403,25 @@ def get(self, table: str, query_filter: str = None, schema: str = None, as_df: b table_id = schema_metadata.get_table(by='name', value=table).id filter_part = self._prepare_filter(query_filter, table, schema) - query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" - response = self.session.get(url=query_url) - self._validate_graphql_response(response=response, - fallback_error_message=f"Failed to retrieve data from {current_schema}::" - f"{table!r}.\nStatus code: {response.status_code}.") + if as_df: + query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" + response = self.session.get(url=query_url) - response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False) + self._validate_graphql_response(response=response, + fallback_error_message=f"Failed to retrieve data from {current_schema}::" + f"{table!r}.\nStatus code: {response.status_code}.") + + response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False) + else: + query_url = f"{self.url}/{current_schema}/graphql" + query = self._parse_get_table_query(table_id) + response = self.session.post(url=query_url, + json={"query": query}) + response_data = response.json().get('data') if not as_df: - return response_data.to_dict('records') + return response_data return response_data async def export(self, schema: str = None, table: str = None, @@ -1040,3 +1048,36 @@ def _validate_url(self): except 
requests.exceptions.MissingSchema: raise ServerNotFoundError(f"Invalid URL {self.url!r}. " f"Perhaps you meant 'https://{self.url}'?") + + def _parse_get_table_query(self, table_id) -> str: + """Gathers a table's metadata and parses it to a GraphQL query + for querying the table's contents. + """ + schema_metadata: Schema = self.get_schema_metadata() + table_metadata: Table = schema_metadata.get_table('id', table_id) + + meta_columns = ["id", "columnType", "refSchemaId", "refTableId"] + + query = f"{{\n {table_id} {{\n" + for col in table_metadata.columns: + if col.get('columnType') in [HEADING, LOGO]: + continue + if col.get('columnType') in NONREFS: + query += f" {col.get('id')}\n" + if col.get('columnType').startswith('ONTOLOGY'): + query += f" {col.get('id')} {{name}}\n" + if col.get('columnType').startswith('REF'): + query += f" {col.get('id')} {{" + pkeys = schema_metadata.get_pkeys(col.get('refTableId')) + for pk in pkeys: + if isinstance(pk, str): + query += f"{pk}}}" + if isinstance(pk, dict): + for (key, value) in pk.items(): + query += f" {key} {{{value}}} " + query += "\n" + query += " }\n" + query += "}" + + return query + diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py new file mode 100644 index 0000000000..58fa58d0f0 --- /dev/null +++ b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py @@ -0,0 +1,12 @@ +HEADING = "HEADING" +LOGO = "LOGO" +STRING = "STRING" +TEXT = "TEXT" +INT = "INT" +FLOAT = "FLOAT" +BOOL = "BOOL" +HYPERLINK = "HYPERLINK" +DATE = "DATE" +DATETIME = "DATETIME" + +NONREFS = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME] diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py b/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py index 5d0bca116a..c731f86505 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py @@ -4,6 +4,7 @@ from itertools import starmap from typing 
import Literal +from .constants import STRING from .exceptions import NoSuchColumnException, NoSuchTableException @@ -160,6 +161,23 @@ def get(self, attr: str, default: object = None): return self.__getattribute__(attr) return default + def get_pkeys(self, table_id: str) -> list: + """Returns the primary keys of a table.""" + table_meta = self.get_table('id', table_id) + primary_columns = table_meta.get_columns(by='key', value=1) + + primary_keys = [] + for pc in primary_columns: + if pc.get('columnType').startswith('ONT'): + primary_keys.append({pc.id: 'name'}) + elif pc.get('columnType').startswith('REF'): + primary_keys.append({pc.id: self.get_pkeys(pc.get('refTableId'))}) + else: + primary_keys.append(pc.id) + + return primary_keys + + def get_table(self, by: Literal['id', 'name'], value: str) -> Table: """Gets the unique table by either id or name value. Raises NoSuchTableException if the table could not be retrieved from the schema. From 641bfbe10112afb75435c8582dc0a5cae4772bd2 Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Wed, 18 Dec 2024 14:08:58 +0100 Subject: [PATCH 03/31] * created function 'parse_nested_pkeys' * finished '_parse_get_table_query' method --- tools/pyclient/dev/graphql_data.py | 40 +------------------ .../src/molgenis_emx2_pyclient/client.py | 24 +++++------ .../src/molgenis_emx2_pyclient/constants.py | 17 +++++++- .../src/molgenis_emx2_pyclient/utils.py | 25 +++++++++++- 4 files changed, 51 insertions(+), 55 deletions(-) diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index 505260afd6..06852cd3c0 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -1,27 +1,13 @@ """ Development script for fetching data from a schema using the GraphQL API. 
""" +from pprint import pprint from tools.pyclient.src.molgenis_emx2_pyclient import Client -from tools.pyclient.src.molgenis_emx2_pyclient.metadata import Schema, Table, Column URL = "https://emx2.dev.molgenis.org" SCHEMA = "catalogue" -HEADING = "HEADING" -LOGO = "LOGO" -STRING = "STRING" -TEXT = "TEXT" -INT = "INT" -FLOAT = "FLOAT" -BOOL = "BOOL" -HYPERLINK = "HYPERLINK" -DATE = "DATE" -DATETIME = "DATETIME" - -NONREFS = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME] - - def get_data() -> list: """Fetches data.""" @@ -32,30 +18,8 @@ def get_data() -> list: return resources -def parse_query(raw_query: dict, metadata: Schema) -> str: - accolade_count = 0 - table_id = raw_query.get('id') - columns = raw_query.get('columns') - meta_columns = ["id", "columnType", "refSchemaId", "refTableId"] - columns = [{key: val for (key, val) in col.items() if key in meta_columns} for col in columns] - - query = f"{{\n {table_id} {{\n" - for col in columns: - if col['columnType'] in [HEADING, LOGO]: - continue - if col['columnType'] in NONREFS: - query += f" {col['id']}\n" - if col['columnType'].startswith('ONTOLOGY'): - query += f" {col['id']} {{name}}\n" - if col['columnType'].startswith('REF'): - pkeys = metadata.get_pkeys(table_id) - query += " }\n" - query += "}" - - return query - if __name__ == '__main__': data = get_data() - print(data) + pprint(data) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 899d682299..312e510af4 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -19,6 +19,7 @@ NoContextManagerException, GraphQLException, InvalidTokenException, PermissionDeniedException, TokenSigninException, NonExistentTemplateException) from .metadata import Schema, Table +from .utils import parse_nested_pkeys logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) log = logging.getLogger("Molgenis EMX2 Pyclient") @@ -418,10 
+419,8 @@ def get(self, table: str, query_filter: str = None, schema: str = None, as_df: b query = self._parse_get_table_query(table_id) response = self.session.post(url=query_url, json={"query": query}) - response_data = response.json().get('data') + response_data = response.json().get('data').get(table_id) - if not as_df: - return response_data return response_data async def export(self, schema: str = None, table: str = None, @@ -1056,26 +1055,21 @@ def _parse_get_table_query(self, table_id) -> str: schema_metadata: Schema = self.get_schema_metadata() table_metadata: Table = schema_metadata.get_table('id', table_id) - meta_columns = ["id", "columnType", "refSchemaId", "refTableId"] - query = f"{{\n {table_id} {{\n" for col in table_metadata.columns: if col.get('columnType') in [HEADING, LOGO]: continue - if col.get('columnType') in NONREFS: + elif col.get('columnType') in NONREFS: query += f" {col.get('id')}\n" - if col.get('columnType').startswith('ONTOLOGY'): + elif col.get('columnType').startswith('ONTOLOGY'): query += f" {col.get('id')} {{name}}\n" - if col.get('columnType').startswith('REF'): + elif col.get('columnType').startswith('REF'): query += f" {col.get('id')} {{" pkeys = schema_metadata.get_pkeys(col.get('refTableId')) - for pk in pkeys: - if isinstance(pk, str): - query += f"{pk}}}" - if isinstance(pk, dict): - for (key, value) in pk.items(): - query += f" {key} {{{value}}} " - query += "\n" + query += parse_nested_pkeys(pkeys) + query += "}\n" + else: + log.warning(f"Caught column type {col.get('columnType')!r}.") query += " }\n" query += "}" diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py index 58fa58d0f0..5cc380873f 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py @@ -1,3 +1,8 @@ +""" +Constant column type metadata. 
+""" + +_ARRAY = "_ARRAY" HEADING = "HEADING" LOGO = "LOGO" STRING = "STRING" @@ -9,4 +14,14 @@ DATE = "DATE" DATETIME = "DATETIME" -NONREFS = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME] +STRING_ARRAY = STRING + _ARRAY +TEXT_ARRAY = TEXT + _ARRAY +INT_ARRAY = INT + _ARRAY +FLOAT_ARRAY = FLOAT + _ARRAY +BOOL_ARRAY = BOOL + _ARRAY +DATE_ARRAY = DATE + _ARRAY +DATETIME_ARRAY = DATETIME + _ARRAY +HYPERLINK_ARRAY = HYPERLINK + _ARRAY + +NONREFS = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME, HYPERLINK, + STRING_ARRAY, TEXT_ARRAY, INT_ARRAY, FLOAT_ARRAY, BOOL_ARRAY, DATE_ARRAY, DATETIME_ARRAY, HYPERLINK_ARRAY] diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 75077ef7bf..2812f1d3ce 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -1,8 +1,9 @@ """ Utility functions for the Molgenis EMX2 Pyclient package """ +import logging + - def read_file(file_path: str) -> str: """Reads and imports data from a file. @@ -15,3 +16,25 @@ def read_file(file_path: str) -> str: data = stream.read() stream.close() return data + +def parse_nested_pkeys(pkeys: list) -> str: + """Converts a list of primary keys and nested primary keys to a string + suitable for inclusion in a GraphQL query. 
+ """ + converted_pkeys = [] + for pk in pkeys: + if isinstance(pk, str): + converted_pkeys.append(pk) + elif isinstance(pk, dict): + for nested_key, nested_values in pk.items(): + converted_pkeys.append(nested_key) + converted_pkeys.append("{") + if isinstance(nested_values, str): + converted_pkeys.append(nested_values) + else: + converted_pkeys.append(parse_nested_pkeys(nested_values).strip()) + converted_pkeys.append("}") + else: + logging.warning(f"Unexpected data type encountered: {type(pk)!r}.") + + return " ".join(converted_pkeys) \ No newline at end of file From feac320c6d7d535aba8ea386a9091fa27ece5871 Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Wed, 18 Dec 2024 15:29:03 +0100 Subject: [PATCH 04/31] * implemented 'columns' filter in `get` --- tools/pyclient/dev/graphql_data.py | 7 +++-- .../src/molgenis_emx2_pyclient/client.py | 27 ++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index 06852cd3c0..9562f574ad 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -13,7 +13,7 @@ def get_data() -> list: """Fetches data.""" with Client(url=URL, schema=SCHEMA) as client: - resources = client.get(table='Resources') + resources = client.get(table='Resources', columns=['name', 'external identifiers'], as_df=True) return resources @@ -22,4 +22,7 @@ def get_data() -> list: if __name__ == '__main__': data = get_data() - pprint(data) + if isinstance(data, list): + pprint(data) + else: + print(data) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 312e510af4..d34992acde 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -9,6 +9,7 @@ import pandas as pd import requests +from molgenis_emx2_pyclient.exceptions import NoSuchColumnException from requests import Response from . 
import graphql_queries as queries @@ -373,7 +374,12 @@ def delete_records(self, table: str, schema: str = None, file: str = None, data: errors = '\n'.join([err['message'] for err in response.json().get('errors')]) log.error("Failed to delete data from %s::%s\n%s.", current_schema, table, errors) - def get(self, table: str, query_filter: str = None, schema: str = None, as_df: bool = False) -> list | pd.DataFrame: + def get(self, + table: str, + columns: list = None, + query_filter: str = None, + schema: str = None, + as_df: bool = False) -> list | pd.DataFrame: """Retrieves data from a schema and returns as a list of dictionaries or as a pandas DataFrame (as pandas is used to parse the response). @@ -408,17 +414,28 @@ def get(self, table: str, query_filter: str = None, schema: str = None, as_df: b if as_df: query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" response = self.session.get(url=query_url) - self._validate_graphql_response(response=response, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False) + if columns: + try: + response_data = response_data[columns] + except KeyError as e: + if "not in index" in e.args[0]: + raise NoSuchColumnException(f"Columns {e.args[0]}") + else: + raise NoSuchColumnException(f"Columns {e.args[0].split('Index(')[1].split(', dtype')}" + f" not in index.") else: query_url = f"{self.url}/{current_schema}/graphql" - query = self._parse_get_table_query(table_id) + query = self._parse_get_table_query(table_id, columns) response = self.session.post(url=query_url, json={"query": query}) + self._validate_graphql_response(response=response, + fallback_error_message=f"Failed to retrieve data from {current_schema}::" + f"{table!r}.\nStatus code: {response.status_code}.") response_data = response.json().get('data').get(table_id) return response_data @@ -1048,7 +1065,7 @@ 
def _validate_url(self): raise ServerNotFoundError(f"Invalid URL {self.url!r}. " f"Perhaps you meant 'https://{self.url}'?") - def _parse_get_table_query(self, table_id) -> str: + def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: """Gathers a table's metadata and parses it to a GraphQL query for querying the table's contents. """ @@ -1057,6 +1074,8 @@ def _parse_get_table_query(self, table_id) -> str: query = f"{{\n {table_id} {{\n" for col in table_metadata.columns: + if col.id not in columns and col.name not in columns: + continue if col.get('columnType') in [HEADING, LOGO]: continue elif col.get('columnType') in NONREFS: From d49ac6b8469c7264621d89699a887c5c78430806 Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Tue, 7 Jan 2025 09:30:28 +0100 Subject: [PATCH 05/31] * added check for None columns --- tools/pyclient/dev/dev.py | 4 ++-- tools/pyclient/dev/graphql_data.py | 3 ++- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/pyclient/dev/dev.py b/tools/pyclient/dev/dev.py index 6fd99d5519..99114012e1 100644 --- a/tools/pyclient/dev/dev.py +++ b/tools/pyclient/dev/dev.py @@ -40,8 +40,8 @@ async def main(): async with Client('https://emx2.dev.molgenis.org/', schema='catalogue') as client: participant_range = [10_000, 20_000.5] - big_data = client.get(table='Collection subcohorts', - query_filter=f'`numberOfParticipants` between {participant_range}', as_df=True) + big_data = client.get(table='Subpopulations', + query_filter=f'`numberOfParticipants` between {participant_range}', as_df=False) print(big_data.head().to_string()) excluded_countries = ["Denmark", "France"] diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index 9562f574ad..f060dcd6fe 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -13,7 +13,8 @@ def get_data() -> list: """Fetches data.""" with Client(url=URL, 
schema=SCHEMA) as client: - resources = client.get(table='Resources', columns=['name', 'external identifiers'], as_df=True) + resources = client.get(table='Resources', columns=['name', 'description', 'subpopulations'], as_df=False) + return resources diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index d34992acde..8fc5052462 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -1074,7 +1074,7 @@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: query = f"{{\n {table_id} {{\n" for col in table_metadata.columns: - if col.id not in columns and col.name not in columns: + if columns is not None and (col.id not in columns and col.name not in columns): continue if col.get('columnType') in [HEADING, LOGO]: continue From 7ad5a7da1e980de6ce72457db7a7200dfe567af0 Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Tue, 7 Jan 2025 16:36:04 +0100 Subject: [PATCH 06/31] * split filter method results for CSV and GraphQL API --- tools/pyclient/dev/graphql_data.py | 29 +++++++++++++++---- .../src/molgenis_emx2_pyclient/client.py | 26 ++++++++++------- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index f060dcd6fe..6976058971 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -11,13 +11,30 @@ def get_data() -> list: """Fetches data.""" - + participant_range = [10_000, 20_000] with Client(url=URL, schema=SCHEMA) as client: - resources = client.get(table='Resources', columns=['name', 'description', 'subpopulations'], as_df=False) - - - - return resources + subpop = client.get(table='Subpopulations', columns=['name', 'resource', 'numberOfParticipants'], + query_filter=f'`numberOfParticipants` between {participant_range}', + as_df=False) + + pprint(subpop) + + excluded_countries = 
["Denmark", "France"] + resources = client.get(table="Resources", + columns=["name", "id"], + query_filter=f"subpopulations.countries.name != {excluded_countries}", + as_df=False) + pprint(resources) + print(len(resources)) + resources = client.get(table="Resources", + columns=["name", "id"], + query_filter=f"subpopulations.countries.name != {excluded_countries}", + as_df=True) + print(resources.to_string()) + print(len(resources.index)) + + + return subpop diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 8fc5052462..7deb8a1f07 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -376,19 +376,21 @@ def delete_records(self, table: str, schema: str = None, file: str = None, data: def get(self, table: str, - columns: list = None, + columns: list[str] = None, query_filter: str = None, schema: str = None, as_df: bool = False) -> list | pd.DataFrame: """Retrieves data from a schema and returns as a list of dictionaries or as a pandas DataFrame (as pandas is used to parse the response). - :param schema: name of a schema - :type schema: str - :param query_filter: the query to filter the output - :type query_filter: str :param table: the name of the table :type table: str + :param columns: list of column names to filter on + :type columns: list + :param query_filter: the query to filter the output, optional + :type query_filter: str + :param schema: name of a schema, default self.default_schema + :type schema: str :param as_df: if True, the response will be returned as a pandas DataFrame. Otherwise, a recordset will be returned. 
:type as_df: bool @@ -412,6 +414,8 @@ def get(self, filter_part = self._prepare_filter(query_filter, table, schema) if as_df: + if filter_part: + filter_part = "?filter=" + json.dumps(filter_part) query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" response = self.session.get(url=query_url) self._validate_graphql_response(response=response, @@ -432,7 +436,7 @@ def get(self, query_url = f"{self.url}/{current_schema}/graphql" query = self._parse_get_table_query(table_id, columns) response = self.session.post(url=query_url, - json={"query": query}) + json={"query": query, "variables": {"filter": filter_part}}) self._validate_graphql_response(response=response, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") @@ -713,10 +717,10 @@ def get_schema_metadata(self, name: str = None) -> Schema: metadata = Schema(**response_json.get('data').get('_schema')) return metadata - def _prepare_filter(self, expr: str, _table: str, _schema: str) -> str: + def _prepare_filter(self, expr: str, _table: str, _schema: str) -> dict | None: """Prepares a GraphQL filter based on the expression passed into `get`.""" if expr in [None, ""]: - return "" + return None statements = expr.split(' and ') _filter = dict() for stmt in statements: @@ -734,7 +738,7 @@ def _prepare_filter(self, expr: str, _table: str, _schema: str) -> str: raise ValueError(f"Cannot process statement {stmt!r}, " f"ensure specifying one of the operators '==', '>', '<', '!=', 'between' " f"in your statement.") - return "?filter=" + json.dumps(_filter) + return _filter def __prepare_equals_filter(self, stmt: str, _table: str, _schema: str) -> dict: """Prepares the filter part if the statement filters on equality.""" @@ -1072,7 +1076,9 @@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: schema_metadata: Schema = self.get_schema_metadata() table_metadata: Table = schema_metadata.get_table('id', 
table_id) - query = f"{{\n {table_id} {{\n" + query = (f"query {table_id}($filter: {table_id}Filter) {{\n" + f" {table_id}(filter: $filter) {{\n") + # query = f"{{\n {table_id} {{\n" for col in table_metadata.columns: if columns is not None and (col.id not in columns and col.name not in columns): continue From 2f0c01688a704c30fc64262ba4674814943af832 Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Wed, 8 Jan 2025 10:06:43 +0100 Subject: [PATCH 07/31] * fixed logging issues --- tools/pyclient/dev/graphql_data.py | 16 +++++---------- .../src/molgenis_emx2_pyclient/client.py | 20 ++++++++++++------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index 6976058971..cc5ec72cdb 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -9,7 +9,7 @@ SCHEMA = "catalogue" -def get_data() -> list: +def get_data(): """Fetches data.""" participant_range = [10_000, 20_000] with Client(url=URL, schema=SCHEMA) as client: @@ -21,26 +21,20 @@ def get_data() -> list: excluded_countries = ["Denmark", "France"] resources = client.get(table="Resources", - columns=["name", "id"], + columns=["id", "name"], query_filter=f"subpopulations.countries.name != {excluded_countries}", as_df=False) pprint(resources) print(len(resources)) + resources = client.get(table="Resources", - columns=["name", "id"], + columns=["id", "name"], query_filter=f"subpopulations.countries.name != {excluded_countries}", as_df=True) print(resources.to_string()) print(len(resources.index)) - return subpop - - if __name__ == '__main__': - data = get_data() - if isinstance(data, list): - pprint(data) - else: - print(data) + get_data() diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 7deb8a1f07..831f023df5 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py 
@@ -2,14 +2,12 @@ import json import logging import pathlib -import sys import time from functools import cache from io import BytesIO import pandas as pd import requests -from molgenis_emx2_pyclient.exceptions import NoSuchColumnException from requests import Response from . import graphql_queries as queries @@ -18,11 +16,13 @@ from .exceptions import (NoSuchSchemaException, ServiceUnavailableError, SigninError, ServerNotFoundError, PyclientException, NoSuchTableException, NoContextManagerException, GraphQLException, InvalidTokenException, - PermissionDeniedException, TokenSigninException, NonExistentTemplateException) + PermissionDeniedException, TokenSigninException, NonExistentTemplateException, + NoSuchColumnException) from .metadata import Schema, Table from .utils import parse_nested_pkeys -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logging.getLogger("requests").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) log = logging.getLogger("Molgenis EMX2 Pyclient") @@ -432,6 +432,7 @@ def get(self, else: raise NoSuchColumnException(f"Columns {e.args[0].split('Index(')[1].split(', dtype')}" f" not in index.") + response_data = response_data.drop_duplicates(keep='first').reset_index(drop=True) else: query_url = f"{self.url}/{current_schema}/graphql" query = self._parse_get_table_query(table_id, columns) @@ -854,17 +855,22 @@ def __prepare_between_filter(self, stmt: str, _table: str, _schema: str) -> dict return {col.id: {'between': val}} - @staticmethod - def __prepare_nested_filter(columns: str, value: str | int | float | list, comparison: str): + def __prepare_nested_filter(self, columns: str, value: str | int | float | list, comparison: str): _filter = {} current = _filter for (i, segment) in enumerate(columns.split('.')[:-1]): current[segment] = {} current = current[segment] last_segment = columns.split('.')[-1] - current[last_segment] = {comparison: value} + current[last_segment] = {comparison: 
self.__prepare_value(value)} return _filter + @staticmethod + def __prepare_value(value: str): + if value.startswith('[') and value.endswith(']'): + return json.loads(value.replace('\'', '"')) + return value + @staticmethod def _prep_data_or_file(file_path: str = None, data: list | pd.DataFrame = None) -> str | None: """Prepares the data from memory or loaded from disk for addition or deletion action. From 2062b5647b48cbe77512c764761106525f764bcd Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Wed, 8 Jan 2025 16:34:45 +0100 Subject: [PATCH 08/31] * added dtype conversion to DataFrame output --- tools/pyclient/dev/graphql_data.py | 26 +++++++++++++++---- .../src/molgenis_emx2_pyclient/client.py | 12 ++++++--- .../src/molgenis_emx2_pyclient/utils.py | 21 ++++++++++++++- 3 files changed, 50 insertions(+), 9 deletions(-) diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py index cc5ec72cdb..852c16661f 100644 --- a/tools/pyclient/dev/graphql_data.py +++ b/tools/pyclient/dev/graphql_data.py @@ -13,23 +13,39 @@ def get_data(): """Fetches data.""" participant_range = [10_000, 20_000] with Client(url=URL, schema=SCHEMA) as client: - subpop = client.get(table='Subpopulations', columns=['name', 'resource', 'numberOfParticipants'], - query_filter=f'`numberOfParticipants` between {participant_range}', - as_df=False) - pprint(subpop) + organisations = client.get(table="Organisations", + columns=["resource", "id", "is lead organisation"], + query_filter="id != UMCG", + as_df=True) + print(organisations) + print(len(organisations.index)) + excluded_countries = ["Denmark", "France"] resources = client.get(table="Resources", columns=["id", "name"], query_filter=f"subpopulations.countries.name != {excluded_countries}", - as_df=False) + as_df=True) pprint(resources) print(len(resources)) resources = client.get(table="Resources", columns=["id", "name"], query_filter=f"subpopulations.countries.name != {excluded_countries}", + as_df=False) + 
pprint(resources) + print(len(resources)) + + subpop = client.get(table='Subpopulations', columns=['name', 'resource', 'numberOfParticipants'], + query_filter=f'`numberOfParticipants` between {participant_range}', + as_df=False) + + pprint(subpop) + + resources = client.get(table="Resources", + columns=["id", "name", "start year"], + query_filter=f"startYear < 1999", as_df=True) print(resources.to_string()) print(len(resources.index)) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 831f023df5..45c1f33125 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -19,7 +19,7 @@ PermissionDeniedException, TokenSigninException, NonExistentTemplateException, NoSuchColumnException) from .metadata import Schema, Table -from .utils import parse_nested_pkeys +from .utils import parse_nested_pkeys, convert_dtypes logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) @@ -409,7 +409,8 @@ def get(self, raise NoSuchTableException(f"Table {table!r} not found in schema {current_schema!r}.") schema_metadata: Schema = self.get_schema_metadata(current_schema) - table_id = schema_metadata.get_table(by='name', value=table).id + table_meta = schema_metadata.get_table(by='name', value=table) + table_id = table_meta.id filter_part = self._prepare_filter(query_filter, table, schema) @@ -422,7 +423,12 @@ def get(self, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") - response_data = pd.read_csv(BytesIO(response.content), keep_default_na=False) + response_data = pd.read_csv(BytesIO(response.content), keep_default_na=True) + dtypes = {c: t for (c, t) in convert_dtypes(table_meta).items() if c in response_data.columns} + bool_columns = [c for (c, t) in dtypes.items() if t == 'bool'] + response_data[bool_columns] = 
response_data[bool_columns].replace({'true': True, 'false': False}) + response_data = response_data.astype(dtypes) + if columns: try: response_data = response_data[columns] diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 2812f1d3ce..a9055aa766 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -3,6 +3,10 @@ """ import logging +from .constants import INT, FLOAT, DATETIME, BOOL + +from .metadata import Table + def read_file(file_path: str) -> str: """Reads and imports data from a file. @@ -37,4 +41,19 @@ def parse_nested_pkeys(pkeys: list) -> str: else: logging.warning(f"Unexpected data type encountered: {type(pk)!r}.") - return " ".join(converted_pkeys) \ No newline at end of file + return " ".join(converted_pkeys) + +def convert_dtypes(table_meta: Table) -> dict: + """Parses column metadata of a table to a dictionary of column ids to pandas dtypes.""" + + type_map = {INT: 'Int64', + FLOAT: 'Float64', + DATETIME: 'datetime64[ns]', + BOOL: 'bool' + } + + dtypes = {} + for col in table_meta.columns: + dtypes[col.name] = type_map.get(col.get('columnType'), 'object') + + return dtypes From 941ab62b3c31b613cbb0a91310b4d7a9d6367eee Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 9 Jan 2025 10:30:17 +0100 Subject: [PATCH 09/31] * fixed column type float to decimal --- tools/pyclient/src/molgenis_emx2_pyclient/constants.py | 8 ++++---- tools/pyclient/src/molgenis_emx2_pyclient/utils.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py index 5cc380873f..e05c4bad9d 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py @@ -8,7 +8,7 @@ STRING = "STRING" TEXT = "TEXT" INT = "INT" -FLOAT = "FLOAT" +DECIMAL = "DECIMAL" BOOL 
= "BOOL" HYPERLINK = "HYPERLINK" DATE = "DATE" @@ -17,11 +17,11 @@ STRING_ARRAY = STRING + _ARRAY TEXT_ARRAY = TEXT + _ARRAY INT_ARRAY = INT + _ARRAY -FLOAT_ARRAY = FLOAT + _ARRAY +DECIMAL_ARRAY = DECIMAL + _ARRAY BOOL_ARRAY = BOOL + _ARRAY DATE_ARRAY = DATE + _ARRAY DATETIME_ARRAY = DATETIME + _ARRAY HYPERLINK_ARRAY = HYPERLINK + _ARRAY -NONREFS = [STRING, TEXT, INT, FLOAT, BOOL, DATE, DATETIME, HYPERLINK, - STRING_ARRAY, TEXT_ARRAY, INT_ARRAY, FLOAT_ARRAY, BOOL_ARRAY, DATE_ARRAY, DATETIME_ARRAY, HYPERLINK_ARRAY] +NONREFS = [STRING, TEXT, INT, DECIMAL, BOOL, DATE, DATETIME, HYPERLINK, + STRING_ARRAY, TEXT_ARRAY, INT_ARRAY, DECIMAL_ARRAY, BOOL_ARRAY, DATE_ARRAY, DATETIME_ARRAY, HYPERLINK_ARRAY] diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index a9055aa766..65b4095f76 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -3,7 +3,7 @@ """ import logging -from .constants import INT, FLOAT, DATETIME, BOOL +from .constants import INT, DECIMAL, DATETIME, BOOL from .metadata import Table @@ -47,7 +47,7 @@ def convert_dtypes(table_meta: Table) -> dict: """Parses column metadata of a table to a dictionary of column ids to pandas dtypes.""" type_map = {INT: 'Int64', - FLOAT: 'Float64', + DECIMAL: 'Float64', DATETIME: 'datetime64[ns]', BOOL: 'bool' } From bdaa8bca9359e8137ee4f5f73c56fce1da865fac Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 9 Jan 2025 10:31:46 +0100 Subject: [PATCH 10/31] * fixed column type float to decimal * added remaining types --- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 8 +++----- tools/pyclient/src/molgenis_emx2_pyclient/constants.py | 10 ++++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 45c1f33125..264e7818f0 100644 --- 
a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -12,7 +12,7 @@ from . import graphql_queries as queries from . import utils -from .constants import HEADING, LOGO, NONREFS +from .constants import HEADING, LOGO from .exceptions import (NoSuchSchemaException, ServiceUnavailableError, SigninError, ServerNotFoundError, PyclientException, NoSuchTableException, NoContextManagerException, GraphQLException, InvalidTokenException, @@ -1090,14 +1090,12 @@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: query = (f"query {table_id}($filter: {table_id}Filter) {{\n" f" {table_id}(filter: $filter) {{\n") - # query = f"{{\n {table_id} {{\n" + for col in table_metadata.columns: if columns is not None and (col.id not in columns and col.name not in columns): continue if col.get('columnType') in [HEADING, LOGO]: continue - elif col.get('columnType') in NONREFS: - query += f" {col.get('id')}\n" elif col.get('columnType').startswith('ONTOLOGY'): query += f" {col.get('id')} {{name}}\n" elif col.get('columnType').startswith('REF'): @@ -1106,7 +1104,7 @@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: query += parse_nested_pkeys(pkeys) query += "}\n" else: - log.warning(f"Caught column type {col.get('columnType')!r}.") + query += f" {col.get('id')}\n" query += " }\n" query += "}" diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py index e05c4bad9d..05c0e0fc93 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py @@ -13,15 +13,17 @@ HYPERLINK = "HYPERLINK" DATE = "DATE" DATETIME = "DATETIME" +REF = "REF" +REFBACK = "REFBACK" +ONTOLOGY = "ONTOLOGY" STRING_ARRAY = STRING + _ARRAY TEXT_ARRAY = TEXT + _ARRAY INT_ARRAY = INT + _ARRAY DECIMAL_ARRAY = DECIMAL + _ARRAY BOOL_ARRAY = BOOL + _ARRAY +HYPERLINK_ARRAY = HYPERLINK + 
_ARRAY DATE_ARRAY = DATE + _ARRAY DATETIME_ARRAY = DATETIME + _ARRAY -HYPERLINK_ARRAY = HYPERLINK + _ARRAY - -NONREFS = [STRING, TEXT, INT, DECIMAL, BOOL, DATE, DATETIME, HYPERLINK, - STRING_ARRAY, TEXT_ARRAY, INT_ARRAY, DECIMAL_ARRAY, BOOL_ARRAY, DATE_ARRAY, DATETIME_ARRAY, HYPERLINK_ARRAY] +REF_ARRAY = REF + _ARRAY +ONTOLOGY_ARRAY = ONTOLOGY + _ARRAY From bc34491fbd0de13db3f4799cd4d5796a4b3d4605 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 9 Jan 2025 11:48:49 +0100 Subject: [PATCH 11/31] * fixed server URL ending with '/' --- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 264e7818f0..839ff09724 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -40,7 +40,7 @@ def __init__(self, url: str, schema: str = None, token: str = None, job: str = N self._token = token self._job = job - self.url: str = url + self.url: str = url if not url.endswith('/') else url[:-1] self.api_graphql = self.url + "/api/graphql" self.signin_status: str = 'unknown' @@ -417,6 +417,8 @@ def get(self, if as_df: if filter_part: filter_part = "?filter=" + json.dumps(filter_part) + else: + filter_part = "" query_url = f"{self.url}/{current_schema}/api/csv/{table_id}{filter_part}" response = self.session.get(url=query_url) self._validate_graphql_response(response=response, From 6b4579ba7acc9f790250812e78a2ebc066f88894 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 9 Jan 2025 17:19:29 +0100 Subject: [PATCH 12/31] * updated dev script for catalogue model --- tools/pyclient/dev/dev.py | 46 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/tools/pyclient/dev/dev.py b/tools/pyclient/dev/dev.py index 99114012e1..6f17c381cb 100644 --- a/tools/pyclient/dev/dev.py +++ 
b/tools/pyclient/dev/dev.py @@ -39,16 +39,18 @@ async def main(): async with Client('https://emx2.dev.molgenis.org/', schema='catalogue') as client: - participant_range = [10_000, 20_000.5] - big_data = client.get(table='Subpopulations', - query_filter=f'`numberOfParticipants` between {participant_range}', as_df=False) - print(big_data.head().to_string()) + participant_range = [10_000, 20_000] + subpopulations = client.get(table='Subpopulations', + query_filter=f'`numberOfParticipants` between {participant_range}', + columns=['name', 'description', 'numberOfParticipants'], + as_df=False) + print(subpopulations) excluded_countries = ["Denmark", "France"] - collections = client.get(table='Collections', - query_filter=f'subcohorts.countries.name != {excluded_countries}', + resources = client.get(table='Resources', + query_filter=f'subpopulations.countries.name != {excluded_countries}', as_df=True) - print(collections.head().to_string()) + print(resources.head().to_string()) var_values = client.get(table='Variable values', query_filter='label != No and value != 1', as_df=True) @@ -77,7 +79,6 @@ async def main(): # Export the entire 'pet store' schema to memory in Excel format, # print its table names and the contents of the 'Pet' table. 
- # Export the 'Collections' table from schema 'catalogue' to memory and print a sample of its contents pet_store_excel = await client.export(schema='pet store', as_excel=True) pet_store = openpyxl.load_workbook(pet_store_excel, data_only=True) @@ -86,9 +87,10 @@ async def main(): pet_sheet = pd.DataFrame((ps := pd.DataFrame(pet_store['Pet'].values)).values[1:], columns=ps.iloc[0].values) print(pet_sheet.to_string()) - raw_collections = await client.export(schema='catalogue', table='Collections') - collections = pd.read_csv(raw_collections) - print(collections.sample(5).to_string()) + # Export the 'Resources' table from schema 'catalogue' to memory and print a sample of its contents + raw_resources = await client.export(schema='catalogue', table='Resources') + resources = pd.read_csv(raw_resources) + print(resources.sample(5).to_string()) # Connect to server with a default schema specified with Client('https://emx2.dev.molgenis.org/', schema='pet store', token=token) as client: @@ -264,29 +266,29 @@ async def main(): # Use the Schema, Table, and Column classes catalogue_schema = Client('https://emx2.dev.molgenis.org/').get_schema_metadata('catalogue') - # Find the tables inheriting from the 'Collections' table - resource_children = catalogue_schema.get_tables(by='inheritName', value='Collections') + # Find the tables inheriting from the 'Resources' table + resource_children = catalogue_schema.get_tables(by='inheritName', value='Resources') - print("Tables in the schema inheriting from the 'Collections' table.") + print("Tables in the schema inheriting from the 'Resources' table.") for res_chi in resource_children: print(f"{res_chi!s}\n{res_chi!r}") print("\n") # Find the table - collections_meta = catalogue_schema.get_table(by='name', value='Collections') - print(collections_meta) + resources_meta = catalogue_schema.get_table(by='name', value='Resources') + print(resources_meta) - # Find the columns in the Collections table referencing the Organisations table - 
orgs_refs = collections_meta.get_columns(by='refTableName', value='Organisations') + # Find the columns in the Resources table referencing the Organisations table + orgs_refs = resources_meta.get_columns(by='refTableName', value='Organisations') print(orgs_refs) - # Find the columns in the Collections table referencing the Organisations table in a reference array - orgs_array_refs = collections_meta.get_columns(by=['columnType', 'refTableName'], - value=['REF_ARRAY', 'Collection organisations']) + # Find the columns in the Resources table referencing the Organisations table in a reference array + orgs_array_refs = resources_meta.get_columns(by=['columnType', 'refTableName'], + value=['REF_ARRAY', 'Organisations']) print(orgs_array_refs) # Print the __str__ and __repr__ representations of these columns - print("Columns in the Collections table referencing the Collection organisations table in an array.") + print("Columns in the Resources table referencing the Organisations table in an array.") for orgs_ref in orgs_array_refs: print(f"{orgs_ref!s}\n{orgs_ref!r}\n") From 14642604546bccb141a393a550b71070b54ecedb Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 9 Jan 2025 17:38:44 +0100 Subject: [PATCH 13/31] * implemented truncate --- .../src/molgenis_emx2_pyclient/client.py | 36 +++++++++++++++++-- .../molgenis_emx2_pyclient/graphql_queries.py | 4 +++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 839ff09724..8bf195247b 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -303,6 +303,38 @@ async def upload_file(self, file_path: str | pathlib.Path, schema: str = None): # Report on task progress await self._report_task_progress(process_id) + def truncate(self, table: str, schema: str): + """Truncates the table. 
+ + :param table: the name of the table + :type table: str + :param schema: name of a schema + :type schema: str + """ + current_schema = schema + if current_schema is None: + current_schema = self.default_schema + + if current_schema not in self.schema_names: + raise NoSuchSchemaException(f"Schema {current_schema!r} not available.") + + if not self._table_in_schema(table, current_schema): + raise NoSuchTableException(f"Table {table!r} not found in schema {current_schema!r}.") + + table_id = self.get_schema_metadata(current_schema).get_table(by='name', value=table).id + query = queries.truncate() + + response = self.session.post( + url=self.api_graphql, + json={"query": query, "variables": {"table": table_id}} + ) + + self._validate_graphql_response(response, mutation='truncate', + fallback_error_message=f"Failed to truncate table {current_schema}::{table}.") + log.info(f"Truncated table {table!r}.") + + + def _upload_csv(self, file_path: pathlib.Path, schema: str) -> str: """Uploads the CSV file from the filename to the schema. Returns the success or error message.""" file_name = file_path.name @@ -332,10 +364,10 @@ def _upload_csv(self, file_path: pathlib.Path, schema: str) -> str: def delete_records(self, table: str, schema: str = None, file: str = None, data: list | pd.DataFrame = None): """Deletes records from a table. 
- :param schema: name of a schema - :type schema: str :param table: the name of the table :type table: str + :param schema: name of a schema + :type schema: str :param file: location of the file containing records to import or update :type file: str :param data: a dataset containing records to delete (list of dictionaries) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/graphql_queries.py b/tools/pyclient/src/molgenis_emx2_pyclient/graphql_queries.py index 53b4caa7e4..60faa116f5 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/graphql_queries.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/graphql_queries.py @@ -133,6 +133,10 @@ def update_schema(): } """ +def truncate(): + """GraphQL query to truncate a table.""" + return """mutation($table: String) {truncate(tables: [$table]) {message}}""" + def list_schemas(): """GraphQL query to view all available schemas.""" From dd3488061d2238a3ce44a284d687b6acf36b4e44 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 10:38:59 +0100 Subject: [PATCH 14/31] * fixed imports --- tools/pyclient/src/molgenis_emx2_pyclient/metadata.py | 1 - tools/pyclient/src/molgenis_emx2_pyclient/utils.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py b/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py index c731f86505..9e499979f6 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/metadata.py @@ -4,7 +4,6 @@ from itertools import starmap from typing import Literal -from .constants import STRING from .exceptions import NoSuchColumnException, NoSuchTableException diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 65b4095f76..752a30966b 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -4,7 +4,6 @@ import logging from .constants import INT, DECIMAL, 
DATETIME, BOOL - from .metadata import Table From d4cda907c530bfb52c9eff1f2203105006301f71 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 11:02:37 +0100 Subject: [PATCH 15/31] * fixed truncate GraphQL url --- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 8bf195247b..095ad831e7 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -321,11 +321,12 @@ def truncate(self, table: str, schema: str): if not self._table_in_schema(table, current_schema): raise NoSuchTableException(f"Table {table!r} not found in schema {current_schema!r}.") + query_url = f"{self.url}/{current_schema}/graphql" table_id = self.get_schema_metadata(current_schema).get_table(by='name', value=table).id query = queries.truncate() response = self.session.post( - url=self.api_graphql, + url=query_url, json={"query": query, "variables": {"table": table_id}} ) From 1e5236e0bd07cbbac96885e9100b5c92a77195c5 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 11:19:19 +0100 Subject: [PATCH 16/31] * created `ReferenceException` --- docs/molgenis/use_usingpyclient.md | 15 +++++++++++++-- .../pyclient/src/molgenis_emx2_pyclient/client.py | 7 ++++++- .../src/molgenis_emx2_pyclient/exceptions.py | 3 +++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/docs/molgenis/use_usingpyclient.md b/docs/molgenis/use_usingpyclient.md index 85bd9706d7..978b32d8cd 100644 --- a/docs/molgenis/use_usingpyclient.md +++ b/docs/molgenis/use_usingpyclient.md @@ -124,22 +124,24 @@ Raises the `TokenSigninException` when the client is already signed in with a us ```python def get(self, table: str, + columns: list[str] = None, query_filter: str = None, schema: str = None, as_df: bool = False) -> list | pandas.DataFrame: ... 
``` Retrieves data from a table on a schema and returns the result either as a list of dictionaries or as a pandas DataFrame. +Use the `columns` parameter to specify which columns to retrieve. Note that in case `as_df=True` the column _names_ should be supplied, otherwise the column _ids_. Use the `query_filter` parameter to filter the results based on filters applied to the columns. This query requires a special syntax. -Columns can be filtered on equality `==`, inequality `!=`, greater `>` and smaller `<` than. +Values in columns can be filtered on equality `==`, inequality `!=`, greater `>` and smaller `<` than. Values within an interval can also be filtered by using the operand `between`, followed by list of the upper bound and lower bound. The values of reference and ontology columns can also be filtered by joining the column id of the table with the column id of the reference/ontology table by a dot, as in the example `countries.name`, where `countries` is a column in the table `My table` and `name` is the column id of the referenced table specifying the names of countries. It is possible to add filters on multiple columns by separating the filter statements with _' and '_. It is recommended to supply the filters that are compared as variables passed in an f-string. Throws the `NoSuchSchemaException` if the user does not have at least _viewer_ permissions or if the schema does not exist. -Throws the `NoSuchColumnException` if the query filter contains a column id that is not present in the table. +Throws the `NoSuchColumnException` if the `columns` argument or query filter contains a column that is not present in the table. 
| parameter | type | description | required | default | @@ -318,6 +320,15 @@ client.delete_records(schema='MySchema', table='Cohorts', data=cohorts_df) client.delete_records(schema='MySchema', table='Cohorts', file='Cohorts-to-delete.csv') ``` +### truncate +```python +client.truncate(table='My table', schema='My Schema') +``` +Truncates the table and removes all its contents. +This will fail if entries in the table are referenced from other tables. + +Throws the `ReferenceException` if entries in the table are referenced in other tables. + ### create_schema ```python async def create_schema(self, diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 095ad831e7..97a2947670 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -17,7 +17,7 @@ ServerNotFoundError, PyclientException, NoSuchTableException, NoContextManagerException, GraphQLException, InvalidTokenException, PermissionDeniedException, TokenSigninException, NonExistentTemplateException, - NoSuchColumnException) + NoSuchColumnException, ReferenceException) from .metadata import Schema, Table from .utils import parse_nested_pkeys, convert_dtypes @@ -1033,6 +1033,11 @@ def _validate_graphql_response(self, response: Response, mutation: str = None, f msg = response.json().get("errors", [])[0].get('message') log.error(msg) raise GraphQLException(msg) + if "violates foreign key constraint" in response.text: + msg = response.json().get("errors", [])[0].get('message', '') + log.error(msg) + raise ReferenceException(msg) + msg = response.json().get("errors", [])[0].get('message', '') log.error(msg) raise PyclientException("An unknown error occurred when trying to reach this server.") diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/exceptions.py b/tools/pyclient/src/molgenis_emx2_pyclient/exceptions.py index 2af6d955af..536376f3aa 100644 --- 
a/tools/pyclient/src/molgenis_emx2_pyclient/exceptions.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/exceptions.py @@ -59,3 +59,6 @@ class TokenSigninException(PyclientException): class NonExistentTemplateException(PyclientException): """Thrown when creation of schema with non-existent template is attempted.""" + +class ReferenceException(PyclientException): + """Thrown when update or deletion of an entry that is referenced in another table is attempted.""" From e40cba4cb87ffdb1608fa7ec00228347b96dbcc5 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 11:34:41 +0100 Subject: [PATCH 17/31] * refactored table names 'Collections', 'Cohorts' to 'Resources' --- docs/molgenis/use_usingpyclient.md | 55 +++++++++++++++--------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/docs/molgenis/use_usingpyclient.md b/docs/molgenis/use_usingpyclient.md index 978b32d8cd..087808f15e 100644 --- a/docs/molgenis/use_usingpyclient.md +++ b/docs/molgenis/use_usingpyclient.md @@ -152,24 +152,25 @@ Throws the `NoSuchColumnException` if the `columns` argument or query filter con | `as_df` | bool | if true: returns data as pandas DataFrame
else as a list of dictionaries | False | False | ##### examples + ```python -# Get all entries for the table 'Collections' on the schema 'MySchema' -table_data = client.get(table='Collections', schema='MySchema') +# Get all entries for the table 'Resources' on the schema 'MySchema' +table_data = client.get(table='Resources', schema='MySchema') # Set the default schema to 'MySchema' client.set_schema('MySchema') # Get the same entries and return them as pandas DataFrame -table_data = client.get(table='Collections', as_df=True) +table_data = client.get(table='Resources', as_df=True) # Get the entries where the value of a particular column 'number of participants' is greater than 10000 -table_data = client.get(table='Collections', query_filter='numberOfParticipants > 10000') +table_data = client.get(table='Resources', query_filter='numberOfParticipants > 10000') -# Get the entries where 'number of participants' is greater than 10000 and the cohort type is a 'Population cohort' +# Get the entries where 'number of participants' is greater than 10000 and the resource type is a 'Population cohort' # Store the information in variables, first -min_subcohorts = 10000 +min_subpop = 10000 cohort_type = 'Population cohort' -table_data = client.get(table='Collections', query_filter=f'numberOfParticipants > {min_subcohorts}' - f'and cohortType == {cohort_type}') +table_data = client.get(table='Resources', query_filter=f'numberOfParticipants > {min_subpop}' + f'and cohortType == {cohort_type}') ``` ### get_schema_metadata @@ -214,11 +215,11 @@ Throws the `NoSuchSchemaException` if the user does not have at least _viewer_ p ##### examples ```python -# Export the table 'Collections' on the schema 'MySchema' from the CSV API to a BytesIO object -collections_raw: BytesIO = await client.export(schema='MySchema', table='Collections') +# Export the table 'Resources' on the schema 'MySchema' from the CSV API to a BytesIO object +resources_raw: BytesIO = await 
client.export(schema='MySchema', table='Resources') -# Export 'Collections' from the Excel API to the file 'Collections-export.xlsx' -await client.export(schema='MySchema', table='Collections', filename='Collections-export.xlsx') +# Export 'Resources' from the Excel API to the file 'Resources-export.xlsx' +await client.export(schema='MySchema', table='Resources', filename='Resources-export.xlsx') ``` @@ -245,12 +246,12 @@ Throws the `NoSuchSchemaException` if the schema is not found on the server. ##### examples ```python -# Save an edited table with Collections data from a CSV file to the Collections table -client.save_schema(table='Collections', file='Collections-edited.csv') +# Save an edited table with Resources data from a CSV file to the Resources table +client.save_schema(table='Resources', file='Resources-edited.csv') -# Save an edited table with Collections data from memory to the Collections table -collections: pandas.DataFrame = ... -client.save_schema(table='Collections', data=collections) +# Save an edited table with Resources data from memory to the Resources table +resources: pandas.DataFrame = ... +client.save_schema(table='Resources', data=resources) ``` ### upload_file @@ -271,8 +272,8 @@ Throws the `NoSuchSchemaException` if the schema is not found on the server. ##### examples ```python -# Upload a file containing Collections data to a schema -await client.upload_file(file_path='data/Collections.csv') +# Upload a file containing Resources data to a schema +await client.upload_file(file_path='data/Resources.csv') # Upload a file containing members information to a schema await client.upload_file(file_path='molgenis_members.csv', schema='MySchema') @@ -308,16 +309,16 @@ Throws the `NoSuchSchemaException` if the schema is not found on the server. 
##### examples ```python -# Delete cohorts from a list of ids -cohorts = [{'name': 'Cohort 1', 'name': 'Cohort 2'}] -client.delete_records(schema='MySchema', table='Cohorts', data=cohorts) +# Delete resources from a list of ids +resources = [{'name': 'Resource 1'}, {'name': 'Resource 2'}] +client.delete_records(schema='MySchema', table='Resources', data=resources) -# Delete cohorts from pandas DataFrame -cohorts_df = pandas.DataFrame(data=cohorts) -client.delete_records(schema='MySchema', table='Cohorts', data=cohorts_df) +# Delete resources from pandas DataFrame +resources_df = pandas.DataFrame(data=resources) +client.delete_records(schema='MySchema', table='Resources', data=resources_df) -# Delete cohorts from entries in a CSV file -client.delete_records(schema='MySchema', table='Cohorts', file='Cohorts-to-delete.csv') +# Delete resources from entries in a CSV file +client.delete_records(schema='MySchema', table='Resources', file='Resources-to-delete.csv') ``` ### truncate From ed5387eb599b18413d7fe886e6c831e91a76312c Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 11:39:12 +0100 Subject: [PATCH 18/31] * small fixes --- docs/molgenis/use_usingpyclient.md | 5 +++-- tools/pyclient/src/molgenis_emx2_pyclient/utils.py | 3 +-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/molgenis/use_usingpyclient.md b/docs/molgenis/use_usingpyclient.md index 087808f15e..9266a1d6ee 100644 --- a/docs/molgenis/use_usingpyclient.md +++ b/docs/molgenis/use_usingpyclient.md @@ -147,6 +147,7 @@ Throws the `NoSuchColumnException` if the `columns` argument or query filter con | parameter | type | description | required | default | |----------------|------|--------------------------------------------------------------------------------|----------|---------| | `table` | str | the name of a table | True | None | +| `columns` | list | a list of column names or ids to filter on | False | None | | `schema` | str | the name of a schema | False | None | |
`query_filter` | str | a string to filter the results on | False | None | | `as_df` | bool | if true: returns data as pandas DataFrame
else as a list of dictionaries | False | False | @@ -155,12 +156,12 @@ Throws the `NoSuchColumnException` if the `columns` argument or query filter con ```python # Get all entries for the table 'Resources' on the schema 'MySchema' -table_data = client.get(table='Resources', schema='MySchema') +table_data = client.get(table='Resources', schema='MySchema', columns=['name', 'collectionEvents']) # Set the default schema to 'MySchema' client.set_schema('MySchema') # Get the same entries and return them as pandas DataFrame -table_data = client.get(table='Resources', as_df=True) +table_data = client.get(table='Resources', columns=['name', 'collection events'], as_df=True) # Get the entries where the value of a particular column 'number of participants' is greater than 10000 table_data = client.get(table='Resources', query_filter='numberOfParticipants > 10000') diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 752a30966b..813fd49ff1 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -48,8 +48,7 @@ def convert_dtypes(table_meta: Table) -> dict: type_map = {INT: 'Int64', DECIMAL: 'Float64', DATETIME: 'datetime64[ns]', - BOOL: 'bool' - } + BOOL: 'bool'} dtypes = {} for col in table_meta.columns: From 06977bae2356fa5ca3a29ee38ae8aeb1bb85f6fa Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 14:54:33 +0100 Subject: [PATCH 19/31] * fixed examples in dev.py --- tools/pyclient/dev/dev.py | 61 ++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/tools/pyclient/dev/dev.py b/tools/pyclient/dev/dev.py index 6f17c381cb..33fc02c805 100644 --- a/tools/pyclient/dev/dev.py +++ b/tools/pyclient/dev/dev.py @@ -2,7 +2,7 @@ # FILE: dev.py # AUTHOR: David Ruvolo, Ype Zijlstra # CREATED: 2023-05-22 -# MODIFIED: 2024-09-11 +# MODIFIED: 2025-01-14 # PURPOSE: development script for initial 
testing of the py-client # STATUS: ongoing # PACKAGES: pandas, python-dotenv @@ -24,7 +24,8 @@ from tools.pyclient.src.molgenis_emx2_pyclient import Client from tools.pyclient.src.molgenis_emx2_pyclient.exceptions import (NoSuchSchemaException, NoSuchTableException, - GraphQLException, PermissionDeniedException) + GraphQLException, PermissionDeniedException, + ReferenceException) async def main(): @@ -234,6 +235,23 @@ async def main(): try: schema_create = asyncio.create_task(client.create_schema(name='myNewSchema')) print(client.schema_names) + + # Import the pet store data, downloaded earlier + await schema_create + upload_task = asyncio.create_task(client.upload_file(schema='myNewSchema', file_path='pet store.zip')) + + # Truncate the 'Pet' table + await upload_task + try: + client.truncate(table='Pet', schema='myNewSchema') + except ReferenceException: + print("Could not truncate table 'Pet', as it is referenced to in another table.") + + try: + client.truncate(table='User', schema='myNewSchema') + except ReferenceException: + print("This cannot happen, as table 'User' is not referenced to by other tables.") + except (GraphQLException, PermissionDeniedException) as e: print(e) @@ -241,15 +259,12 @@ async def main(): try: await schema_create client.update_schema(name='myNewSchema', description='I forgot the description') - print(client.schema_names) - print(client.schemas) except (GraphQLException, NoSuchSchemaException) as e: print(e) # Recreate the schema: delete and create try: await client.recreate_schema(name='myNewSchema') - print(client.schema_names) except (GraphQLException, NoSuchSchemaException) as e: print(e) @@ -257,40 +272,32 @@ async def main(): try: await schema_create await asyncio.create_task(client.delete_schema(name='myNewSchema')) - print(client.schema_names) except (GraphQLException, NoSuchSchemaException) as e: print(e) - print("\n\n") - # Use the Schema, Table, and Column classes + # 
////////////////////////////////////////////////////////////////////////////////////////// + # Examples for using the Schema, Table, and Column classes + # Get the metadata for the 'catalogue' schema catalogue_schema = Client('https://emx2.dev.molgenis.org/').get_schema_metadata('catalogue') - # Find the tables inheriting from the 'Resources' table - resource_children = catalogue_schema.get_tables(by='inheritName', value='Resources') - - print("Tables in the schema inheriting from the 'Resources' table.") - for res_chi in resource_children: - print(f"{res_chi!s}\n{res_chi!r}") - print("\n") - - # Find the table + # Get the metadata for the Resources table resources_meta = catalogue_schema.get_table(by='name', value='Resources') print(resources_meta) - # Find the columns in the Resources table referencing the Organisations table - orgs_refs = resources_meta.get_columns(by='refTableName', value='Organisations') - print(orgs_refs) + # Find the columns in the Resources table referencing entries in the Resources table + resources_refs = resources_meta.get_columns(by='refTableName', value='Resources') + print(resources_refs) - # Find the columns in the Resources table referencing the Organisations table in a reference array - orgs_array_refs = resources_meta.get_columns(by=['columnType', 'refTableName'], - value=['REF_ARRAY', 'Organisations']) - print(orgs_array_refs) + # Find the columns in the Resources table referencing the Resources table in a reference array + res_arrays_refs = resources_meta.get_columns(by=['columnType', 'refTableName'], + value=['REF_ARRAY', 'Resources']) + print(res_arrays_refs) # Print the __str__ and __repr__ representations of these columns - print("Columns in the Resources table referencing the Organisations table in an array.") - for orgs_ref in orgs_array_refs: - print(f"{orgs_ref!s}\n{orgs_ref!r}\n") + print("Columns in the Resources table referencing the Resources table in an array.") + for res_ref in res_arrays_refs: + 
print(f"{res_ref!s}\n{res_ref!r}\n") if __name__ == '__main__': asyncio.run(main()) From 59070bff66bc40df0ae235103c6ed74aad767006 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 16:10:08 +0100 Subject: [PATCH 20/31] * updated README.md --- tools/pyclient/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/pyclient/README.md b/tools/pyclient/README.md index 38784fc75d..3f2370b56c 100644 --- a/tools/pyclient/README.md +++ b/tools/pyclient/README.md @@ -13,6 +13,12 @@ pip install molgenis-emx2-pyclient Releases of the Molgenis EMX2 Pyclient follow the release number of the accompanying release of the Molgenis EMX2 software. Therefore, releases of the Pyclient are less frequent than those of EMX2 and the latest version of the Pyclient may differ from the latest version of Molgenis EMX2. +#### 11.42.3 +- Added: feature 'truncate' to remove all entries from a table +- Added: option to filter results of `get` method by columns +- Improved: results returned from `get` with `as_df=False` by implementing the GraphQL API +- Fixed: log level was set to `DEBUG` without possibility to change this. The user can now set the log level again at their preferred level + #### 11.23.0 Added: an optional `job` argument to the `Client` initialization, allowing the Pyclient to run asynchronous methods within a job in EMX2." 
From 7416741e9dcf58fbef6f66895d4d4fa0a344f56f Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 16:13:54 +0100 Subject: [PATCH 21/31] * updated README.md --- tools/pyclient/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/pyclient/README.md b/tools/pyclient/README.md index 3f2370b56c..428f051527 100644 --- a/tools/pyclient/README.md +++ b/tools/pyclient/README.md @@ -17,6 +17,7 @@ Therefore, releases of the Pyclient are less frequent than those of EMX2 and the - Added: feature 'truncate' to remove all entries from a table - Added: option to filter results of `get` method by columns - Improved: results returned from `get` with `as_df=False` by implementing the GraphQL API +- Improved: added additional parsing for data returned from the CSV API to pandas DataFrame in `get` method - Fixed: log level was set to `DEBUG` without possibility to change this. The user can now set the log level again at their preferred level #### 11.23.0 From 35ba62d26008d538b0765a10d39890757174a276 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 14 Jan 2025 16:20:19 +0100 Subject: [PATCH 22/31] * removed redundant script --- tools/pyclient/dev/graphql_data.py | 56 ------------------------------ 1 file changed, 56 deletions(-) delete mode 100644 tools/pyclient/dev/graphql_data.py diff --git a/tools/pyclient/dev/graphql_data.py b/tools/pyclient/dev/graphql_data.py deleted file mode 100644 index 852c16661f..0000000000 --- a/tools/pyclient/dev/graphql_data.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -Development script for fetching data from a schema using the GraphQL API. 
-""" -from pprint import pprint - -from tools.pyclient.src.molgenis_emx2_pyclient import Client - -URL = "https://emx2.dev.molgenis.org" -SCHEMA = "catalogue" - - -def get_data(): - """Fetches data.""" - participant_range = [10_000, 20_000] - with Client(url=URL, schema=SCHEMA) as client: - - organisations = client.get(table="Organisations", - columns=["resource", "id", "is lead organisation"], - query_filter="id != UMCG", - as_df=True) - print(organisations) - print(len(organisations.index)) - - - excluded_countries = ["Denmark", "France"] - resources = client.get(table="Resources", - columns=["id", "name"], - query_filter=f"subpopulations.countries.name != {excluded_countries}", - as_df=True) - pprint(resources) - print(len(resources)) - - resources = client.get(table="Resources", - columns=["id", "name"], - query_filter=f"subpopulations.countries.name != {excluded_countries}", - as_df=False) - pprint(resources) - print(len(resources)) - - subpop = client.get(table='Subpopulations', columns=['name', 'resource', 'numberOfParticipants'], - query_filter=f'`numberOfParticipants` between {participant_range}', - as_df=False) - - pprint(subpop) - - resources = client.get(table="Resources", - columns=["id", "name", "start year"], - query_filter=f"startYear < 1999", - as_df=True) - print(resources.to_string()) - print(len(resources.index)) - - - -if __name__ == '__main__': - get_data() From 873870374dea03c1e077e9bb80c6d2329e2c1e6b Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 23 Jan 2025 11:38:58 +0100 Subject: [PATCH 23/31] * fixed GraphQL query for column type FILE --- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 97a2947670..2888830b74 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -1134,7 +1134,7 
@@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: for col in table_metadata.columns: if columns is not None and (col.id not in columns and col.name not in columns): continue - if col.get('columnType') in [HEADING, LOGO]: + if col.get('columnType') in [HEADING]: continue elif col.get('columnType').startswith('ONTOLOGY'): query += f" {col.get('id')} {{name}}\n" @@ -1143,6 +1143,8 @@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: pkeys = schema_metadata.get_pkeys(col.get('refTableId')) query += parse_nested_pkeys(pkeys) query += "}\n" + elif col.get('columnType').startswith('FILE'): + query += f" {col.get('id')} {{id}}\n" else: query += f" {col.get('id')}\n" query += " }\n" From 33572b45381739fd8a33af5e446ad06301b0f8ac Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 23 Jan 2025 14:43:58 +0100 Subject: [PATCH 24/31] * added parser for ontology columns, top-level only --- .../src/molgenis_emx2_pyclient/client.py | 4 +++- .../src/molgenis_emx2_pyclient/utils.py | 21 +++++++++++++++++-- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 2888830b74..09c36fed8b 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -19,7 +19,7 @@ PermissionDeniedException, TokenSigninException, NonExistentTemplateException, NoSuchColumnException, ReferenceException) from .metadata import Schema, Table -from .utils import parse_nested_pkeys, convert_dtypes +from .utils import parse_nested_pkeys, convert_dtypes, parse_ontology logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) @@ -483,6 +483,8 @@ def get(self, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") response_data = 
response.json().get('data').get(table_id) + response_data = parse_ontology(response_data, table_id, schema_metadata) + return response_data diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 813fd49ff1..14a202d73f 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -3,8 +3,8 @@ """ import logging -from .constants import INT, DECIMAL, DATETIME, BOOL -from .metadata import Table +from .constants import INT, DECIMAL, DATETIME, BOOL, ONTOLOGY +from .metadata import Table, Schema def read_file(file_path: str) -> str: @@ -55,3 +55,20 @@ def convert_dtypes(table_meta: Table) -> dict: dtypes[col.name] = type_map.get(col.get('columnType'), 'object') return dtypes + +def parse_ontology(data: list, table_id: str, schema: Schema) -> list: + """Parses the ontology columns from a GraphQL response.""" + table_meta = schema.get_table('id', table_id) + parsed_data = [] + for row in data: + parsed_row = {} + for (col, value) in row.items(): + match table_meta.get_column('id', col).get('columnType'): + case "ONTOLOGY": + parsed_row[col] = value['name'] + case "ONTOLOGY_ARRAY": + parsed_row[col] = [val['name'] for val in value] + case _: + parsed_row[col] = value + parsed_data.append(parsed_row) + return parsed_data From b65b78e4f4bf3a868efe6301e8072c316bd0a1c5 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 23 Jan 2025 15:10:11 +0100 Subject: [PATCH 25/31] * implemented parser for ontology columns nested in ref/ref_array/refback columns --- tools/pyclient/src/molgenis_emx2_pyclient/utils.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 14a202d73f..05447eed19 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -63,11 +63,18 @@ def 
parse_ontology(data: list, table_id: str, schema: Schema) -> list: for row in data: parsed_row = {} for (col, value) in row.items(): - match table_meta.get_column('id', col).get('columnType'): + column_meta = table_meta.get_column('id', col) + match column_meta.get('columnType'): case "ONTOLOGY": parsed_row[col] = value['name'] case "ONTOLOGY_ARRAY": parsed_row[col] = [val['name'] for val in value] + case "REF": + parsed_row[col] = parse_ontology([value], column_meta.get('refTableId'), schema)[0] + case "REF_ARRAY": + parsed_row[col] = parse_ontology(value, column_meta.get('refTableId'), schema) + case "REFBACK": + parsed_row[col] = parse_ontology(value, column_meta.get('refTableId'), schema) case _: parsed_row[col] = value parsed_data.append(parsed_row) From c20b715e10886014092095bdfea3d9ebcac5408b Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Thu, 23 Jan 2025 15:13:39 +0100 Subject: [PATCH 26/31] * fixed as_df=False return empty list instead of None in case of empty table --- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 09c36fed8b..aec8e7080b 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -482,7 +482,7 @@ def get(self, self._validate_graphql_response(response=response, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") - response_data = response.json().get('data').get(table_id) + response_data = response.json().get('data').get(table_id, []) response_data = parse_ontology(response_data, table_id, schema_metadata) From 49888b6a6d7f7e8fbdb3b086bb2b56b36f15ce99 Mon Sep 17 00:00:00 2001 From: Ype Zijlstra Date: Tue, 28 Jan 2025 16:27:06 +0100 Subject: [PATCH 27/31] * improved parsing for cases where ref columns reference ontology 
tables --- tools/pyclient/dev/directory-ref.py | 37 +++++++++++++++++++ .../src/molgenis_emx2_pyclient/client.py | 12 ++++-- 2 files changed, 45 insertions(+), 4 deletions(-) create mode 100644 tools/pyclient/dev/directory-ref.py diff --git a/tools/pyclient/dev/directory-ref.py b/tools/pyclient/dev/directory-ref.py new file mode 100644 index 0000000000..964693936e --- /dev/null +++ b/tools/pyclient/dev/directory-ref.py @@ -0,0 +1,37 @@ +""" +Script to test the 'get' functionality on a Directory schema with reference to ontology table. +TODO delete when done +""" +import asyncio +import logging +import os +from pprint import pprint + +import numpy +import openpyxl +import pandas as pd +from dotenv import load_dotenv + +from tools.pyclient.src.molgenis_emx2_pyclient import Client +from tools.pyclient.src.molgenis_emx2_pyclient.exceptions import (NoSuchSchemaException, NoSuchTableException, + GraphQLException, PermissionDeniedException) + + +async def main(): + # Set up the logger + logging.basicConfig(level='INFO') + logging.getLogger("requests").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) + + # Load the login details into the environment + load_dotenv() + token = os.environ.get('MG_TOKEN') + + with Client(url="https://emx2.dev.molgenis.org/", schema="directory-demo") as client: + collections = client.get(table="Collections", as_df=False) + + pprint(collections) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index aec8e7080b..28ad1dd945 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -476,7 +476,7 @@ def get(self, response_data = response_data.drop_duplicates(keep='first').reset_index(drop=True) else: query_url = f"{self.url}/{current_schema}/graphql" - query = self._parse_get_table_query(table_id, columns) + query = 
self._parse_get_table_query(table_id, current_schema, columns) response = self.session.post(url=query_url, json={"query": query, "variables": {"filter": filter_part}}) self._validate_graphql_response(response=response, @@ -1123,11 +1123,11 @@ def _validate_url(self): raise ServerNotFoundError(f"Invalid URL {self.url!r}. " f"Perhaps you meant 'https://{self.url}'?") - def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: + def _parse_get_table_query(self, table_id: str, schema: str, columns: list = None) -> str: """Gathers a table's metadata and parses it to a GraphQL query for querying the table's contents. """ - schema_metadata: Schema = self.get_schema_metadata() + schema_metadata: Schema = self.get_schema_metadata(schema) table_metadata: Table = schema_metadata.get_table('id', table_id) query = (f"query {table_id}($filter: {table_id}Filter) {{\n" @@ -1141,8 +1141,12 @@ def _parse_get_table_query(self, table_id: str, columns: list = None) -> str: elif col.get('columnType').startswith('ONTOLOGY'): query += f" {col.get('id')} {{name}}\n" elif col.get('columnType').startswith('REF'): + if (ref_schema := col.get('refSchemaName', schema)) == schema: + pkeys = schema_metadata.get_pkeys(col.get('refTableId')) + else: + ref_schema_meta = self.get_schema_metadata(ref_schema) + pkeys = ref_schema_meta.get_pkeys(col.get('refTableId')) query += f" {col.get('id')} {{" - pkeys = schema_metadata.get_pkeys(col.get('refTableId')) query += parse_nested_pkeys(pkeys) query += "}\n" elif col.get('columnType').startswith('FILE'): From 7a54886e4f64bdb3f760e670fceb743d474420b6 Mon Sep 17 00:00:00 2001 From: YpeZ Date: Wed, 29 Jan 2025 16:24:37 +0100 Subject: [PATCH 28/31] * fixed referencing tables in other schemas --- tools/pyclient/dev/directory-ref.py | 2 +- .../src/molgenis_emx2_pyclient/client.py | 32 +++++++++++++++++-- .../src/molgenis_emx2_pyclient/utils.py | 24 -------------- 3 files changed, 31 insertions(+), 27 deletions(-) diff --git 
a/tools/pyclient/dev/directory-ref.py b/tools/pyclient/dev/directory-ref.py index 964693936e..b7eaf94155 100644 --- a/tools/pyclient/dev/directory-ref.py +++ b/tools/pyclient/dev/directory-ref.py @@ -27,7 +27,7 @@ async def main(): load_dotenv() token = os.environ.get('MG_TOKEN') - with Client(url="https://emx2.dev.molgenis.org/", schema="directory-demo") as client: + with Client(url="http://localhost:8080", schema="directory-demo") as client: collections = client.get(table="Collections", as_df=False) pprint(collections) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 28ad1dd945..01a33a0edb 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -19,7 +19,7 @@ PermissionDeniedException, TokenSigninException, NonExistentTemplateException, NoSuchColumnException, ReferenceException) from .metadata import Schema, Table -from .utils import parse_nested_pkeys, convert_dtypes, parse_ontology +from .utils import parse_nested_pkeys, convert_dtypes logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) @@ -483,7 +483,7 @@ def get(self, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") response_data = response.json().get('data').get(table_id, []) - response_data = parse_ontology(response_data, table_id, schema_metadata) + response_data = self._parse_ontology(response_data, table_id, schema) return response_data @@ -1158,3 +1158,31 @@ def _parse_get_table_query(self, table_id: str, schema: str, columns: list = Non return query + def _parse_ontology(self, data: list, table_id: str, schema: str) -> list: + """Parses the ontology columns from a GraphQL response.""" + schema_meta = self.get_schema_metadata(schema) + table_meta = schema_meta.get_table('id', table_id) + parsed_data = [] + for row in data: + 
parsed_row = {} + for (col, value) in row.items(): + column_meta = table_meta.get_column('id', col) + match column_meta.get('columnType'): + case "ONTOLOGY": + parsed_row[col] = value['name'] + case "ONTOLOGY_ARRAY": + parsed_row[col] = [val['name'] for val in value] + case "REF": + _schema = column_meta.get('refSchemaName', schema) + parsed_row[col] = self._parse_ontology([value], column_meta.get('refTableId'), _schema)[0] + case "REF_ARRAY": + _schema = column_meta.get('refSchemaName', schema) + parsed_row[col] = self._parse_ontology(value, column_meta.get('refTableId'), _schema) + case "REFBACK": + _schema = column_meta.get('refSchemaName', schema) + parsed_row[col] = self._parse_ontology(value, column_meta.get('refTableId'), _schema) + case _: + parsed_row[col] = value + parsed_data.append(parsed_row) + return parsed_data + diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 05447eed19..90c10ae239 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -55,27 +55,3 @@ def convert_dtypes(table_meta: Table) -> dict: dtypes[col.name] = type_map.get(col.get('columnType'), 'object') return dtypes - -def parse_ontology(data: list, table_id: str, schema: Schema) -> list: - """Parses the ontology columns from a GraphQL response.""" - table_meta = schema.get_table('id', table_id) - parsed_data = [] - for row in data: - parsed_row = {} - for (col, value) in row.items(): - column_meta = table_meta.get_column('id', col) - match column_meta.get('columnType'): - case "ONTOLOGY": - parsed_row[col] = value['name'] - case "ONTOLOGY_ARRAY": - parsed_row[col] = [val['name'] for val in value] - case "REF": - parsed_row[col] = parse_ontology([value], column_meta.get('refTableId'), schema)[0] - case "REF_ARRAY": - parsed_row[col] = parse_ontology(value, column_meta.get('refTableId'), schema) - case "REFBACK": - parsed_row[col] = 
parse_ontology(value, column_meta.get('refTableId'), schema) - case _: - parsed_row[col] = value - parsed_data.append(parsed_row) - return parsed_data From c3dd8c078a6b1ec47eb30ad356f9fd7c76c5a600 Mon Sep 17 00:00:00 2001 From: YpeZ Date: Wed, 29 Jan 2025 17:01:48 +0100 Subject: [PATCH 29/31] * replaced dtype 'bool' by 'boolean' for data type BOOL --- tools/pyclient/dev/directory-ref.py | 22 +++++++++++++++---- .../src/molgenis_emx2_pyclient/client.py | 5 +++-- .../src/molgenis_emx2_pyclient/utils.py | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/tools/pyclient/dev/directory-ref.py b/tools/pyclient/dev/directory-ref.py index b7eaf94155..a2db90ac84 100644 --- a/tools/pyclient/dev/directory-ref.py +++ b/tools/pyclient/dev/directory-ref.py @@ -16,6 +16,23 @@ from tools.pyclient.src.molgenis_emx2_pyclient.exceptions import (NoSuchSchemaException, NoSuchTableException, GraphQLException, PermissionDeniedException) +def get_collections(): + """Gets Collections from directory-demo in list[dict] format.""" + with Client(url="http://localhost:8080", schema="directory-demo") as client: + collections = client.get(table="Collections", as_df=False) + for col in collections: + for (k, v) in col.items(): + print(f"{k:30}: {v}") + +def get_orders(): + """Gets Orders from pet store in DataFrame format.""" + + with Client(url="http://localhost:8080", schema="pet store") as client: + orders = client.get(table="Order", as_df=True) + + print(orders.to_string()) + + async def main(): # Set up the logger @@ -27,10 +44,7 @@ async def main(): load_dotenv() token = os.environ.get('MG_TOKEN') - with Client(url="http://localhost:8080", schema="directory-demo") as client: - collections = client.get(table="Collections", as_df=False) - - pprint(collections) + get_orders() if __name__ == '__main__': diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index 01a33a0edb..a18dcdfa3b 100644 --- 
a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -6,6 +6,7 @@ from functools import cache from io import BytesIO +import numpy as np import pandas as pd import requests from requests import Response @@ -460,9 +461,9 @@ def get(self, response_data = pd.read_csv(BytesIO(response.content), keep_default_na=True) dtypes = {c: t for (c, t) in convert_dtypes(table_meta).items() if c in response_data.columns} - bool_columns = [c for (c, t) in dtypes.items() if t == 'bool'] + bool_columns = [c for (c, t) in dtypes.items() if t == 'boolean'] response_data[bool_columns] = response_data[bool_columns].replace({'true': True, 'false': False}) - response_data = response_data.astype(dtypes) + response_data = response_data.astype(dtypes).fillna(np.nan) if columns: try: diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index 90c10ae239..d5deee575d 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -48,7 +48,7 @@ def convert_dtypes(table_meta: Table) -> dict: type_map = {INT: 'Int64', DECIMAL: 'Float64', DATETIME: 'datetime64[ns]', - BOOL: 'bool'} + BOOL: 'boolean'} dtypes = {} for col in table_meta.columns: From dcccb0054ef99708fcfe4541d1318fa7457b28d2 Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Thu, 30 Jan 2025 13:26:55 +0100 Subject: [PATCH 30/31] * added data type LONG for conversion --- tools/pyclient/dev/directory-ref.py | 10 +++++++++- .../src/molgenis_emx2_pyclient/client.py | 2 +- .../src/molgenis_emx2_pyclient/constants.py | 2 ++ .../pyclient/src/molgenis_emx2_pyclient/utils.py | 16 ++++++++++------ 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/tools/pyclient/dev/directory-ref.py b/tools/pyclient/dev/directory-ref.py index a2db90ac84..7127a6df04 100644 --- a/tools/pyclient/dev/directory-ref.py +++ b/tools/pyclient/dev/directory-ref.py @@ -32,6 +32,14 
@@ def get_orders(): print(orders.to_string()) +def get_longlat(): + """Gets long/lat info from directory-demo::Biobanks""" + with Client(url="http://localhost:8080", schema="directory-demo") as client: + biobanks = client.get(table="Biobanks", as_df=True) + + print(biobanks.to_string()) + print(biobanks.dtypes) + async def main(): @@ -44,7 +52,7 @@ async def main(): load_dotenv() token = os.environ.get('MG_TOKEN') - get_orders() + get_longlat() if __name__ == '__main__': diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index a18dcdfa3b..b79bbe8ad9 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -463,7 +463,7 @@ def get(self, dtypes = {c: t for (c, t) in convert_dtypes(table_meta).items() if c in response_data.columns} bool_columns = [c for (c, t) in dtypes.items() if t == 'boolean'] response_data[bool_columns] = response_data[bool_columns].replace({'true': True, 'false': False}) - response_data = response_data.astype(dtypes).fillna(np.nan) + response_data = response_data.astype(dtypes) if columns: try: diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py index 05c0e0fc93..499481c73c 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/constants.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/constants.py @@ -9,6 +9,7 @@ TEXT = "TEXT" INT = "INT" DECIMAL = "DECIMAL" +LONG = "LONG" BOOL = "BOOL" HYPERLINK = "HYPERLINK" DATE = "DATE" @@ -21,6 +22,7 @@ TEXT_ARRAY = TEXT + _ARRAY INT_ARRAY = INT + _ARRAY DECIMAL_ARRAY = DECIMAL + _ARRAY +LONG_ARRAY = LONG + _ARRAY BOOL_ARRAY = BOOL + _ARRAY HYPERLINK_ARRAY = HYPERLINK + _ARRAY DATE_ARRAY = DATE + _ARRAY diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py index d5deee575d..7a63601018 100644 --- 
a/tools/pyclient/src/molgenis_emx2_pyclient/utils.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/utils.py @@ -3,8 +3,8 @@ """ import logging -from .constants import INT, DECIMAL, DATETIME, BOOL, ONTOLOGY -from .metadata import Table, Schema +from .constants import INT, DECIMAL, DATETIME, BOOL, LONG, STRING +from .metadata import Table def read_file(file_path: str) -> str: @@ -45,10 +45,14 @@ def parse_nested_pkeys(pkeys: list) -> str: def convert_dtypes(table_meta: Table) -> dict: """Parses column metadata of a table to a dictionary of column ids to pandas dtypes.""" - type_map = {INT: 'Int64', - DECIMAL: 'Float64', - DATETIME: 'datetime64[ns]', - BOOL: 'boolean'} + type_map = { + STRING: 'string', + INT: 'Int64', + LONG: 'Int64', + DECIMAL: 'Float64', + DATETIME: 'datetime64[ns]', + BOOL: 'boolean' + } dtypes = {} for col in table_meta.columns: From 7e7039665b4e3252c70acb256c392e5f8a36f10e Mon Sep 17 00:00:00 2001 From: ypezijlstra Date: Thu, 30 Jan 2025 18:51:05 +0100 Subject: [PATCH 31/31] * fixed issue with rounding of numeric values in string type columns --- tools/pyclient/src/molgenis_emx2_pyclient/client.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/pyclient/src/molgenis_emx2_pyclient/client.py b/tools/pyclient/src/molgenis_emx2_pyclient/client.py index b79bbe8ad9..c8082ebbdd 100644 --- a/tools/pyclient/src/molgenis_emx2_pyclient/client.py +++ b/tools/pyclient/src/molgenis_emx2_pyclient/client.py @@ -459,8 +459,10 @@ def get(self, fallback_error_message=f"Failed to retrieve data from {current_schema}::" f"{table!r}.\nStatus code: {response.status_code}.") - response_data = pd.read_csv(BytesIO(response.content), keep_default_na=True) - dtypes = {c: t for (c, t) in convert_dtypes(table_meta).items() if c in response_data.columns} + response_columns = pd.read_csv(BytesIO(response.content)).columns + dtypes = {c: t for (c, t) in convert_dtypes(table_meta).items() if c in response_columns} + response_data = 
pd.read_csv(BytesIO(response.content), keep_default_na=True, dtype=dtypes) + bool_columns = [c for (c, t) in dtypes.items() if t == 'boolean'] response_data[bool_columns] = response_data[bool_columns].replace({'true': True, 'false': False}) response_data = response_data.astype(dtypes)