From e15fcf0f4b76b2e710a869af2bf563f93f646a67 Mon Sep 17 00:00:00 2001
From: Louise Deconinck
Date: Thu, 19 Sep 2024 21:02:26 +0200
Subject: [PATCH] Add generator scripts

---
 src/dummy_anndata/__init__.py           |   8 +-
 src/dummy_anndata/generate_dataframe.py |  23 +++++
 src/dummy_anndata/generate_dataset.py   | 110 ++++++++++++++++++++++++
 src/dummy_anndata/generate_dict.py      |  50 +++++++++++
 src/dummy_anndata/generate_matrix.py    |  65 ++++++++++++++
 src/dummy_anndata/generate_vector.py    |  73 ++++++++++++++++
 src/dummy_anndata/pl/__init__.py        |   1 -
 src/dummy_anndata/pl/basic.py           |  63 --------------
 src/dummy_anndata/pp/__init__.py        |   1 -
 src/dummy_anndata/pp/basic.py           |  17 ----
 src/dummy_anndata/tl/__init__.py        |   1 -
 src/dummy_anndata/tl/basic.py           |  17 ----
 12 files changed, 326 insertions(+), 103 deletions(-)
 create mode 100644 src/dummy_anndata/generate_dataframe.py
 create mode 100644 src/dummy_anndata/generate_dataset.py
 create mode 100644 src/dummy_anndata/generate_dict.py
 create mode 100644 src/dummy_anndata/generate_matrix.py
 create mode 100644 src/dummy_anndata/generate_vector.py
 delete mode 100644 src/dummy_anndata/pl/__init__.py
 delete mode 100644 src/dummy_anndata/pl/basic.py
 delete mode 100644 src/dummy_anndata/pp/__init__.py
 delete mode 100644 src/dummy_anndata/pp/basic.py
 delete mode 100644 src/dummy_anndata/tl/__init__.py
 delete mode 100644 src/dummy_anndata/tl/basic.py

diff --git a/src/dummy_anndata/__init__.py b/src/dummy_anndata/__init__.py
index 63f9a31..35a2311 100644
--- a/src/dummy_anndata/__init__.py
+++ b/src/dummy_anndata/__init__.py
@@ -1,7 +1,9 @@
 from importlib.metadata import version
 
-from . import pl, pp, tl
-
-__all__ = ["pl", "pp", "tl"]
+from .generate_dataframe import generate_dataframe
+from .generate_dataset import generate_dataset
+from .generate_dict import generate_scalar, generate_type, generate_dict
+from .generate_matrix import generate_matrix
+from .generate_vector import generate_vector
 
 __version__ = version("dummy-anndata")
diff --git a/src/dummy_anndata/generate_dataframe.py b/src/dummy_anndata/generate_dataframe.py
new file mode 100644
index 0000000..509053a
--- /dev/null
+++ b/src/dummy_anndata/generate_dataframe.py
@@ -0,0 +1,23 @@
+import pandas as pd
+from .generate_vector import vector_generators
+
+
+def generate_dataframe(n_rows, types=None):
+    """
+    Generate a pandas DataFrame with specified number of rows and column types.
+
+    Parameters:
+        n_rows (int): The number of rows in the DataFrame.
+        types (list, optional): A list of column types to include in the DataFrame.
+            Choose from the list of vector_generators keys.
+            If not provided, all available column types will be included.
+
+    Returns:
+        pandas.DataFrame: The generated DataFrame.
+
+    """
+    if types is None:
+        types = list(vector_generators.keys())
+
+    data = {t: vector_generators[t](n_rows) for t in types}
+    return pd.DataFrame(data)
diff --git a/src/dummy_anndata/generate_dataset.py b/src/dummy_anndata/generate_dataset.py
new file mode 100644
index 0000000..0092956
--- /dev/null
+++ b/src/dummy_anndata/generate_dataset.py
@@ -0,0 +1,110 @@
+import anndata as ad
+
+from .generate_matrix import matrix_generators
+from .generate_vector import vector_generators
+from .generate_dataframe import generate_dataframe
+from .generate_dict import scalar_generators, generate_dict
+
+
+def generate_dataset(
+    n_obs=10,
+    n_vars=20,
+    x_type="generate_integer_matrix",
+    layer_types=None,
+    obs_types=None,
+    var_types=None,
+    obsm_types=None,
+    varm_types=None,
+    obsp_types=None,
+    varp_types=None,
+    uns_types=None,
+):
+
+    assert x_type in matrix_generators, f"Unknown matrix type: {x_type}"
+    assert layer_types is None or all(
+        t in matrix_generators.keys() for t in layer_types
+    ), "Unknown layer type"
+    assert obs_types is None or all(
+        t in vector_generators.keys() for t in obs_types
+    ), "Unknown obs type"
+    assert var_types is None or all(
+        t in vector_generators.keys() for t in var_types
+    ), "Unknown var type"
+    assert obsm_types is None or all(
+        t in matrix_generators.keys() or t in vector_generators.keys()
+        for t in obsm_types
+    ), "Unknown obsm type"
+    assert varm_types is None or all(
+        t in matrix_generators.keys() or t in vector_generators.keys()
+        for t in varm_types
+    ), "Unknown varm type"
+    assert obsp_types is None or all(
+        t in matrix_generators.keys() for t in obsp_types
+    ), "Unknown obsp type"
+    assert varp_types is None or all(
+        t in matrix_generators.keys() for t in varp_types
+    ), "Unknown varp type"
+    # TODO uns types
+
+    if layer_types is None:  # layer_types are all matrices
+        layer_types = list(matrix_generators.keys())
+    if obs_types is None:  # obs_types are all vectors
+        obs_types = list(vector_generators.keys())
+    if var_types is None:  # var_types are all vectors
+        var_types = list(vector_generators.keys())
+    if obsm_types is None:  # obsm_types are all matrices or vectors
+        obsm_types = list(matrix_generators.keys()) + list(vector_generators.keys())
+    if varm_types is None:  # varm_types are all matrices or vectors
+        varm_types = list(matrix_generators.keys()) + list(vector_generators.keys())
+    if obsp_types is None:  # obsp_types are all matrices
+        obsp_types = list(matrix_generators.keys())
+    if varp_types is None:  # varp_types are all matrices
+        varp_types = list(matrix_generators.keys())
+    if uns_types is None:
+        uns_types = (
+            list(vector_generators.keys())
+            + list(matrix_generators.keys())
+            + list(scalar_generators.keys())
+        )
+
+    X = matrix_generators[x_type](n_obs, n_vars)
+    layers = {t: matrix_generators[t](n_obs, n_vars) for t in layer_types}
+
+    obs_names = [f"Cell{i:03d}" for i in range(n_obs)]
+    var_names = [f"Gene{i:03d}" for i in range(n_vars)]
+
+    obs = generate_dataframe(n_obs, obs_types)
+    var = generate_dataframe(n_vars, var_types)
+    obs.index = obs_names
+    var.index = var_names
+
+    obsm = {}
+    for t in obsm_types:
+        if t in matrix_generators.keys():
+            obsm[t] = matrix_generators[t](n_obs, n_obs)
+        elif t in vector_generators.keys():
+            obsm[t] = vector_generators[t](n_obs)
+
+    varm = {}
+    for t in varm_types:
+        if t in matrix_generators.keys():
+            varm[t] = matrix_generators[t](n_vars, n_vars)
+        elif t in vector_generators.keys():
+            varm[t] = vector_generators[t](n_vars)
+
+    obsp = {t: matrix_generators[t](n_obs, n_obs) for t in obsp_types}
+    varp = {t: matrix_generators[t](n_vars, n_vars) for t in varp_types}
+
+    uns = generate_dict(n_obs, n_vars, uns_types)
+
+    return ad.AnnData(
+        X,
+        layers=layers,
+        obs=obs,
+        var=var,
+        obsm=obsm,
+        varm=varm,
+        obsp=obsp,
+        varp=varp,
+        uns=uns,
+    )
diff --git a/src/dummy_anndata/generate_dict.py b/src/dummy_anndata/generate_dict.py
new file mode 100644
index 0000000..85d55cd
--- /dev/null
+++ b/src/dummy_anndata/generate_dict.py
@@ -0,0 +1,50 @@
+from .generate_vector import vector_generators
+from .generate_matrix import matrix_generators
+
+import pandas as pd
+import numpy as np
+
+scalar_generators = {
+    "string": "version",
+    "char": "a",
+    "integer": 1,
+    "float": 1.0,
+    "boolean": True,
+    "none": None,
+    # "NA": pd.NA, cannot write to h5 group
+    "nan": np.nan,
+}
+
+
+def generate_scalar(scalar_type):
+    if scalar_type[:7] == "scalar_":
+        return vector_generators[scalar_type[7:]](1)
+    return scalar_generators[scalar_type]
+
+
+def generate_type(type, n_rows, n_cols):
+    if type in scalar_generators or type[:7] == "scalar_":
+        return generate_scalar(type)
+    if type in vector_generators:
+        return vector_generators[type](n_rows)
+    if type in matrix_generators:
+        return matrix_generators[type](n_rows, n_cols)
+    return None
+
+
+def generate_dict(n_rows, n_cols, types=None, nested=True):
+    if types is None:  # default: all scalars, all vectors and all matrices
+        scalar_types = list(scalar_generators.keys()) + [
+            f"scalar_{t}" for t in vector_generators.keys()
+        ]
+        types = (
+            scalar_types
+            + list(vector_generators.keys())
+            + list(matrix_generators.keys())
+        )
+
+    data = {t: generate_type(t, n_rows, n_cols) for t in types}
+    if nested:
+        data["nested"] = generate_dict(n_rows, n_cols, types, False)
+
+    return data
diff --git a/src/dummy_anndata/generate_matrix.py b/src/dummy_anndata/generate_matrix.py
new file mode 100644
index 0000000..a6b2ccc
--- /dev/null
+++ b/src/dummy_anndata/generate_matrix.py
@@ -0,0 +1,65 @@
+import numpy as np
+import scipy as sp
+
+
+def float_mtx(n_obs, n_vars, NAs=False):
+    # add 0.5 to easily spot conversion issues
+    mtx = np.arange(n_obs * n_vars, dtype=float).reshape(n_obs, n_vars) + 0.5
+    if NAs:  # numpy matrices do not support pd.NA
+        mtx[0, 0] = np.nan
+    return mtx
+
+
+def int_mtx(n_obs, n_vars):
+    mtx = np.arange(n_obs * n_vars).reshape(n_obs, n_vars)
+    return mtx
+
+
+# Possible matrix generators
+# integer matrices do not support NAs in Python
+matrix_generators = {
+    "generate_float_matrix": lambda n_obs, n_vars: float_mtx(n_obs, n_vars),
+    "generate_float_matrix_nas": lambda n_obs, n_vars: float_mtx(
+        n_obs, n_vars, NAs=True
+    ),
+    "generate_float_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(
+        float_mtx(n_obs, n_vars)
+    ),
+    "generate_float_csparse_nas": lambda n_obs, n_vars: sp.sparse.csc_matrix(
+        float_mtx(n_obs, n_vars, NAs=True)
+    ),
+    "generate_float_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(
+        float_mtx(n_obs, n_vars)
+    ),
+    "generate_float_rsparse_nas": lambda n_obs, n_vars: sp.sparse.csr_matrix(
+        float_mtx(n_obs, n_vars, NAs=True)
+    ),
+    "generate_integer_matrix": lambda n_obs, n_vars: int_mtx(n_obs, n_vars),
+    "generate_integer_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(
+        int_mtx(n_obs, n_vars)
+    ),
+    "generate_integer_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(
+        int_mtx(n_obs, n_vars)
+    ),
+}
+
+
+def generate_matrix(n_obs, n_vars, matrix_type):
+    """
+    Generate a matrix of given dimensions and type.
+
+    Parameters:
+        n_obs (int): The number of observations (rows) in the matrix.
+        n_vars (int): The number of variables (columns) in the matrix.
+        matrix_type (str): The type of matrix to generate.
+
+    Returns:
+        The generated matrix, either numpy.ndarray or scipy.sparse.csc_matrix or scipy.sparse.csr_matrix.
+
+    Raises:
+        AssertionError: If the matrix_type is unknown.
+
+    """
+    assert matrix_type in matrix_generators, f"Unknown matrix type: {matrix_type}"
+
+    return matrix_generators[matrix_type](n_obs, n_vars)
diff --git a/src/dummy_anndata/generate_vector.py b/src/dummy_anndata/generate_vector.py
new file mode 100644
index 0000000..09132c9
--- /dev/null
+++ b/src/dummy_anndata/generate_vector.py
@@ -0,0 +1,73 @@
+import pandas as pd
+import numpy as np
+
+
+def nullable_integer_array(n):
+    assert n > 0, "an integer array must be at least one value"
+    nullable_array = [i for i in range(n)]
+    # np.nan, pd.NA and None should all end up as null values, masked in the h5ad file
+    nullable_array[0] = np.nan
+    return pd.array(nullable_array, dtype="Int64")
+
+
+def nullable_boolean_array(n):
+    assert n > 0, "a boolean array must be at least one value"
+    nullable_array = pd.array([[True, False][i % 2] for i in range(n)], dtype="boolean")
+    # np.nan, pd.NA and None should all end up as null values, masked in the h5ad file
+    nullable_array[0] = pd.NA
+    return nullable_array
+
+
+def missing_values_categorical(n, ordered=True):
+    assert n > 0, "a categorical must be at least one value"
+    missing_values = pd.Categorical(
+        [["Value1", "Value2"][i % 2] for i in range(n)],
+        categories=["Value1", "Value2"],
+        ordered=ordered,
+    )
+    # They should all end up as code -1 in the h5ad file
+    missing_values[0] = np.nan
+    return missing_values
+
+
+vector_generators = {
+    "categorical": lambda n: pd.Categorical(
+        [["Value1", "Value2"][i % 2] for i in range(n)]
+    ),
+    "categorical_ordered": lambda n: pd.Categorical(
+        [["Value1", "Value2"][i % 2] for i in range(n)], ordered=True
+    ),
+    "categorical_missing_values": lambda n: missing_values_categorical(
+        n, ordered=False
+    ),
+    "categorical_ordered_missing_values": lambda n: missing_values_categorical(
+        n, ordered=True
+    ),
+    "string_array": lambda n: np.array([f"value_{i}" for i in range(n)]),
+    # should we also check a 1d sparse array? We should probably leave it for the matrix generation?
+    "dense_array": lambda n: np.arange(n, dtype=float) + 0.5,
+    "integer_array": lambda n: np.array([i for i in range(n)]),
+    "nullable_integer_array": nullable_integer_array,
+    "boolean_array": lambda n: np.array([[True, False][i % 2] for i in range(n)]),
+    "nullable_boolean_array": nullable_boolean_array,
+}
+
+
+def generate_vector(n, vector_type):
+    """
+    Generate a vector of a specified type.
+
+    Parameters:
+        n (int): The length of the vector.
+        vector_type (str): The type of vector to generate.
+
+    Returns:
+        numpy.ndarray, pandas array or pandas.Categorical: The generated vector.
+
+    Raises:
+        AssertionError: If the vector_type is unknown.
+    """
+    # check if vector_type is valid
+    assert vector_type in vector_generators, f"Unknown vector type: {vector_type}"
+
+    return vector_generators[vector_type](n)
diff --git a/src/dummy_anndata/pl/__init__.py b/src/dummy_anndata/pl/__init__.py
deleted file mode 100644
index c2315dd..0000000
--- a/src/dummy_anndata/pl/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .basic import BasicClass, basic_plot
diff --git a/src/dummy_anndata/pl/basic.py b/src/dummy_anndata/pl/basic.py
deleted file mode 100644
index ed390ef..0000000
--- a/src/dummy_anndata/pl/basic.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from anndata import AnnData
-
-
-def basic_plot(adata: AnnData) -> int:
-    """Generate a basic plot for an AnnData object.
-
-    Parameters
-    ----------
-    adata
-        The AnnData object to preprocess.
-
-    Returns
-    -------
-    Some integer value.
-    """
-    print("Import matplotlib and implement a plotting function here.")
-    return 0
-
-
-class BasicClass:
-    """A basic class.
-
-    Parameters
-    ----------
-    adata
-        The AnnData object to preprocess.
-    """
-
-    my_attribute: str = "Some attribute."
-    my_other_attribute: int = 0
-
-    def __init__(self, adata: AnnData):
-        print("Implement a class here.")
-
-    def my_method(self, param: int) -> int:
-        """A basic method.
-
-        Parameters
-        ----------
-        param
-            A parameter.
-
-        Returns
-        -------
-        Some integer value.
-        """
-        print("Implement a method here.")
-        return 0
-
-    def my_other_method(self, param: str) -> str:
-        """Another basic method.
-
-        Parameters
-        ----------
-        param
-            A parameter.
-
-        Returns
-        -------
-        Some integer value.
-        """
-        print("Implement a method here.")
-        return ""
diff --git a/src/dummy_anndata/pp/__init__.py b/src/dummy_anndata/pp/__init__.py
deleted file mode 100644
index 5e7e293..0000000
--- a/src/dummy_anndata/pp/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .basic import basic_preproc
diff --git a/src/dummy_anndata/pp/basic.py b/src/dummy_anndata/pp/basic.py
deleted file mode 100644
index 5db1ec0..0000000
--- a/src/dummy_anndata/pp/basic.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from anndata import AnnData
-
-
-def basic_preproc(adata: AnnData) -> int:
-    """Run a basic preprocessing on the AnnData object.
-
-    Parameters
-    ----------
-    adata
-        The AnnData object to preprocess.
-
-    Returns
-    -------
-    Some integer value.
-    """
-    print("Implement a preprocessing function here.")
-    return 0
diff --git a/src/dummy_anndata/tl/__init__.py b/src/dummy_anndata/tl/__init__.py
deleted file mode 100644
index 95a32cd..0000000
--- a/src/dummy_anndata/tl/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .basic import basic_tool
diff --git a/src/dummy_anndata/tl/basic.py b/src/dummy_anndata/tl/basic.py
deleted file mode 100644
index d215ade..0000000
--- a/src/dummy_anndata/tl/basic.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from anndata import AnnData
-
-
-def basic_tool(adata: AnnData) -> int:
-    """Run a tool on the AnnData object.
-
-    Parameters
-    ----------
-    adata
-        The AnnData object to preprocess.
-
-    Returns
-    -------
-    Some integer value.
-    """
-    print("Implement a tool to run on the AnnData object.")
-    return 0