Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes #2 and generates empty uns if asked for #3

Merged
merged 4 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 68 additions & 3 deletions src/dummy_anndata/generate_dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import anndata as ad

from collections.abc import Iterable

from .generate_matrix import matrix_generators
from .generate_vector import vector_generators
from .generate_dataframe import generate_dataframe
Expand All @@ -18,9 +20,62 @@ def generate_dataset(
obsp_types=None,
varp_types=None,
uns_types=None,
nested_uns_types=None,
):
"""
Generate a synthetic AnnData dataset with specified dimensions and data types.

Parameters:
-----------
n_obs : int, optional (default=10)
Number of observations (cells).
n_vars : int, optional (default=20)
Number of variables (genes).
x_type : str, optional (default="generate_integer_matrix")
Type of matrix to generate for the main data matrix `X`. Must be a key in `matrix_generators`.
layer_types : list of str, optional
Types of matrices to generate for layers. Each type must be a key in `matrix_generators`.
obs_types : list of str, optional
Types of vectors to generate for `obs`. Each type must be a key in `vector_generators`.
var_types : list of str, optional
Types of vectors to generate for `var`. Each type must be a key in `vector_generators`.
obsm_types : list of str, optional
Types of matrices or vectors to generate for `obsm`. Each type must be a key in `matrix_generators` or `vector_generators`.
varm_types : list of str, optional
Types of matrices or vectors to generate for `varm`. Each type must be a key in `matrix_generators` or `vector_generators`.
obsp_types : list of str, optional
Types of matrices to generate for `obsp`. Each type must be a key in `matrix_generators`.
varp_types : list of str, optional
Types of matrices to generate for `varp`. Each type must be a key in `matrix_generators`.
uns_types : list of str, optional
Types of data to generate for `uns`. Each type must be a key in `vector_generators`, `matrix_generators`, or `scalar_generators`.
nested_uns_types : list of str, optional
Types of data to generate for the nested `uns` dictionary. They will be a new dictionary at the key `nested`.
Each type must be a key in `vector_generators`, `matrix_generators`, or `scalar_generators`.

Returns:
--------
ad.AnnData
An AnnData object containing the generated dataset with the specified dimensions and data types.

Raises:
-------
AssertionError
If any of the specified types are not recognized by the corresponding generator dictionaries.
"""

assert x_type in matrix_generators, f"Unknown matrix type: {x_type}"

check_iterable_types(layer_types, "layer_types")
check_iterable_types(obs_types, "obs_types")
check_iterable_types(var_types, "var_types")
check_iterable_types(obsm_types, "obsm_types")
check_iterable_types(varm_types, "varm_types")
check_iterable_types(obsp_types, "obsp_types")
check_iterable_types(varp_types, "varp_types")
check_iterable_types(uns_types, "uns_types")
check_iterable_types(nested_uns_types, "nested_uns_types")

assert layer_types is None or all(
t in matrix_generators.keys() for t in layer_types
), "Unknown layer type"
Expand Down Expand Up @@ -55,11 +110,11 @@ def generate_dataset(
if obsm_types is None: # obsm_types are all matrices or vectors, except for categoricals and nullables
vector_not_allowed = set(["categorical", "categorical_ordered", "categorical_missing_values", "categorical_ordered_missing_values", \
"nullable_integer_array", "nullable_boolean_array"])
obsm_types = set(matrix_generators.keys()) - vector_not_allowed
obsm_types = set(matrix_generators.keys()) - vector_not_allowed
if varm_types is None: # varm_types are all matrices or vectors, except for categoricals and nullables
vector_not_allowed = set(["categorical", "categorical_ordered", "categorical_missing_values", "categorical_ordered_missing_values", \
"nullable_integer_array", "nullable_boolean_array"])
varm_types = set(matrix_generators.keys()) - vector_not_allowed
varm_types = set(matrix_generators.keys()) - vector_not_allowed
if obsp_types is None: # obsp_types are all matrices
obsp_types = list(matrix_generators.keys())
if varp_types is None: # varp_types are all matrices
Expand All @@ -70,6 +125,12 @@ def generate_dataset(
+ list(matrix_generators.keys())
+ list(scalar_generators.keys())
)
if nested_uns_types is None:
nested_uns_types = (
list(vector_generators.keys())
+ list(matrix_generators.keys())
+ list(scalar_generators.keys())
)

X = matrix_generators[x_type](n_obs, n_vars)
layers = {t: matrix_generators[t](n_obs, n_vars) for t in layer_types}
Expand Down Expand Up @@ -99,7 +160,7 @@ def generate_dataset(
obsp = {t: matrix_generators[t](n_obs, n_obs) for t in obsp_types}
varp = {t: matrix_generators[t](n_vars, n_vars) for t in varp_types}

uns = generate_dict(n_obs, n_vars, uns_types)
uns = generate_dict(n_obs, n_vars, uns_types, nested_uns_types)

return ad.AnnData(
X,
Expand All @@ -112,3 +173,7 @@ def generate_dataset(
varp=varp,
uns=uns,
)


def check_iterable_types(iterable_types, name):
    """
    Assert that a `*_types` argument is either `None` or a non-string iterable.

    Strings are rejected explicitly: a string is itself iterable, so it would
    otherwise pass the `Iterable` check and later be iterated character by
    character instead of being treated as a single type name.

    Parameters:
    -----------
    iterable_types : object
        The value to validate. `None` is accepted.
    name : str
        Name of the argument being validated; only used in the error message.

    Raises:
    -------
    AssertionError
        If `iterable_types` is not `None` and is a string or not an iterable.
    """
    if iterable_types is None:
        return

    is_non_string_iterable = isinstance(iterable_types, Iterable) and not isinstance(
        iterable_types, str
    )
    assert is_non_string_iterable, f"{name} should be a non-string iterable type"
31 changes: 19 additions & 12 deletions src/dummy_anndata/generate_dict.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from .generate_vector import vector_generators
from .generate_matrix import matrix_generators

import pandas as pd
import numpy as np

from .generate_matrix import matrix_generators
from .generate_vector import vector_generators

scalar_generators = {
"string": "version",
"char": "a",
Expand Down Expand Up @@ -32,19 +31,27 @@ def generate_type(type, n_rows, n_cols):
return None


def _all_uns_types():
    """Return every known type name: scalars, `scalar_`-prefixed vector names, vectors and matrices."""
    return (
        list(scalar_generators)
        + [f"scalar_{t}" for t in vector_generators]
        + list(vector_generators)
        + list(matrix_generators)
    )


def generate_dict(n_rows, n_cols, types=None, nested_uns_types=None):
    """
    Generate a dictionary of values keyed by type name, with an optional nested dictionary.

    Parameters:
    -----------
    n_rows : int
        Passed unchanged to `generate_type` for every entry.
    n_cols : int
        Passed unchanged to `generate_type` for every entry.
    types : iterable of str, optional
        Type names to generate at the top level. `None` selects every known
        type; an empty collection produces no top-level entries.
    nested_uns_types : iterable of str, optional
        Type names to generate inside the dictionary stored at the key
        `nested`. `None` selects every known type; an empty collection
        omits the `nested` key entirely.

    Returns:
    --------
    dict
        Maps each requested type name to the result of `generate_type`, plus
        a `nested` dictionary when `nested_uns_types` is non-empty.
    """
    # None means "everything"; an explicitly empty collection means "nothing".
    if types is None:
        types = _all_uns_types()
    if nested_uns_types is None:
        nested_uns_types = _all_uns_types()

    data = {}
    if types:
        data = {t: generate_type(t, n_rows, n_cols) for t in types}
    if nested_uns_types:
        # The recursive call passes an empty nested list, so nesting stops after one level.
        data["nested"] = generate_dict(
            n_rows, n_cols, types=nested_uns_types, nested_uns_types=[]
        )

    return data
14 changes: 12 additions & 2 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pytest

import dummy_anndata


Expand All @@ -13,3 +11,15 @@ def test_generating_dataset(tmp_path):
dummy = dummy_anndata.generate_dataset()
filename = tmp_path / "dummy.h5ad"
dummy.write_h5ad(filename)

def test_uns():
    """Check how `uns_types` and `nested_uns_types` control the contents of `uns`."""
    # Both lists empty: nothing is generated at all.
    dummy_empty = dummy_anndata.generate_dataset(uns_types=[], nested_uns_types=[])
    assert dummy_empty.uns == {}

    # Only the top level is emptied: the nested dictionary is still populated.
    dummy_nested = dummy_anndata.generate_dataset(uns_types=[])
    assert "nested" in dummy_nested.uns
    assert dummy_nested.uns["nested"] != {}

    # Only the nested level is emptied: the "nested" key must be absent.
    dummy_no_nested = dummy_anndata.generate_dataset(nested_uns_types=[])
    assert "nested" not in dummy_no_nested.uns


Loading