Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small adata Error fix #616

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 22 additions & 8 deletions dynamo/preprocessing/gene_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,14 +233,28 @@ def calc_dispersion_by_svr(

for layer in layers:
valid_CM, detected_bool = get_vaild_CM(adata, layer, **SVRs_kwargs)
if valid_CM is None:
continue

mean, cv = get_mean_cv(adata, valid_CM, algorithm, winsorize, winsor_perc)
fitted_fun, svr_gamma = get_prediction_by_svr(mean, cv, svr_gamma)
score = cv - fitted_fun(mean)
if sort_inverse:
score = -score
# valid_CM seems to be set to an empty sparse array when adata.var has too few rows,
# which a plain `is None` check does not catch. (An empty valid_CM leads to a divide-by-zero error in get_prediction_by_svr().)
# Note that `not valid_CM.toarray()` results in a "truth value of array..." error because the operand is a numpy array,
# and `not valid_CM.toarray().tolist()` doesn't work either, because a list of empty lists is itself non-empty, so the expression still gives False.
if valid_CM is None or valid_CM.shape[1] == 0:
main_warning("No valid_CM for layer " + layer)

# Do not `continue` here: if all layers are skipped, there will be no "score" column,
# causing a KeyError during preprocessing.

# Temporary placeholder values.
# adata.shape[1] == valid_CM.shape[1] == adata.var.shape[0]
temp = np.full((adata.shape[1],), 0)
mean, cv = np.full((adata.shape[1],1), 0), temp
_, svr_gamma = None, temp
score = temp
else:
mean, cv = get_mean_cv(adata, valid_CM, algorithm, winsorize, winsor_perc)
fitted_fun, svr_gamma = get_prediction_by_svr(mean, cv, svr_gamma)
score = cv - fitted_fun(mean)
if sort_inverse:
score = -score

# Now we can get "SVR" from get_prediction_by_svr
key = "velocyto_SVR" if layer == "raw" or layer == "X" else layer + "_velocyto_SVR"
Expand Down
14 changes: 11 additions & 3 deletions dynamo/preprocessing/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from sklearn.utils.sparsefuncs import mean_variance_axis

from ..configuration import DKM
from ..dynamo_logger import main_info_insert_adata_obsm, main_info_insert_adata_var
from ..dynamo_logger import main_info_insert_adata_obsm, main_info_insert_adata_var, main_warning


def _truncatedSVD_with_center(
Expand Down Expand Up @@ -46,7 +46,9 @@ def _truncatedSVD_with_center(
random_state = check_random_state(random_state)
np.random.set_state(random_state.get_state())
v0 = random_state.uniform(-1, 1, np.min(X.shape))
n_components = min(n_components, X.shape[1] - 1)
# svds() requires 0 < k < min(X.shape).
# min(X.shape) (i.e. X.shape[0]) is <= 30 when adata.obs is pruned to <= 30 rows.
n_components = min(n_components, min(X.shape) - 1)

mean = X.mean(0)
X_H = X.T.conj()
Expand Down Expand Up @@ -241,7 +243,13 @@ def pca(
adata.var.iloc[bad_genes, adata.var.columns.tolist().index("use_for_pca")] = False
X_data = X_data[:, valid_ind]

if use_incremental_PCA:
if 0 in X_data.shape:
main_warning("No genes passed filter, ABORTING PCA REDUCTION.")
if return_all:
return adata, None, None
else:
return adata
elif use_incremental_PCA:
from sklearn.decomposition import IncrementalPCA

fit, X_pca = _pca_fit(
Expand Down
7 changes: 7 additions & 0 deletions dynamo/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,13 @@ def get_svr_filter(
valid_idx = np.where(np.isfinite(adata.var.loc[:, score_name]))[0]

valid_table = adata.var.iloc[valid_idx, :]
if len(valid_table) == 0:
main_warning("No gene with valid svr scores")
if return_adata:
return adata
else:
return np.zeros(adata.n_vars, dtype=bool)

nth_score = np.sort(valid_table.loc[:, score_name])[::-1][np.min((n_top_genes - 1, valid_table.shape[0] - 1))]

feature_gene_idx = np.where(valid_table.loc[:, score_name] >= nth_score)[0][:n_top_genes]
Expand Down