From e21ef179ddc3d0a406a5d2dc5f5d7af1cf5fde1a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 14 Oct 2024 19:45:23 +0000
Subject: [PATCH] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .github/workflows/test.yaml.rej            |   8 +-
 README.md.rej                              |   6 +-
 docs/conf.py.rej                           |  10 +-
 pyproject.toml.rej                         |   4 +-
 sobolev_alignment/feature_analysis.py      |  33 +-
 .../generate_artificial_sample.py          |  20 +-
 sobolev_alignment/interpolated_features.py |  22 +-
 sobolev_alignment/krr_approx.py            |  60 +-
 sobolev_alignment/krr_model_selection.py   |  72 ++-
 sobolev_alignment/multi_krr_approx.py      |   9 +-
 sobolev_alignment/scvi_model_search.py     |  24 +-
 sobolev_alignment/sobolev_alignment.py     | 593 +++++++++++++-----
 tests/test_krr_approx.py                   |  78 ++-
 tests/test_sobolev_alignment.py            |  66 +-
 tutorials/process_data.ipynb               |  10 +-
 tutorials/tutorial_simple.ipynb            |  17 +-
 16 files changed, 769 insertions(+), 263 deletions(-)

diff --git a/.github/workflows/test.yaml.rej b/.github/workflows/test.yaml.rej
index b8974f9..20bc751 100644
--- a/.github/workflows/test.yaml.rej
+++ b/.github/workflows/test.yaml.rej
@@ -1,7 +1,7 @@
 diff a/.github/workflows/test.yaml b/.github/workflows/test.yaml (rejected hunks)
 @@ -1,53 +1,67 @@
  name: Test
-
+
  on:
-  push:
-    branches: [main]
@@ -13,13 +13,13 @@ diff a/.github/workflows/test.yaml b/.github/workflows/test.yaml (rejected hunks)
+    branches: [main]
+  schedule:
+    - cron: "0 5 1,15 * *"
-
+
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
-
+
 jobs:
-  test:
-    runs-on: ${{ matrix.os }}
@@ -37,7 +37,7 @@ diff a/.github/workflows/test.yaml b/.github/workflows/test.yaml (rejected hunks)
+    defaults:
+      run:
+        shell: bash -e {0} # -e to fail on error
-
+
+    strategy:
+      fail-fast: false
+      matrix:
diff --git a/README.md.rej b/README.md.rej
index 1af2520..152b23c 100644
--- a/README.md.rej
+++ b/README.md.rej
@@ -1,10 +1,10 @@
 diff a/README.md b/README.md (rejected hunks)
 @@ -17,7 +17,7 @@ Please refer to the [documentation][link-docs]. In particular, the
-
+
  ## Installation
-
+
 -You need to have Python 3.8 or newer installed on your system. If you don't have
 +You need to have Python 3.10 or newer installed on your system. If you don't have
  Python installed, we recommend installing [Mambaforge](https://github.com/conda-forge/miniforge#mambaforge).
- + There are several alternative options to install sobolev_alignment: diff --git a/docs/conf.py.rej b/docs/conf.py.rej index b614a3d..d26d26d 100644 --- a/docs/conf.py.rej +++ b/docs/conf.py.rej @@ -7,7 +7,7 @@ diff a/docs/conf.py b/docs/conf.py (rejected hunks) # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html @@ -36,10 +36,10 @@ needs_sphinx = "4.0" - + html_context = { "display_github": True, # Integrate GitHub - "github_user": "saroudant", # Username @@ -19,7 +19,7 @@ diff a/docs/conf.py b/docs/conf.py (rejected hunks) + "github_version": "main", + "conf_py_path": "/docs/", } - + # -- General configuration --------------------------------------------------- @@ -57,6 +57,7 @@ extensions = [ "sphinx_autodoc_typehints", @@ -28,7 +28,7 @@ diff a/docs/conf.py b/docs/conf.py (rejected hunks) + "sphinxext.opengraph", *[p.stem for p in (HERE / "extensions").glob("*.py")], ] - + @@ -108,12 +109,15 @@ exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"] # html_theme = "sphinx_book_theme" @@ -36,14 +36,14 @@ diff a/docs/conf.py b/docs/conf.py (rejected hunks) +html_css_files = ["css/custom.css"] + html_title = project_name - + html_theme_options = { "repository_url": repository_url, "use_repository_button": True, "path_to_docs": "docs/", + "navigation_with_keys": False, } - + pygments_style = "default" @@ -123,18 +127,3 @@ nitpick_ignore = [ # you can add an exception to this list. diff --git a/pyproject.toml.rej b/pyproject.toml.rej index 14cef03..1192599 100644 --- a/pyproject.toml.rej +++ b/pyproject.toml.rej @@ -15,7 +15,7 @@ diff a/pyproject.toml b/pyproject.toml (rejected hunks) - "session-info" + "session-info", ] - + [project.optional-dependencies] dev = [ - # CLI for bumping the version number @@ -44,5 +44,5 @@ diff a/pyproject.toml b/pyproject.toml (rejected hunks) - "pytest-cov", + "coverage", ] - + [tool.coverage.run] diff --git a/sobolev_alignment/feature_analysis.py b/sobolev_alignment/feature_analysis.py index 10f5356..d495e40 100644 --- a/sobolev_alignment/feature_analysis.py +++ b/sobolev_alignment/feature_analysis.py @@ -79,7 +79,9 @@ def higher_order_contribution( # Compute features by iterating over possible combinations logging.info("\t START FEATURES") - combinations_features = Parallel(n_jobs=n_jobs, verbose=1, max_nbytes=1e6, pre_dispatch=int(1.5 * n_jobs))( + combinations_features = Parallel( + n_jobs=n_jobs, verbose=1, max_nbytes=1e6, pre_dispatch=int(1.5 * n_jobs) + )( delayed(combinatorial_product)(sparse_data, x, gamma) for x in combinations_with_replacement(np.arange(sparse_data.shape[1]), r=d) ) @@ -98,10 +100,18 @@ def higher_order_contribution( # Return names of each features. logging.info("\t\t FIND NAMES") combinations_names = Parallel( - n_jobs=min(5, n_jobs), verbose=1, max_nbytes=1e4, pre_dispatch=int(1.5 * min(5, n_jobs)) - )(delayed(_interaction_name)(x) for x in combinations_with_replacement(gene_names, r=d)) + n_jobs=min(5, n_jobs), + verbose=1, + max_nbytes=1e4, + pre_dispatch=int(1.5 * min(5, n_jobs)), + )( + delayed(_interaction_name)(x) + for x in combinations_with_replacement(gene_names, r=d) + ) - return pd.DataFrame.sparse.from_spmatrix(data=combinations_features, columns=combinations_names) + return pd.DataFrame.sparse.from_spmatrix( + data=combinations_features, columns=combinations_names + ) def _combination_to_idx(idx, p): @@ -177,7 +187,11 @@ def combinatorial_product(x, idx, gamma): Values of the higher order feature. 
""" # Iterate over all genes and compute the feature weight by multiplication - prod = [basis(x[:, i], k, gamma) for i, k in enumerate(_combination_to_idx(idx, x.shape[1])) if k > 0] + prod = [ + basis(x[:, i], k, gamma) + for i, k in enumerate(_combination_to_idx(idx, x.shape[1])) + if k > 0 + ] if len(prod) == 0: return 1 @@ -185,12 +199,17 @@ def combinatorial_product(x, idx, gamma): def _interaction_name(gene_combi): - combin_name = [f"{g}^{r}" for g, r in zip(*np.unique(gene_combi, return_counts=True))] + combin_name = [ + f"{g}^{r}" for g, r in zip(*np.unique(gene_combi, return_counts=True)) + ] return "*".join(combin_name) if len(combin_name) > 0 else "1" def _higher_order_interaction_wrapper(data, x, gamma, gene_names): - return [combinatorial_product(data, x, gamma), _interaction_name(gene_names, _combination_to_idx(x, data.shape[1]))] + return [ + combinatorial_product(data, x, gamma), + _interaction_name(gene_names, _combination_to_idx(x, data.shape[1])), + ] def _compute_offset(data, gamma): diff --git a/sobolev_alignment/generate_artificial_sample.py b/sobolev_alignment/generate_artificial_sample.py index e15a81b..dd92448 100644 --- a/sobolev_alignment/generate_artificial_sample.py +++ b/sobolev_alignment/generate_artificial_sample.py @@ -66,7 +66,9 @@ def generate_samples( batch_name_ids = [batch_key_dict[n] for n in batch_names] batch_name_ids = torch.Tensor(np.array(batch_name_ids).reshape(-1, 1)) # Recover log library size (exponential) - lib_size_samples = np.array([np.random.choice(lib_size[n], 1)[0] for n in batch_names]) + lib_size_samples = np.array( + [np.random.choice(lib_size[n], 1)[0] for n in batch_names] + ) lib_size_samples = np.log(lib_size_samples) else: batch_name_ids = None @@ -82,7 +84,11 @@ def generate_samples( cont_covs = torch.Tensor(covariates_values) # Generate random noise - z = torch.Tensor(np.random.normal(size=(int(sample_size), model.init_params_["non_kwargs"]["n_latent"]))) + z = torch.Tensor( + np.random.normal( + size=(int(sample_size), model.init_params_["non_kwargs"]["n_latent"]) + ) + ) dist_param_samples = model.module.generative( z=z, library=torch.Tensor(np.array(lib_size_samples).reshape(-1, 1)), @@ -156,8 +162,14 @@ def parallel_generate_samples( results = Parallel(n_jobs=n_jobs, verbose=1)( delayed(generate_samples)( sample_size=batch_size, - batch_names=batch_names[i : i + batch_size] if batch_names is not None else None, - covariates_values=covariates_values[i : i + batch_size] if covariates_values is not None else None, + batch_names=( + batch_names[i : i + batch_size] if batch_names is not None else None + ), + covariates_values=( + covariates_values[i : i + batch_size] + if covariates_values is not None + else None + ), lib_size=lib_size, model=model, batch_key_dict=batch_key_dict, diff --git a/sobolev_alignment/interpolated_features.py b/sobolev_alignment/interpolated_features.py index 8ee82b9..73e3db8 100644 --- a/sobolev_alignment/interpolated_features.py +++ b/sobolev_alignment/interpolated_features.py @@ -9,7 +9,9 @@ import scipy -def compute_optimal_tau(PV_number, pv_projections, principal_angles, n_interpolation=100): +def compute_optimal_tau( + PV_number, pv_projections, principal_angles, n_interpolation=100 +): """Compute the optimal interpolation step for each PV (Grassmann interpolation).""" ks_statistics = {} for tau_step in np.linspace(0, 1, n_interpolation + 1): @@ -25,12 +27,22 @@ def compute_optimal_tau(PV_number, pv_projections, principal_angles, n_interpola def project_on_interpolate_PV(angle, PV_number, 
tau_step, pv_projections): """Project data on interpolated PVs.""" - source_proj = np.sin((1 - tau_step) * angle) * pv_projections["source"]["source"][:, PV_number] - source_proj += np.sin(tau_step * angle) * pv_projections["target"]["source"][:, PV_number] + source_proj = ( + np.sin((1 - tau_step) * angle) + * pv_projections["source"]["source"][:, PV_number] + ) + source_proj += ( + np.sin(tau_step * angle) * pv_projections["target"]["source"][:, PV_number] + ) source_proj /= np.sin(angle) - target_proj = np.sin((1 - tau_step) * angle) * pv_projections["source"]["target"][:, PV_number] - target_proj += np.sin(tau_step * angle) * pv_projections["target"]["target"][:, PV_number] + target_proj = ( + np.sin((1 - tau_step) * angle) + * pv_projections["source"]["target"][:, PV_number] + ) + target_proj += ( + np.sin(tau_step * angle) * pv_projections["target"]["target"][:, PV_number] + ) target_proj /= np.sin(angle) return source_proj, target_proj diff --git a/sobolev_alignment/krr_approx.py b/sobolev_alignment/krr_approx.py index ab7928f..9d6164b 100644 --- a/sobolev_alignment/krr_approx.py +++ b/sobolev_alignment/krr_approx.py @@ -34,7 +34,10 @@ FALKON_IMPORTED = True except ImportError: FALKON_IMPORTED = False - print("FALKON NOT INSTALLED, OR NOT IMPORTED. USING FALKON WOULD RESULT IN BETTER PERFORMANCE.", flush=True) + print( + "FALKON NOT INSTALLED, OR NOT IMPORTED. USING FALKON WOULD RESULT IN BETTER PERFORMANCE.", + flush=True, + ) from sklearn.gaussian_process.kernels import Matern, PairwiseKernel from sklearn.kernel_ridge import KernelRidge @@ -133,7 +136,11 @@ def __init__( # Set kernel self.kernel = kernel - self.kernel_params = kernel_params if kernel_params else self.default_kernel_params[self.method][self.kernel] + self.kernel_params = ( + kernel_params + if kernel_params + else self.default_kernel_params[self.method][self.kernel] + ) self._make_kernel() # Set penalization parameters @@ -147,7 +154,9 @@ def __init__( # Preprocessing self.mean_center = mean_center self.unit_std = unit_std - self.pre_process_ = StandardScaler(with_mean=mean_center, with_std=unit_std, copy=False) + self.pre_process_ = StandardScaler( + with_mean=mean_center, with_std=unit_std, copy=False + ) def _make_kernel(self): """ @@ -160,9 +169,13 @@ def _make_kernel(self): # scikit-learn initialization if self.method.lower() == "sklearn": if self.sklearn_kernel[self.kernel.lower()] != "wrapper": - self.kernel_ = self.sklearn_kernel[self.kernel.lower()](**self.kernel_params) + self.kernel_ = self.sklearn_kernel[self.kernel.lower()]( + **self.kernel_params + ) else: - self.kernel_ = PairwiseKernel(metric=self.kernel.lower(), **self.kernel_params) + self.kernel_ = PairwiseKernel( + metric=self.kernel.lower(), **self.kernel_params + ) # Falkon elif self.method.lower() == "falkon": @@ -170,7 +183,9 @@ def _make_kernel(self): # If not implemented else: - raise NotImplementedError("%s not implemented. Choices: sklearn and falkon" % (self.method)) + raise NotImplementedError( + "%s not implemented. Choices: sklearn and falkon" % (self.method) + ) return True @@ -197,7 +212,9 @@ def fit(self, X: torch.Tensor, y: torch.Tensor): # are False as it can have a large memory footprint. 
if self.mean_center or self.unit_std: self.pre_process_.fit(X) - self.training_data_ = torch.Tensor(self.pre_process_.transform(torch.Tensor(X))) + self.training_data_ = torch.Tensor( + self.pre_process_.transform(torch.Tensor(X)) + ) else: self.training_data_ = X @@ -296,7 +313,9 @@ def transform(self, X: torch.Tensor): elif self.method == "falkon": return self.ridge_clf_.predict(X) else: - raise NotImplementedError("%s not implemented. Choices: sklearn and falkon" % (self.method)) + raise NotImplementedError( + "%s not implemented. Choices: sklearn and falkon" % (self.method) + ) def save(self, folder: str = "."): """ @@ -330,12 +349,19 @@ def save(self, folder: str = "."): # Save important material: # - KRR weights # - Samples used for prediction. - torch.save(torch.Tensor(self.anchors()), open("%s/sample_anchors.pt" % (folder), "wb")) - torch.save(torch.Tensor(self.sample_weights_), open("%s/sample_weights.pt" % (folder), "wb")) + torch.save( + torch.Tensor(self.anchors()), open("%s/sample_anchors.pt" % (folder), "wb") + ) + torch.save( + torch.Tensor(self.sample_weights_), + open("%s/sample_weights.pt" % (folder), "wb"), + ) # Save weights and anchors as csv. # Longer to load, but compatible with all platforms. - np.savetxt("%s/sample_weights.csv" % (folder), self.sample_weights_.detach().numpy()) + np.savetxt( + "%s/sample_weights.csv" % (folder), self.sample_weights_.detach().numpy() + ) np.savetxt("%s/sample_anchors.csv" % (folder), self.anchors().detach().numpy()) return True @@ -356,15 +382,21 @@ def load(folder: str = "."): # Load and format parameters. params = load(open("%s/params.pkl" % (folder), "rb")) krr_params = { - e: f for e, f in params.items() if e in ["method", "M", "penalization", "mean_center", "unit_std"] + e: f + for e, f in params.items() + if e in ["method", "M", "penalization", "mean_center", "unit_std"] } # krr_params['kernel'] = krr_params['kernel'].kernel_name krr_approx_clf = KRRApprox(**krr_params) krr_approx_clf.kernel_ = params["kernel"] # Load sample weights and anchors. - krr_approx_clf.sample_weights_ = torch.load(open("%s/sample_weights.pt" % (folder), "rb")) - krr_approx_clf.training_data_ = torch.load(open("%s/sample_anchors.pt" % (folder), "rb")) + krr_approx_clf.sample_weights_ = torch.load( + open("%s/sample_weights.pt" % (folder), "rb") + ) + krr_approx_clf.training_data_ = torch.load( + open("%s/sample_anchors.pt" % (folder), "rb") + ) # Set up classifiers for out-of-sample application. 
krr_approx_clf._setup_clf() diff --git a/sobolev_alignment/krr_model_selection.py b/sobolev_alignment/krr_model_selection.py index b94457d..0559f0c 100644 --- a/sobolev_alignment/krr_model_selection.py +++ b/sobolev_alignment/krr_model_selection.py @@ -117,7 +117,11 @@ def model_selection_nu( # Generate artificial samples sobolev_alignment_clf.fit( - X_source=X_source, X_target=X_target, fit_vae=False, sample_artificial=True, krr_approx=False + X_source=X_source, + X_target=X_target, + fit_vae=False, + sample_artificial=True, + krr_approx=False, ) for krr_params in krr_param_grid: @@ -128,17 +132,26 @@ def model_selection_nu( "$".join([f"{e}:{f}" for e, f in krr_params["kernel_params"].items()]), ) - sobolev_alignment_clf.krr_params = {"source": deepcopy(krr_params), "target": deepcopy(krr_params)} + sobolev_alignment_clf.krr_params = { + "source": deepcopy(krr_params), + "target": deepcopy(krr_params), + } print("\t START %s" % (param_id), flush=True) sobolev_alignment_clf.fit( - X_source=X_source, X_target=X_target, fit_vae=False, sample_artificial=False, krr_approx=True + X_source=X_source, + X_target=X_target, + fit_vae=False, + sample_artificial=False, + krr_approx=True, ) gc.collect() # Compute_error krr_approx_error = sobolev_alignment_clf.compute_error(size=test_error_size) - processed_error_df = {x: _process_error_df(df) for x, df in krr_approx_error.items()} + processed_error_df = { + x: _process_error_df(df) for x, df in krr_approx_error.items() + } processed_latent_error_df = {x: df[0] for x, df in processed_error_df.items()} processed_latent_error_df = pd.concat(processed_latent_error_df) latent_results_krr_error_df[param_id] = processed_latent_error_df @@ -150,17 +163,28 @@ def model_selection_nu( latent_error_df.index.names = ["param_id", "data_source", "data_generation"] latent_error_df = latent_error_df.reset_index() latent_error_df = latent_error_df.reset_index().pivot_table( - values="spearmanr", index=["data_source", "param_id"], columns=["data_generation"] + values="spearmanr", + index=["data_source", "param_id"], + columns=["data_generation"], + ) + latent_spearman_df = pd.concat( + {x: latent_error_df.loc[x]["input"] for x in ["source", "target"]}, axis=1 + ) + latent_spearman_df["combined"] = ( + np.sum(latent_spearman_df, axis=1) / latent_spearman_df.shape[1] ) - latent_spearman_df = pd.concat({x: latent_error_df.loc[x]["input"] for x in ["source", "target"]}, axis=1) - latent_spearman_df["combined"] = np.sum(latent_spearman_df, axis=1) / latent_spearman_df.shape[1] latent_spearman_df = latent_spearman_df.sort_values("combined", ascending=False) return latent_spearman_df def model_alignment_penalization( - X_data: AnnData, data_source: str, sobolev_alignment_clf, sigma: float, optimal_nu: float, M: int = 250 + X_data: AnnData, + data_source: str, + sobolev_alignment_clf, + sigma: float, + optimal_nu: float, + M: int = 250, ): r""" $\\sigma$ and $\nu$ selection. 
@@ -201,14 +225,24 @@ def model_alignment_penalization( _clf = deepcopy(sobolev_alignment_clf) supp_data_source = "target" if data_source == "source" else "source" _clf.scvi_models[supp_data_source] = sobolev_alignment_clf.scvi_models[data_source] - _clf.scvi_batch_keys_[supp_data_source] = sobolev_alignment_clf.scvi_batch_keys_[data_source] + _clf.scvi_batch_keys_[supp_data_source] = sobolev_alignment_clf.scvi_batch_keys_[ + data_source + ] gc.collect() # Artificial sampling _clf.n_jobs = 1 _clf.batch_name[supp_data_source] = _clf.batch_name[data_source] - _clf.continuous_covariate_names[supp_data_source] = _clf.continuous_covariate_names[data_source] - _clf.fit(X_source=X_data, X_target=X_data, fit_vae=False, sample_artificial=True, krr_approx=False) + _clf.continuous_covariate_names[supp_data_source] = _clf.continuous_covariate_names[ + data_source + ] + _clf.fit( + X_source=X_data, + X_target=X_data, + fit_vae=False, + sample_artificial=True, + krr_approx=False, + ) krr_param_grid = _make_hyperparameters_grid(sigma, M, [optimal_nu]) principal_angles_df = {} @@ -223,7 +257,13 @@ def model_alignment_penalization( _clf.krr_params = {"source": krr_params, "target": krr_params} print("\t START %s" % (param_id), flush=True) - _clf.fit(X_source=X_data, X_target=X_data, fit_vae=False, sample_artificial=False, krr_approx=True) + _clf.fit( + X_source=X_data, + X_target=X_data, + fit_vae=False, + sample_artificial=False, + krr_approx=True, + ) gc.collect() # Compute_error @@ -237,12 +277,16 @@ def model_alignment_penalization( def _make_hyperparameters_grid(sigma, M, nu_values=None): nu_values = nu_values if nu_values else [0.5, 1.5, 2.5, np.inf] krr_param_possibilities = deepcopy(DEFAULT_KRR_PARAMS) - krr_param_possibilities["kernel_params"] = [{"sigma": sigma, "nu": n} for n in nu_values] + krr_param_possibilities["kernel_params"] = [ + {"sigma": sigma, "nu": n} for n in nu_values + ] krr_param_possibilities["M"] = [M] return ParameterGrid(krr_param_possibilities) def _process_error_df(df): latent_error_df = pd.DataFrame(df["latent"]) - factor_error_df = pd.concat({x: pd.DataFrame(df["factor"][x]) for x in df["factor"]}) + factor_error_df = pd.concat( + {x: pd.DataFrame(df["factor"][x]) for x in df["factor"]} + ) return [latent_error_df, factor_error_df] diff --git a/sobolev_alignment/multi_krr_approx.py b/sobolev_alignment/multi_krr_approx.py index f33e867..6c1f6aa 100644 --- a/sobolev_alignment/multi_krr_approx.py +++ b/sobolev_alignment/multi_krr_approx.py @@ -23,7 +23,10 @@ def __init__(self): def predict(self, X: torch.Tensor): """Predict latent factor values given a tensor.""" - prediction = [clf.transform(torch.Tensor(X)).detach().numpy() for clf in self.krr_regressors] + prediction = [ + clf.transform(torch.Tensor(X)).detach().numpy() + for clf in self.krr_regressors + ] prediction = torch.Tensor(prediction) prediction = torch.mean(prediction, axis=0) @@ -40,7 +43,9 @@ def anchors(self): def process_clfs(self): """Process the different classifiers.""" self.anchors_ = torch.cat([clf.anchors() for clf in self.krr_regressors]) - self.sample_weights_ = torch.cat([clf.sample_weights_ for clf in self.krr_regressors]) + self.sample_weights_ = torch.cat( + [clf.sample_weights_ for clf in self.krr_regressors] + ) self.sample_weights_ = 1 / len(self.krr_regressors) * self.sample_weights_ self.kernel_ = self.krr_regressors[0].kernel_ diff --git a/sobolev_alignment/scvi_model_search.py b/sobolev_alignment/scvi_model_search.py index fc0f9d7..6ecce60 100644 --- 
a/sobolev_alignment/scvi_model_search.py +++ b/sobolev_alignment/scvi_model_search.py @@ -87,10 +87,18 @@ def model_selection( # Bayesian optimisation save_hyperopt_res = Trials() _objective_function = make_objective_function( - train_data_an=train_data_an, test_data_an=test_data_an, batch_key=batch_key, model=model + train_data_an=train_data_an, + test_data_an=test_data_an, + batch_key=batch_key, + model=model, ) best = fmin( - _objective_function, space, algo=tpe.suggest, max_evals=max_eval, return_argmin=False, trials=save_hyperopt_res + _objective_function, + space, + algo=tpe.suggest, + max_evals=max_eval, + return_argmin=False, + trials=save_hyperopt_res, ) # Save @@ -101,7 +109,9 @@ def model_selection( return best, hyperopt_results_df, save_hyperopt_res -def make_objective_function(train_data_an, test_data_an, batch_key=None, model=scvi.model.SCVI): +def make_objective_function( + train_data_an, test_data_an, batch_key=None, model=scvi.model.SCVI +): """ Generate Hyperopt objective function. @@ -160,8 +170,12 @@ def _objective_function(params): clf.train(plan_kwargs=plan_kwargs, **trainer_kwargs) # Reconstruction error - test_reconstruction_error = clf.get_reconstruction_error(adata=test_data_an)["reconstruction_loss"] - train_reconstruction_error = clf.get_reconstruction_error()["reconstruction_loss"] + test_reconstruction_error = clf.get_reconstruction_error( + adata=test_data_an + )["reconstruction_loss"] + train_reconstruction_error = clf.get_reconstruction_error()[ + "reconstruction_loss" + ] # ELBO test_elbo = clf.get_elbo(adata=test_data_an) diff --git a/sobolev_alignment/sobolev_alignment.py b/sobolev_alignment/sobolev_alignment.py index 0477a76..e04ad5e 100644 --- a/sobolev_alignment/sobolev_alignment.py +++ b/sobolev_alignment/sobolev_alignment.py @@ -189,7 +189,10 @@ def __init__( """ # Save batch and continuous covariate names self.batch_name = {"source": source_batch_name, "target": target_batch_name} - self.continuous_covariate_names = {"source": continuous_covariate_names, "target": continuous_covariate_names} + self.continuous_covariate_names = { + "source": continuous_covariate_names, + "target": continuous_covariate_names, + } # Save fitting parameters self._fit_params = { @@ -208,8 +211,16 @@ def __init__( # scVI params self.scvi_params = { - "source": source_scvi_params if source_scvi_params is not None else self.default_scvi_params, - "target": target_scvi_params if target_scvi_params is not None else self.default_scvi_params, + "source": ( + source_scvi_params + if source_scvi_params is not None + else self.default_scvi_params + ), + "target": ( + target_scvi_params + if target_scvi_params is not None + else self.default_scvi_params + ), } for x in self.scvi_params: if "model" not in self.scvi_params[x]: @@ -219,8 +230,16 @@ def __init__( # KRR params self.krr_params = { - "source": source_krr_params if source_krr_params is not None else {"method": "falkon"}, - "target": target_krr_params if target_krr_params is not None else {"method": "falkon"}, + "source": ( + source_krr_params + if source_krr_params is not None + else {"method": "falkon"} + ), + "target": ( + target_krr_params + if target_krr_params is not None + else {"method": "falkon"} + ), } self._check_same_kernel() # Check whether source and target have the same kernel self.scaler_ = {} @@ -278,7 +297,9 @@ def fit( # Train VAE if fit_vae: - self._train_scvi_modules(no_posterior_collapse=self._fit_params["no_posterior_collapse"]) + self._train_scvi_modules( + 
no_posterior_collapse=self._fit_params["no_posterior_collapse"] + ) # Sample artificial points if sample_artificial: @@ -300,7 +321,9 @@ def fit( krr_approx=krr_approx, save_mmap=self._fit_params["save_mmap"], log_input=self._fit_params["log_input"], - n_samples_per_sample_batch=self._fit_params["n_samples_per_sample_batch"], + n_samples_per_sample_batch=self._fit_params[ + "n_samples_per_sample_batch" + ], frac_save_artificial=self._fit_params["frac_save_artificial"], n_krr_clfs=self._fit_params["n_krr_clfs"], mean_center=self.mean_center, @@ -385,11 +408,13 @@ def _train_one_krr( ): # Generate samples (decoder) if sample_artificial: - artificial_samples, artificial_batches, artificial_covariates = self._generate_artificial_samples( - data_source=data_source, - n_artificial_samples=n_artificial_samples, - large_batch_size=n_samples_per_sample_batch, - save_mmap=save_mmap, + artificial_samples, artificial_batches, artificial_covariates = ( + self._generate_artificial_samples( + data_source=data_source, + n_artificial_samples=n_artificial_samples, + large_batch_size=n_samples_per_sample_batch, + save_mmap=save_mmap, + ) ) # Compute embeddings (encoder) @@ -440,13 +465,17 @@ def _train_one_krr( # Subsample the artificial sample saved if sample_artificial: n_save = int(frac_save_artificial * n_artificial_samples) - subsampled_idx = np.random.choice(a=np.arange(n_artificial_samples), size=n_save, replace=False) + subsampled_idx = np.random.choice( + a=np.arange(n_artificial_samples), size=n_save, replace=False + ) self.artificial_samples_[data_source] = artificial_samples[subsampled_idx] del artificial_samples # Remove data in memmap if save_mmap is not None: os.remove(f"{save_mmap}/{data_source}_artificial_input.npy") - self.artificial_embeddings_[data_source] = artificial_embeddings[subsampled_idx] + self.artificial_embeddings_[data_source] = artificial_embeddings[ + subsampled_idx + ] # Remove data in memmap if save_mmap is not None: os.remove(f"{save_mmap}/{data_source}_artificial_embedding.npy") @@ -474,15 +503,22 @@ def _train_scvi_modules(self, no_posterior_collapse=False): # Change covariates to float if self.continuous_covariate_names[x] is not None: for cov in self.continuous_covariate_names[x]: - self.training_data[x].obs[cov] = self.training_data[x].obs[cov].astype(np.float64) + self.training_data[x].obs[cov] = ( + self.training_data[x].obs[cov].astype(np.float64) + ) latent_variable_variance = np.zeros(1) save_iter = 0 while np.any(latent_variable_variance < 0.2): logging.info(f"START TRAINING {x} model number {save_iter}") try: - self.scvi_models[x] = scvi.model.SCVI(self.training_data[x], **self.scvi_params[x]["model"]) - self.scvi_models[x].train(plan_kwargs=self.scvi_params[x]["plan"], **self.scvi_params[x]["train"]) + self.scvi_models[x] = scvi.model.SCVI( + self.training_data[x], **self.scvi_params[x]["model"] + ) + self.scvi_models[x].train( + plan_kwargs=self.scvi_params[x]["plan"], + **self.scvi_params[x]["train"], + ) except ValueError as err: logging.error("\n SCVI TRAINING ERROR: \n %s \n\n\n\n" % (err)) latent_variable_variance = np.zeros(1) @@ -496,8 +532,12 @@ def _train_scvi_modules(self, no_posterior_collapse=False): save_iter += 1 if save_iter > 0 and save_iter % 5 == 0: - logging.info("\t SCVI: REMOVE ONE LATENT VARIABLE TO AVOID POSTERIOR COLLAPSE") - self.scvi_params[x]["model"]["n_latent"] = self.scvi_params[x]["model"]["n_latent"] - 1 + logging.info( + "\t SCVI: REMOVE ONE LATENT VARIABLE TO AVOID POSTERIOR COLLAPSE" + ) + 
self.scvi_params[x]["model"]["n_latent"] = ( + self.scvi_params[x]["model"]["n_latent"] - 1 + ) # Log batch key (used in data generation). if self.batch_name[x] is not None: @@ -507,14 +547,20 @@ def _train_scvi_modules(self, no_posterior_collapse=False): .reset_index(drop=True) .drop_duplicates() ) - self.scvi_batch_keys_[x] = dict_batch.set_index(self.batch_name[x]).to_dict()["_scvi_batch"] + self.scvi_batch_keys_[x] = dict_batch.set_index( + self.batch_name[x] + ).to_dict()["_scvi_batch"] else: self.scvi_batch_keys_[x] = None return True def _generate_artificial_samples( - self, data_source: str, n_artificial_samples: int, large_batch_size: int = 10**5, save_mmap: str = None + self, + data_source: str, + n_artificial_samples: int, + large_batch_size: int = 10**5, + save_mmap: str = None, ): """ Generate artificial samples for one model. @@ -533,32 +579,54 @@ def _generate_artificial_samples( artificial_data: dict Dictionary containing the generated data for both source and target """ - batch_sizes = [large_batch_size] * (n_artificial_samples // large_batch_size) + [ - n_artificial_samples % large_batch_size - ] + batch_sizes = [large_batch_size] * ( + n_artificial_samples // large_batch_size + ) + [n_artificial_samples % large_batch_size] batch_sizes = [x for x in batch_sizes if x > 0] - _generated_data = [self._generate_artificial_samples_batch(batch, data_source) for batch in batch_sizes] + _generated_data = [ + self._generate_artificial_samples_batch(batch, data_source) + for batch in batch_sizes + ] _generated_data = list(zip(*_generated_data)) artificial_samples = np.concatenate(_generated_data[0]) - artificial_batches_ = np.concatenate(_generated_data[1]) if self.batch_name[data_source] is not None else None + artificial_batches_ = ( + np.concatenate(_generated_data[1]) + if self.batch_name[data_source] is not None + else None + ) artificial_covariates_ = ( - pd.concat(_generated_data[2]) if self.continuous_covariate_names[data_source] is not None else None + pd.concat(_generated_data[2]) + if self.continuous_covariate_names[data_source] is not None + else None ) del _generated_data gc.collect() if save_mmap is not None and isinstance(save_mmap, str): - np.save(open(os.path.join(save_mmap, "%s_artificial_input.npy" % (data_source)), "wb"), artificial_samples) + np.save( + open( + os.path.join(save_mmap, "%s_artificial_input.npy" % (data_source)), + "wb", + ), + artificial_samples, + ) artificial_samples = np.load( - os.path.join(save_mmap, "%s_artificial_input.npy" % (data_source)), mmap_mode="r" + os.path.join(save_mmap, "%s_artificial_input.npy" % (data_source)), + mmap_mode="r", ) gc.collect() return artificial_samples, artificial_batches_, artificial_covariates_ - def _generate_artificial_samples_batch(self, n_artificial_samples: int, data_source: str): - artificial_batches = self._sample_batches(n_artificial_samples=n_artificial_samples, data=data_source) - artificial_covariates = self._sample_covariates(n_artificial_samples=n_artificial_samples, data=data_source) + def _generate_artificial_samples_batch( + self, n_artificial_samples: int, data_source: str + ): + artificial_batches = self._sample_batches( + n_artificial_samples=n_artificial_samples, data=data_source + ) + artificial_covariates = self._sample_covariates( + n_artificial_samples=n_artificial_samples, data=data_source + ) artificial_samples = parallel_generate_samples( sample_size=n_artificial_samples, batch_names=artificial_batches, @@ -583,14 +651,23 @@ def _generate_artificial_samples_batch(self, 
n_artificial_samples: int, data_sou def _compute_batch_library_size(self): if self.batch_name["source"] is None or self.batch_name["target"] is None: - return {x: np.sum(self.training_data[x].X, axis=1).astype(float) for x in self.training_data} + return { + x: np.sum(self.training_data[x].X, axis=1).astype(float) + for x in self.training_data + } - unique_batches = {x: np.unique(self.training_data[x].obs[self.batch_name[x]]) for x in self.training_data} + unique_batches = { + x: np.unique(self.training_data[x].obs[self.batch_name[x]]) + for x in self.training_data + } return { x: { str(b): np.sum( - self.training_data[x][self.training_data[x].obs[self.batch_name[x]] == b].X, axis=1 + self.training_data[x][ + self.training_data[x].obs[self.batch_name[x]] == b + ].X, + axis=1, ).astype(float) for b in unique_batches[x] } @@ -599,10 +676,22 @@ def _compute_batch_library_size(self): def _check_same_kernel(self): """Verify that same kernel is used for source and kernel KRR.""" - if "kernel" in self.krr_params["source"] or "kernel" in self.krr_params["target"]: - assert self.krr_params["source"]["kernel"] == self.krr_params["target"]["kernel"] - if "kernel_params" in self.krr_params["source"] or "kernel_params" in self.krr_params["target"]: - assert self.krr_params["source"]["kernel_params"] == self.krr_params["target"]["kernel_params"] + if ( + "kernel" in self.krr_params["source"] + or "kernel" in self.krr_params["target"] + ): + assert ( + self.krr_params["source"]["kernel"] + == self.krr_params["target"]["kernel"] + ) + if ( + "kernel_params" in self.krr_params["source"] + or "kernel_params" in self.krr_params["target"] + ): + assert ( + self.krr_params["source"]["kernel_params"] + == self.krr_params["target"]["kernel_params"] + ) def _sample_batches(self, n_artificial_samples, data): """Sample batches for either source or target.""" @@ -610,7 +699,8 @@ def _sample_batches(self, n_artificial_samples, data): return None return np.random.choice( - self.training_data[data].obs[self.batch_name[data]].values, size=int(n_artificial_samples) + self.training_data[data].obs[self.batch_name[data]].values, + size=int(n_artificial_samples), ) def _sample_covariates(self, n_artificial_samples, data): @@ -625,13 +715,18 @@ def _sample_covariates(self, n_artificial_samples, data): ) def _embed_artificial_samples( - self, artificial_samples, artificial_batches, artificial_covariates, data_source: str, large_batch_size=10**5 + self, + artificial_samples, + artificial_batches, + artificial_covariates, + data_source: str, + large_batch_size=10**5, ): # Divide in batches n_artificial_samples = artificial_samples.shape[0] - batch_sizes = [large_batch_size] * (n_artificial_samples // large_batch_size) + [ - n_artificial_samples % large_batch_size - ] + batch_sizes = [large_batch_size] * ( + n_artificial_samples // large_batch_size + ) + [n_artificial_samples % large_batch_size] batch_sizes = [0] + list(np.cumsum([x for x in batch_sizes if x > 0])) batch_start = batch_sizes[:-1] batch_end = batch_sizes[1:] @@ -641,39 +736,66 @@ def _embed_artificial_samples( for start, end in zip(batch_start, batch_end): x_train = artificial_samples[start:end] train_obs = pd.DataFrame( - np.array(artificial_batches[start:end]) if artificial_batches is not None else [], - columns=[self.batch_name[data_source]] if artificial_batches is not None else [], + ( + np.array(artificial_batches[start:end]) + if artificial_batches is not None + else [] + ), + columns=( + [self.batch_name[data_source]] + if artificial_batches is not None + 
else [] + ), index=np.arange(end - start), ) if artificial_covariates is not None: train_obs = pd.concat( - [train_obs, artificial_covariates.iloc[start:end].reset_index(drop=True)], ignore_index=True, axis=1 + [ + train_obs, + artificial_covariates.iloc[start:end].reset_index(drop=True), + ], + ignore_index=True, + axis=1, ) - train_obs.columns = [self.batch_name[data_source], *self.continuous_covariate_names[data_source]] + train_obs.columns = [ + self.batch_name[data_source], + *self.continuous_covariate_names[data_source], + ] x_train_an = AnnData(x_train, obs=train_obs) x_train_an.layers["counts"] = x_train_an.X.copy() - embedding.append(self.scvi_models[data_source].get_latent_representation(x_train_an)) + embedding.append( + self.scvi_models[data_source].get_latent_representation(x_train_an) + ) # Forward these formatted samples return np.concatenate(embedding) def _correct_artificial_samples_lib_size( - self, artificial_samples, artificial_batches, artificial_covariates, data_source: str, large_batch_size=10**5 + self, + artificial_samples, + artificial_batches, + artificial_covariates, + data_source: str, + large_batch_size=10**5, ): """Correct for library size the artificial samples.""" # Divide in batches n_artificial_samples = artificial_samples.shape[0] - batch_sizes = [large_batch_size] * (n_artificial_samples // large_batch_size) + [ - n_artificial_samples % large_batch_size - ] + batch_sizes = [large_batch_size] * ( + n_artificial_samples // large_batch_size + ) + [n_artificial_samples % large_batch_size] batch_sizes = [0] + list(np.cumsum([x for x in batch_sizes if x > 0])) batch_start = batch_sizes[:-1] batch_end = batch_sizes[1:] # Format artificial samples to be fed into scVI. - artificial_samples = [artificial_samples[start:end] for start, end in zip(batch_start, batch_end)] - for idx, (x_train, start, end) in enumerate(zip(artificial_samples, batch_start, batch_end)): + artificial_samples = [ + artificial_samples[start:end] for start, end in zip(batch_start, batch_end) + ] + for idx, (x_train, start, end) in enumerate( + zip(artificial_samples, batch_start, batch_end) + ): train_obs = pd.DataFrame( np.array(artificial_batches[start:end]), columns=[self.batch_name[data_source]], @@ -681,13 +803,23 @@ def _correct_artificial_samples_lib_size( ) if artificial_covariates is not None: train_obs = pd.concat( - [train_obs, artificial_covariates.iloc[start:end].reset_index(drop=True)], ignore_index=True, axis=1 + [ + train_obs, + artificial_covariates.iloc[start:end].reset_index(drop=True), + ], + ignore_index=True, + axis=1, ) - train_obs.columns = [self.batch_name[data_source], *self.continuous_covariate_names[data_source]] + train_obs.columns = [ + self.batch_name[data_source], + *self.continuous_covariate_names[data_source], + ] x_train_an = AnnData(x_train, obs=train_obs) x_train_an.layers["counts"] = x_train_an.X.copy() - artificial_samples[idx] = self.scvi_models[data_source].get_normalized_expression( + artificial_samples[idx] = self.scvi_models[ + data_source + ].get_normalized_expression( x_train_an, return_numpy=True, library_size=DEFAULT_LIB_SIZE ) @@ -710,7 +842,9 @@ def _memmap_log_processing( if save_mmap is not None and isinstance(save_mmap, str): self._save_mmap = save_mmap self._memmap_embedding( - data_source=data_source, artificial_embeddings=artificial_embeddings, save_mmap=save_mmap + data_source=data_source, + artificial_embeddings=artificial_embeddings, + save_mmap=save_mmap, ) self.krr_log_input_ = log_input @@ -722,12 +856,19 @@ def 
_memmap_log_processing( artificial_samples = scaler_.fit_transform(np.array(artificial_samples)) # Frobenius norm scaling - artificial_samples = self._frobenius_normalisation(data_source, artificial_samples, frob_norm_source) + artificial_samples = self._frobenius_normalisation( + data_source, artificial_samples, frob_norm_source + ) if save_mmap is not None and isinstance(save_mmap, str): # Re-save - np.save(open(f"{save_mmap}/{data_source}_artificial_input.npy", "wb"), artificial_samples) - artificial_samples = np.load(f"{save_mmap}/{data_source}_artificial_input.npy", mmap_mode="r") + np.save( + open(f"{save_mmap}/{data_source}_artificial_input.npy", "wb"), + artificial_samples, + ) + artificial_samples = np.load( + f"{save_mmap}/{data_source}_artificial_input.npy", mmap_mode="r" + ) gc.collect() else: @@ -735,35 +876,53 @@ def _memmap_log_processing( else: # Frobenius norm scaling - artificial_samples = self._frobenius_normalisation(data_source, artificial_samples, frob_norm_source) + artificial_samples = self._frobenius_normalisation( + data_source, artificial_samples, frob_norm_source + ) return artificial_samples - def _frobenius_normalisation(self, data_source, artificial_samples, frob_norm_source): + def _frobenius_normalisation( + self, data_source, artificial_samples, frob_norm_source + ): # Normalise to same Frobenius norm per sample if frob_norm_source: if data_source == "source": - self._frob_norm_param = np.mean(np.linalg.norm(artificial_samples, axis=1)) + self._frob_norm_param = np.mean( + np.linalg.norm(artificial_samples, axis=1) + ) else: frob_norm = np.mean(np.linalg.norm(artificial_samples, axis=1)) - artificial_samples = artificial_samples * self._frob_norm_param / frob_norm + artificial_samples = ( + artificial_samples * self._frob_norm_param / frob_norm + ) else: pass return artificial_samples def _memmap_embedding(self, data_source, artificial_embeddings, save_mmap): - np.save(open(f"{save_mmap}/{data_source}_artificial_embedding.npy", "wb"), artificial_embeddings) - artificial_embeddings = np.load(f"{save_mmap}/{data_source}_artificial_embedding.npy", mmap_mode="r") + np.save( + open(f"{save_mmap}/{data_source}_artificial_embedding.npy", "wb"), + artificial_embeddings, + ) + artificial_embeddings = np.load( + f"{save_mmap}/{data_source}_artificial_embedding.npy", mmap_mode="r" + ) gc.collect() return artificial_embeddings - def _approximate_encoders(self, data_source: str, artificial_samples, artificial_embeddings): + def _approximate_encoders( + self, data_source: str, artificial_samples, artificial_embeddings + ): """Approximate the encoder by a KRR regression.""" krr_approx = KRRApprox(**self.krr_params[data_source]) - krr_approx.fit(torch.from_numpy(artificial_samples), torch.from_numpy(artificial_embeddings)) + krr_approx.fit( + torch.from_numpy(artificial_samples), + torch.from_numpy(artificial_embeddings), + ) return krr_approx @@ -774,7 +933,10 @@ def _compare_approximated_encoders(self): self.sqrt_inv_M_X_ = mat_inv_sqrt(self.M_X) self.sqrt_inv_M_Y_ = mat_inv_sqrt(self.M_Y) - self.sqrt_inv_matrices_ = {"source": self.sqrt_inv_M_X_, "target": self.sqrt_inv_M_Y_} + self.sqrt_inv_matrices_ = { + "source": self.sqrt_inv_M_X_, + "target": self.sqrt_inv_M_Y_, + } self.cosine_sim = self.sqrt_inv_M_X_.dot(self.M_XY).dot(self.sqrt_inv_M_Y_) def _compute_cosine_sim_intra_dataset(self, data: str): @@ -791,7 +953,8 @@ def _compute_cosine_sim_intra_dataset(self, data: str): def _compute_cross_cosine_sim(self): K_XY = 
self.approximate_krr_regressions_["target"].kernel_( - self.approximate_krr_regressions_["source"].anchors(), self.approximate_krr_regressions_["target"].anchors() + self.approximate_krr_regressions_["source"].anchors(), + self.approximate_krr_regressions_["target"].anchors(), ) K_XY = torch.Tensor(K_XY) return ( @@ -811,15 +974,22 @@ def _compute_principal_vectors(self, all_PVs=False): """ cosine_svd = np.linalg.svd(self.cosine_sim, full_matrices=all_PVs) self.principal_angles = cosine_svd[1] - self.untransformed_rotations_ = {"source": cosine_svd[0], "target": cosine_svd[2].T} + self.untransformed_rotations_ = { + "source": cosine_svd[0], + "target": cosine_svd[2].T, + } self.principal_vectors_coef_ = { x: self.untransformed_rotations_[x] .T.dot(self.sqrt_inv_matrices_[x]) - .dot(self.approximate_krr_regressions_[x].sample_weights_.T.detach().numpy()) + .dot( + self.approximate_krr_regressions_[x].sample_weights_.T.detach().numpy() + ) for x in self.untransformed_rotations_ } - def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool = True, return_anndata=False): + def compute_consensus_features( + self, X_input: dict, n_similar_pv: int, fit: bool = True, return_anndata=False + ): """ Project data on interpolated consensus features. @@ -846,7 +1016,9 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool """ X_data_log = { data_source: self._frobenius_normalisation( - data_source, torch.log10(torch.Tensor(X_input[data_source].X + 1)), frob_norm_source=True + data_source, + torch.log10(torch.Tensor(X_input[data_source].X + 1)), + frob_norm_source=True, ) for data_source in ["source", "target"] } @@ -870,7 +1042,9 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool for proj_data_source in krr_projections[pv_data_source]: rotated_proj = self.untransformed_rotations_[pv_data_source].T rotated_proj = rotated_proj.dot(self.sqrt_inv_matrices_[pv_data_source]) - rotated_proj = rotated_proj.dot(krr_projections[pv_data_source][proj_data_source].T).T + rotated_proj = rotated_proj.dot( + krr_projections[pv_data_source][proj_data_source].T + ).T pv_projections[pv_data_source][proj_data_source] = rotated_proj del rotated_proj @@ -878,9 +1052,9 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool # Mean-center projection data on the PV pv_projections = { pv_data_source: { - proj_data_source: StandardScaler(with_mean=True, with_std=False).fit_transform( - pv_projections[pv_data_source][proj_data_source] - ) + proj_data_source: StandardScaler( + with_mean=True, with_std=False + ).fit_transform(pv_projections[pv_data_source][proj_data_source]) for proj_data_source in ["source", "target"] } for pv_data_source in ["source", "target"] @@ -891,7 +1065,10 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool self.n_similar_pv = n_similar_pv self.optimal_interpolation_step_ = { PV_number: compute_optimal_tau( - PV_number, pv_projections, np.arccos(self.principal_angles), n_interpolation=100 + PV_number, + pv_projections, + np.arccos(self.principal_angles), + n_interpolation=100, ) for PV_number in range(self.n_similar_pv) } @@ -901,7 +1078,10 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool PV_number: np.concatenate( list( project_on_interpolate_PV( - np.arccos(self.principal_angles)[PV_number], PV_number, optimal_step, pv_projections + np.arccos(self.principal_angles)[PV_number], + PV_number, + optimal_step, + pv_projections, ) ) ) 
@@ -911,10 +1091,12 @@ def compute_consensus_features(self, X_input: dict, n_similar_pv: int, fit: bool if return_anndata: interpolated_proj_an = sc.concat([X_input["source"], X_input["target"]]) - interpolated_proj_an.obs["data_source"] = ["source"] * X_input["source"].shape[0] + ["target"] * X_input[ - "target" - ].shape[0] - interpolated_proj_an.obsm["X_sobolev_alignment"] = np.array(interpolated_proj_df) + interpolated_proj_an.obs["data_source"] = ["source"] * X_input[ + "source" + ].shape[0] + ["target"] * X_input["target"].shape[0] + interpolated_proj_an.obsm["X_sobolev_alignment"] = np.array( + interpolated_proj_df + ) return interpolated_proj_an else: return interpolated_proj_df @@ -990,7 +1172,11 @@ def scvi_model_selection( return self def krr_model_selection( - self, X_source: AnnData, X_target: AnnData, M: int = 1000, same_model_alignment_thresh: float = 0.9 + self, + X_source: AnnData, + X_target: AnnData, + M: int = 1000, + same_model_alignment_thresh: float = 0.9, ): """ Hyper-parameters selection for KRR. @@ -1023,9 +1209,15 @@ def krr_model_selection( # Compute sigma after re-scaling data (if required) if self._fit_params["frob_norm_source"]: - X_input["source"] = self._frobenius_normalisation("source", X_input["source"], frob_norm_source=True) - X_input["target"] = self._frobenius_normalisation("target", X_input["target"], frob_norm_source=True) - source_target_distance = np.power(pairwise_distances(X_input["source"], X_input["target"]), 2) + X_input["source"] = self._frobenius_normalisation( + "source", X_input["source"], frob_norm_source=True + ) + X_input["target"] = self._frobenius_normalisation( + "target", X_input["target"], frob_norm_source=True + ) + source_target_distance = np.power( + pairwise_distances(X_input["source"], X_input["target"]), 2 + ) sigma = np.sqrt(np.mean(source_target_distance) / np.log(2)) print("OPTIMAL SIGMA: %1.2f" % (sigma)) @@ -1044,18 +1236,26 @@ def krr_model_selection( # Select penalization self.penalization_principal_angles_df_ = {} for data_source in ["source", "target"]: - self.krr_params[data_source]["kernel_params"] = {"sigma": sigma, "nu": optimal_krr_nu} - self.penalization_principal_angles_df_[data_source] = model_alignment_penalization( - X_data=X_source if data_source == "source" else X_target, - data_source=data_source, - sobolev_alignment_clf=self, - sigma=sigma, - optimal_nu=optimal_krr_nu, - M=M, + self.krr_params[data_source]["kernel_params"] = { + "sigma": sigma, + "nu": optimal_krr_nu, + } + self.penalization_principal_angles_df_[data_source] = ( + model_alignment_penalization( + X_data=X_source if data_source == "source" else X_target, + data_source=data_source, + sobolev_alignment_clf=self, + sigma=sigma, + optimal_nu=optimal_krr_nu, + M=M, + ) ) self.penalization_principal_angles_df_[data_source] = ( - pd.DataFrame(self.penalization_principal_angles_df_[data_source]).iloc[:1] > same_model_alignment_thresh + pd.DataFrame(self.penalization_principal_angles_df_[data_source]).iloc[ + :1 + ] + > same_model_alignment_thresh ).T pen = re.findall( @@ -1080,14 +1280,20 @@ def save(self, folder: str = ".", with_krr: bool = True, with_model: bool = True os.mkdir(folder) dump(self.batch_name, open("%s/batch_name.pkl" % (folder), "wb")) - dump(self.continuous_covariate_names, open("%s/continuous_covariate_names.pkl" % (folder), "wb")) + dump( + self.continuous_covariate_names, + open("%s/continuous_covariate_names.pkl" % (folder), "wb"), + ) # Dump scVI models if with_model: for x in self.scvi_models: dump(self.scvi_models[x], 
open(f"{folder}/scvi_model_{x}.pkl", "wb")) self.scvi_models[x].save(f"{folder}/scvi_model_{x}", save_anndata=True) - dump(self.scvi_batch_keys_[x], open(f"{folder}/scvi_model_key_dict_{x}.pkl", "wb")) + dump( + self.scvi_batch_keys_[x], + open(f"{folder}/scvi_model_key_dict_{x}.pkl", "wb"), + ) # Dump the KRR: if not with_krr: @@ -1101,11 +1307,15 @@ def save(self, folder: str = ".", with_krr: bool = True, with_model: bool = True dump(self.krr_params, open("%s/krr_params.pkl" % (folder), "wb")) for param_t in ["model", "plan", "train"]: - df = pd.DataFrame([self.scvi_params[x][param_t] for x in ["source", "target"]]) + df = pd.DataFrame( + [self.scvi_params[x][param_t] for x in ["source", "target"]] + ) df.to_csv(f"{folder}/scvi_params_{param_t}.csv") dump(self.scvi_params, open("%s/scvi_params.pkl" % (folder), "wb")) - pd.DataFrame(self._fit_params, index=["params"]).to_csv("%s/fit_params.csv" % (folder)) + pd.DataFrame(self._fit_params, index=["params"]).to_csv( + "%s/fit_params.csv" % (folder) + ) dump(self._fit_params, open("%s/fit_params.pkl" % (folder), "wb")) # Save results @@ -1125,7 +1335,9 @@ def save(self, folder: str = ".", with_krr: bool = True, with_model: bool = True torch.save(element, open(f"{folder}/{idx}.pt", "wb")) if self._frob_norm_param is not None: - np.savetxt("%s/frob_norm_param.csv" % (folder), np.array([self._frob_norm_param])) + np.savetxt( + "%s/frob_norm_param.csv" % (folder), np.array([self._frob_norm_param]) + ) def load(folder: str = ".", with_krr: bool = True, with_model: bool = True): """ @@ -1149,7 +1361,9 @@ def load(folder: str = ".", with_krr: bool = True, with_model: bool = True): if "batch_name.pkl" in os.listdir(folder): clf.batch_name = load(open("%s/batch_name.pkl" % (folder), "rb")) if "continuous_covariate_names.pkl" in os.listdir(folder): - clf.continuous_covariate_names = load(open("%s/continuous_covariate_names.pkl" % (folder), "rb")) + clf.continuous_covariate_names = load( + open("%s/continuous_covariate_names.pkl" % (folder), "rb") + ) if with_model: clf.scvi_models = {} @@ -1157,14 +1371,18 @@ def load(folder: str = ".", with_krr: bool = True, with_model: bool = True): for x in ["source", "target"]: clf.scvi_models[x] = scvi.model.SCVI.load(f"{folder}/scvi_model_{x}") if "scvi_model_key_dict_%s.pkl" % (x) in os.listdir(folder): - clf.scvi_batch_keys_[x] = load(open(f"{folder}/scvi_model_key_dict_{x}.pkl", "rb")) + clf.scvi_batch_keys_[x] = load( + open(f"{folder}/scvi_model_key_dict_{x}.pkl", "rb") + ) else: clf.scvi_batch_keys_[x] = None if with_krr: clf.approximate_krr_regressions_ = {} for x in ["source", "target"]: - clf.approximate_krr_regressions_[x] = KRRApprox.load(f"{folder}/krr_approx_{x}/") + clf.approximate_krr_regressions_[x] = KRRApprox.load( + f"{folder}/krr_approx_{x}/" + ) # Load params clf.krr_params = load(open("%s/krr_params.pkl" % (folder), "rb")) @@ -1191,16 +1409,25 @@ def load(folder: str = ".", with_krr: bool = True, with_model: bool = True): if "alignment_cosine_sim.npy" in os.listdir(folder): clf.cosine_sim = np.load("%s/alignment_cosine_sim.npy" % (folder)) elif "alignment_cosine_sim.pt" in os.listdir(folder): - clf.cosine_sim = torch.load(open("%s/alignment_cosine_sim.pt" % (folder), "rb")) + clf.cosine_sim = torch.load( + open("%s/alignment_cosine_sim.pt" % (folder), "rb") + ) if "alignment_principal_angles.npy" in os.listdir(folder): - clf.principal_angles = np.load("%s/alignment_principal_angles.npy" % (folder)) + clf.principal_angles = np.load( + "%s/alignment_principal_angles.npy" % (folder) + ) elif 
"alignment_principal_angles.pt" in os.listdir(folder): - clf.principal_angles = torch.load(open("%s/alignment_principal_angles.pt" % (folder), "rb")) + clf.principal_angles = torch.load( + open("%s/alignment_principal_angles.pt" % (folder), "rb") + ) clf.sqrt_inv_M_X_ = mat_inv_sqrt(clf.M_X) clf.sqrt_inv_M_Y_ = mat_inv_sqrt(clf.M_Y) - clf.sqrt_inv_matrices_ = {"source": clf.sqrt_inv_M_X_, "target": clf.sqrt_inv_M_Y_} + clf.sqrt_inv_matrices_ = { + "source": clf.sqrt_inv_M_X_, + "target": clf.sqrt_inv_M_Y_, + } clf._compute_principal_vectors() if "frob_norm_param.csv" in os.listdir(folder): @@ -1236,7 +1463,10 @@ def plot_cosine_similarity(self, folder: str = ".", absolute_cos: bool = False): plt.xlabel("Tumor", fontsize=25, color="black") plt.ylabel("Cell lines", fontsize=25) plt.tight_layout() - plt.savefig("{}/{}cosine_similarity.png".format(folder, "abs_" if absolute_cos else ""), dpi=300) + plt.savefig( + "{}/{}cosine_similarity.png".format(folder, "abs_" if absolute_cos else ""), + dpi=300, + ) plt.show() def compute_error(self, size=-1): @@ -1259,35 +1489,53 @@ def _compute_error_one_type(self, data_type, size=-1): input_krr_pred = np.log10(input_krr_pred + 1) if data_type == " target": - input_krr_pred = self._frobenius_normalisation(data_type, input_krr_pred, self._frob_norm_param is not None) + input_krr_pred = self._frobenius_normalisation( + data_type, input_krr_pred, self._frob_norm_param is not None + ) - input_krr_pred = StandardScaler(with_mean=self.mean_center, with_std=self.unit_std).fit_transform( - input_krr_pred + input_krr_pred = StandardScaler( + with_mean=self.mean_center, with_std=self.unit_std + ).fit_transform(input_krr_pred) + input_krr_pred = self.approximate_krr_regressions_[data_type].transform( + torch.Tensor(input_krr_pred) + ) + input_spearman_corr = np.array( + [scipy.stats.spearmanr(x, y)[0] for x, y in zip(input_krr_pred.T, latent.T)] ) - input_krr_pred = self.approximate_krr_regressions_[data_type].transform(torch.Tensor(input_krr_pred)) - input_spearman_corr = np.array([scipy.stats.spearmanr(x, y)[0] for x, y in zip(input_krr_pred.T, latent.T)]) input_krr_diff = input_krr_pred - latent input_mean_square = torch.square(input_krr_diff) input_factor_mean_square = torch.mean(input_mean_square, axis=0) input_latent_mean_square = torch.mean(input_mean_square) - input_factor_reconstruction_error = np.linalg.norm(input_krr_diff, axis=0) / np.linalg.norm(latent, axis=0) - input_latent_reconstruction_error = np.linalg.norm(input_krr_diff) / np.linalg.norm(latent) + input_factor_reconstruction_error = np.linalg.norm( + input_krr_diff, axis=0 + ) / np.linalg.norm(latent, axis=0) + input_latent_reconstruction_error = np.linalg.norm( + input_krr_diff + ) / np.linalg.norm(latent) del input_krr_pred, input_mean_square, input_krr_diff gc.collect() # KRR error of artificial data if size > 1: - subsamples = np.random.choice(np.arange(self.artificial_samples_[data_type].shape[0]), size, replace=False) + subsamples = np.random.choice( + np.arange(self.artificial_samples_[data_type].shape[0]), + size, + replace=False, + ) elif size <= 0: return { "factor": { "MSE": {"input": input_factor_mean_square.detach().numpy()}, - "reconstruction_error": {"input": input_factor_reconstruction_error}, + "reconstruction_error": { + "input": input_factor_reconstruction_error + }, "spearmanr": {"input": np.array(input_spearman_corr)}, }, "latent": { "MSE": {"input": input_latent_mean_square.detach().numpy()}, - "reconstruction_error": {"input": input_latent_reconstruction_error}, + 
"reconstruction_error": { + "input": input_latent_reconstruction_error + }, "spearmanr": {"input": np.mean(input_spearman_corr)}, }, } @@ -1299,39 +1547,54 @@ def _compute_error_one_type(self, data_type, size=-1): training_spearman_corr = np.array( [ scipy.stats.spearmanr(x, y)[0] - for x, y in zip(training_krr_diff.T, self.artificial_embeddings_[data_type][subsamples].T) + for x, y in zip( + training_krr_diff.T, + self.artificial_embeddings_[data_type][subsamples].T, + ) ] ) - training_krr_diff = training_krr_diff - self.artificial_embeddings_[data_type][subsamples] - training_krr_factor_reconstruction_error = np.linalg.norm(training_krr_diff, axis=0) / np.linalg.norm( - self.artificial_embeddings_[data_type][subsamples], axis=0 - ) - training_krr_latent_reconstruction_error = np.linalg.norm(training_krr_diff) / np.linalg.norm( - self.artificial_embeddings_[data_type][subsamples] + training_krr_diff = ( + training_krr_diff - self.artificial_embeddings_[data_type][subsamples] ) + training_krr_factor_reconstruction_error = np.linalg.norm( + training_krr_diff, axis=0 + ) / np.linalg.norm(self.artificial_embeddings_[data_type][subsamples], axis=0) + training_krr_latent_reconstruction_error = np.linalg.norm( + training_krr_diff + ) / np.linalg.norm(self.artificial_embeddings_[data_type][subsamples]) return { "factor": { "MSE": { "input": input_factor_mean_square.detach().numpy(), - "artificial": torch.mean(torch.square(training_krr_diff), axis=0).detach().numpy(), + "artificial": torch.mean(torch.square(training_krr_diff), axis=0) + .detach() + .numpy(), }, "reconstruction_error": { "input": input_factor_reconstruction_error, "artificial": training_krr_factor_reconstruction_error, }, - "spearmanr": {"input": np.array(input_spearman_corr), "artificial": np.array(training_spearman_corr)}, + "spearmanr": { + "input": np.array(input_spearman_corr), + "artificial": np.array(training_spearman_corr), + }, }, "latent": { "MSE": { "input": input_latent_mean_square.detach().numpy(), - "artificial": torch.mean(torch.square(training_krr_diff)).detach().numpy(), + "artificial": torch.mean(torch.square(training_krr_diff)) + .detach() + .numpy(), }, "reconstruction_error": { "input": input_latent_reconstruction_error, "artificial": training_krr_latent_reconstruction_error, }, - "spearmanr": {"input": np.mean(input_spearman_corr), "artificial": np.mean(training_spearman_corr)}, + "spearmanr": { + "input": np.mean(input_spearman_corr), + "artificial": np.mean(training_spearman_corr), + }, }, } @@ -1370,7 +1633,10 @@ def feature_analysis(self, max_order: int = 1, gene_names: list = None): # Compute the sample offset (matrix O_X and O_Y) self.sample_offset = { - x: _compute_offset(self.approximate_krr_regressions_[x].anchors(), self.gamma) for x in self.training_data + x: _compute_offset( + self.approximate_krr_regressions_[x].anchors(), self.gamma + ) + for x in self.training_data } if gene_names is None: @@ -1388,7 +1654,11 @@ def feature_analysis(self, max_order: int = 1, gene_names: list = None): # Computes all the features of order d. basis_feature_weights_df = higher_order_contribution( d=max_order, - data=self.approximate_krr_regressions_[x].anchors().cpu().detach().numpy(), + data=self.approximate_krr_regressions_[x] + .anchors() + .cpu() + .detach() + .numpy(), sample_offset=self.sample_offset[x], gene_names=self.gene_names, gamma=self.gamma, @@ -1396,10 +1666,14 @@ def feature_analysis(self, max_order: int = 1, gene_names: list = None): ) # Process feature weights. 
- index = np.arange(self.approximate_krr_regressions_[x].sample_weights_.T.shape[0]) + index = np.arange( + self.approximate_krr_regressions_[x].sample_weights_.T.shape[0] + ) columns = basis_feature_weights_df.columns values = self.approximate_krr_regressions_[x].sample_weights_.T.to(device) - values = values.matmul(torch.Tensor(basis_feature_weights_df.values).to(device)) + values = values.matmul( + torch.Tensor(basis_feature_weights_df.values).to(device) + ) self.factor_level_feature_weights_df[x] = pd.DataFrame( values.cpu().detach().numpy(), index=index, columns=columns ) @@ -1412,7 +1686,10 @@ def feature_analysis(self, max_order: int = 1, gene_names: list = None): self.untransformed_rotations_[x] .T.dot(self.sqrt_inv_matrices_[x]) .dot(self.factor_level_feature_weights_df[x]), - index=["PV %s" % (i) for i in range(self.untransformed_rotations_[x].shape[1])], + index=[ + "PV %s" % (i) + for i in range(self.untransformed_rotations_[x].shape[1]) + ], columns=self.factor_level_feature_weights_df[x].columns, ) for x in self.training_data @@ -1428,7 +1705,10 @@ def permutation_test_number_similar_pvs( perm_clf = deepcopy(self) for ds in perm_clf.artificial_samples_: perm_clf.artificial_samples_[ds] = perm_clf.artificial_samples_[ds][ - :, np.random.permutation(np.arange(perm_clf.artificial_samples_[ds].shape[1])) + :, + np.random.permutation( + np.arange(perm_clf.artificial_samples_[ds].shape[1]) + ), ] self.random_principal_angles.append( perm_clf.fit( @@ -1459,21 +1739,26 @@ def sample_random_vector_(self, data_source, K): # Random norms M = self.M_X if data_source == "source" else self.M_Y factor_norms = torch.FloatTensor(n_factors).uniform_( - torch.sqrt(torch.min(torch.linalg.svd(M)[1])), torch.sqrt(torch.max(torch.linalg.svd(M)[1])) + torch.sqrt(torch.min(torch.linalg.svd(M)[1])), + torch.sqrt(torch.max(torch.linalg.svd(M)[1])), ) # Gram-Schmidt for j in range(n_factors): for i in range(j): similarity = coefficients[:, i].matmul(K).matmul(coefficients[:, j]) - coefficients[:, j] = coefficients[:, j] - similarity * coefficients[:, i] + coefficients[:, j] = ( + coefficients[:, j] - similarity * coefficients[:, i] + ) # Normalise coefficients[:, j] = coefficients[:, j] / torch.sqrt( coefficients[:, j].matmul(K).matmul(coefficients[:, j]) ) # Correct for norm - norm_vectors = torch.sqrt(torch.diag(coefficients.T.matmul(K).matmul(coefficients))) + norm_vectors = torch.sqrt( + torch.diag(coefficients.T.matmul(K).matmul(coefficients)) + ) coefficients = coefficients / norm_vectors * factor_norms return coefficients @@ -1485,29 +1770,39 @@ def compute_random_direction_(self, K_X, K_Y, K_XY): perm_target_sample_coef = self.sample_random_vector_("target", K_Y) # Computation of cosine similarity matrix - M_XY_perm_uncorrected = perm_source_sample_coef.T.matmul(K_XY).matmul(perm_target_sample_coef) + M_XY_perm_uncorrected = perm_source_sample_coef.T.matmul(K_XY).matmul( + perm_target_sample_coef + ) M_X_perm = perm_source_sample_coef.T.matmul(K_X).matmul(perm_source_sample_coef) M_Y_perm = perm_target_sample_coef.T.matmul(K_Y).matmul(perm_target_sample_coef) inv_M_X_perm = mat_inv_sqrt(M_X_perm) inv_M_Y_perm = mat_inv_sqrt(M_Y_perm) - return np.linalg.svd(inv_M_X_perm.dot(M_XY_perm_uncorrected).dot(inv_M_Y_perm))[1] + return np.linalg.svd(inv_M_X_perm.dot(M_XY_perm_uncorrected).dot(inv_M_Y_perm))[ + 1 + ] - def null_model_similarity(self, n_iter=100, quantile=0.95, return_all=False, n_jobs=1): + def null_model_similarity( + self, n_iter=100, quantile=0.95, return_all=False, n_jobs=1 + ): 
"""Compute the null model for PV similarities.""" # Compute similarity K_X = self.approximate_krr_regressions_["target"].kernel_( - self.approximate_krr_regressions_["source"].anchors(), self.approximate_krr_regressions_["source"].anchors() + self.approximate_krr_regressions_["source"].anchors(), + self.approximate_krr_regressions_["source"].anchors(), ) K_Y = self.approximate_krr_regressions_["target"].kernel_( - self.approximate_krr_regressions_["target"].anchors(), self.approximate_krr_regressions_["target"].anchors() + self.approximate_krr_regressions_["target"].anchors(), + self.approximate_krr_regressions_["target"].anchors(), ) K_XY = self.approximate_krr_regressions_["target"].kernel_( - self.approximate_krr_regressions_["source"].anchors(), self.approximate_krr_regressions_["target"].anchors() + self.approximate_krr_regressions_["source"].anchors(), + self.approximate_krr_regressions_["target"].anchors(), ) random_directions = Parallel(n_jobs=n_jobs, verbose=1, backend="threading")( - delayed(self.compute_random_direction_)(K_X, K_Y, K_XY) for _ in range(n_iter) + delayed(self.compute_random_direction_)(K_X, K_Y, K_XY) + for _ in range(n_iter) ) if return_all: diff --git a/tests/test_krr_approx.py b/tests/test_krr_approx.py index 0acf8d8..abe2f65 100644 --- a/tests/test_krr_approx.py +++ b/tests/test_krr_approx.py @@ -52,7 +52,10 @@ class TestKRRApprox: @pytest.fixture(scope="class") def sklearn_rbf_KRR(self): return KRRApprox( - method="sklearn", kernel="rbf", kernel_params={"gamma": 1 / (2 * n_genes)}, penalization=penalization + method="sklearn", + kernel="rbf", + kernel_params={"gamma": 1 / (2 * n_genes)}, + penalization=penalization, ) @pytest.fixture(scope="class") @@ -67,7 +70,10 @@ def sklearn_matern_KRR(self): @pytest.fixture(scope="class") def sklearn_laplacian_KRR(self): return KRRApprox( - method="sklearn", kernel="laplacian", kernel_params={"gamma": 1 / (2 * n_genes)}, penalization=penalization + method="sklearn", + kernel="laplacian", + kernel_params={"gamma": 1 / (2 * n_genes)}, + penalization=penalization, ) @pytest.fixture(scope="class") @@ -171,17 +177,27 @@ def test_all_falkon_kernels(self, falkon_import): def test_rbf_sklearn_fit(self, fit_sklearn_rbf_ridge, valid_input, valid_embedding): pred = fit_sklearn_rbf_ridge.transform(valid_input) - pearson_corr = scipy.stats.pearsonr(pred.flatten(), valid_embedding.detach().numpy().flatten()) + pearson_corr = scipy.stats.pearsonr( + pred.flatten(), valid_embedding.detach().numpy().flatten() + ) assert pearson_corr[0] > pearson_threshold - def test_matern_sklearn_fit(self, fit_sklearn_matern_ridge, valid_input, valid_embedding): + def test_matern_sklearn_fit( + self, fit_sklearn_matern_ridge, valid_input, valid_embedding + ): pred = fit_sklearn_matern_ridge.transform(valid_input) - pearson_corr = scipy.stats.pearsonr(pred.flatten(), valid_embedding.detach().numpy().flatten()) + pearson_corr = scipy.stats.pearsonr( + pred.flatten(), valid_embedding.detach().numpy().flatten() + ) assert pearson_corr[0] > pearson_threshold - def test_laplacian_sklearn_fit(self, fit_sklearn_laplacian_ridge, valid_input, valid_embedding): + def test_laplacian_sklearn_fit( + self, fit_sklearn_laplacian_ridge, valid_input, valid_embedding + ): pred = fit_sklearn_laplacian_ridge.transform(valid_input) - pearson_corr = scipy.stats.pearsonr(pred.flatten(), valid_embedding.detach().numpy().flatten()) + pearson_corr = scipy.stats.pearsonr( + pred.flatten(), valid_embedding.detach().numpy().flatten() + ) assert pearson_corr[0] > pearson_threshold 
### @@ -191,35 +207,59 @@ def test_laplacian_sklearn_fit(self, fit_sklearn_laplacian_ridge, valid_input, v def test_rbf_falkon_fit(self, fit_falkon_rbf_ridge, valid_input, valid_embedding): if fit_falkon_rbf_ridge is not None: pred = fit_falkon_rbf_ridge.transform(valid_input) - pearson_corr = scipy.stats.pearsonr(pred.flatten(), valid_embedding.detach().numpy().flatten()) + pearson_corr = scipy.stats.pearsonr( + pred.flatten(), valid_embedding.detach().numpy().flatten() + ) assert pearson_corr[0] > pearson_threshold - def test_matern_falkon_fit(self, fit_falkon_matern_ridge, valid_input, valid_embedding): + def test_matern_falkon_fit( + self, fit_falkon_matern_ridge, valid_input, valid_embedding + ): if fit_falkon_matern_ridge is not None: pred = fit_falkon_matern_ridge.transform(valid_input) - pearson_corr = scipy.stats.pearsonr(pred.flatten(), valid_embedding.detach().numpy().flatten()) + pearson_corr = scipy.stats.pearsonr( + pred.flatten(), valid_embedding.detach().numpy().flatten() + ) assert pearson_corr[0] > pearson_threshold - def test_laplacian_falkon_fit(self, fit_falkon_laplacian_ridge, valid_input, valid_embedding): + def test_laplacian_falkon_fit( + self, fit_falkon_laplacian_ridge, valid_input, valid_embedding + ): if fit_falkon_laplacian_ridge is not None: pred = fit_falkon_laplacian_ridge.transform(valid_input) - pearson_corr = scipy.stats.pearsonr(pred.flatten(), valid_embedding.detach().numpy().flatten()) + pearson_corr = scipy.stats.pearsonr( + pred.flatten(), valid_embedding.detach().numpy().flatten() + ) assert pearson_corr[0] > pearson_threshold - def test_ridge_coef_sklearn_fit(self, fit_sklearn_laplacian_ridge, input, valid_input): + def test_ridge_coef_sklearn_fit( + self, fit_sklearn_laplacian_ridge, input, valid_input + ): if fit_sklearn_laplacian_ridge is not None: pred_reconstruct = fit_sklearn_laplacian_ridge.kernel_( valid_input, input[fit_sklearn_laplacian_ridge.ridge_samples_idx_, :] ) - pred_reconstruct = pred_reconstruct.dot(fit_sklearn_laplacian_ridge.sample_weights_) + pred_reconstruct = pred_reconstruct.dot( + fit_sklearn_laplacian_ridge.sample_weights_ + ) np.testing.assert_array_almost_equal( - pred_reconstruct, fit_sklearn_laplacian_ridge.transform(valid_input), decimal=3 + pred_reconstruct, + fit_sklearn_laplacian_ridge.transform(valid_input), + decimal=3, ) - def test_ridge_coef_falkon_fit(self, fit_falkon_laplacian_ridge, input, valid_input): + def test_ridge_coef_falkon_fit( + self, fit_falkon_laplacian_ridge, input, valid_input + ): if fit_falkon_laplacian_ridge is not None: - pred_reconstruct = fit_falkon_laplacian_ridge.kernel_(valid_input, fit_falkon_laplacian_ridge.anchors()) - pred_reconstruct = pred_reconstruct.matmul(fit_falkon_laplacian_ridge.sample_weights_) + pred_reconstruct = fit_falkon_laplacian_ridge.kernel_( + valid_input, fit_falkon_laplacian_ridge.anchors() + ) + pred_reconstruct = pred_reconstruct.matmul( + fit_falkon_laplacian_ridge.sample_weights_ + ) np.testing.assert_array_almost_equal( - pred_reconstruct, fit_falkon_laplacian_ridge.transform(valid_input), decimal=3 + pred_reconstruct, + fit_falkon_laplacian_ridge.transform(valid_input), + decimal=3, ) diff --git a/tests/test_sobolev_alignment.py b/tests/test_sobolev_alignment.py index 023b874..9671133 100644 --- a/tests/test_sobolev_alignment.py +++ b/tests/test_sobolev_alignment.py @@ -22,13 +22,19 @@ def falkon_import(): @pytest.fixture(scope="module") def source_data(): poisson_coef = np.random.randint(1, 25, size=n_genes) - return 
np.concatenate([np.random.poisson(lam=l, size=n_samples).reshape(-1, 1) for l in poisson_coef], axis=1) + return np.concatenate( + [np.random.poisson(lam=l, size=n_samples).reshape(-1, 1) for l in poisson_coef], + axis=1, + ) @pytest.fixture(scope="module") def target_data(): poisson_coef = np.random.randint(1, 25, size=n_genes) - return np.concatenate([np.random.poisson(lam=l, size=n_samples).reshape(-1, 1) for l in poisson_coef], axis=1) + return np.concatenate( + [np.random.poisson(lam=l, size=n_samples).reshape(-1, 1) for l in poisson_coef], + axis=1, + ) @pytest.fixture(scope="module") @@ -93,7 +99,9 @@ def target_scvi_params(): class TestSobolevAlignment: @pytest.fixture(scope="class") - def sobolev_alignment_raw(self, falkon_import, source_scvi_params, target_scvi_params): + def sobolev_alignment_raw( + self, falkon_import, source_scvi_params, target_scvi_params + ): if falkon_import: source_krr_params = {"method": "falkon"} target_krr_params = {"method": "falkon"} @@ -112,7 +120,9 @@ def sobolev_alignment_raw(self, falkon_import, source_scvi_params, target_scvi_p ) @pytest.fixture(scope="class") - def sobolev_alignment_batch(self, falkon_import, source_scvi_params, target_scvi_params): + def sobolev_alignment_batch( + self, falkon_import, source_scvi_params, target_scvi_params + ): if falkon_import: source_krr_params = {"method": "falkon"} target_krr_params = {"method": "falkon"} @@ -141,11 +151,17 @@ def scvi_raw_trained(self, source_anndata, target_anndata, sobolev_alignment_raw ) @pytest.fixture(scope="class") - def scvi_batch_trained(self, source_anndata, target_anndata, sobolev_alignment_batch): - return sobolev_alignment_batch.fit(X_source=source_anndata, X_target=target_anndata) + def scvi_batch_trained( + self, source_anndata, target_anndata, sobolev_alignment_batch + ): + return sobolev_alignment_batch.fit( + X_source=source_anndata, X_target=target_anndata + ) @pytest.fixture(scope="class") - def scvi_batch_trained_lib_size(self, source_anndata, target_anndata, sobolev_alignment_batch): + def scvi_batch_trained_lib_size( + self, source_anndata, target_anndata, sobolev_alignment_batch + ): return sobolev_alignment_batch.fit( X_source=source_anndata, X_target=target_anndata, @@ -161,14 +177,23 @@ def test_training_scvi_batch_trained( ): assert isinstance(scvi_batch_trained.scvi_models, dict) for _, model in scvi_batch_trained.scvi_models.items(): - assert model.history["train_loss_epoch"].values[-1, 0] < model.history["train_loss_epoch"].values[0, 0] + assert ( + model.history["train_loss_epoch"].values[-1, 0] + < model.history["train_loss_epoch"].values[0, 0] + ) for x in scvi_batch_trained.artificial_samples_: - assert scvi_batch_trained.artificial_samples_[x].shape[0] == n_artificial_samples * frac_save_artificial + assert ( + scvi_batch_trained.artificial_samples_[x].shape[0] + == n_artificial_samples * frac_save_artificial + ) assert scvi_batch_trained.artificial_samples_[x].shape[1] == n_genes for x in scvi_batch_trained.artificial_embeddings_: - assert scvi_batch_trained.artificial_embeddings_[x].shape[0] == n_artificial_samples * frac_save_artificial + assert ( + scvi_batch_trained.artificial_embeddings_[x].shape[0] + == n_artificial_samples * frac_save_artificial + ) assert scvi_batch_trained.artificial_embeddings_[x].shape[1] == n_latent def test_training_scvi_batch_trained_lib_size( @@ -177,21 +202,29 @@ def test_training_scvi_batch_trained_lib_size( ): assert isinstance(scvi_batch_trained_lib_size.scvi_models, dict) for _, model in 
scvi_batch_trained_lib_size.scvi_models.items(): - assert model.history["train_loss_epoch"].values[-1, 0] < model.history["train_loss_epoch"].values[0, 0] + assert ( + model.history["train_loss_epoch"].values[-1, 0] + < model.history["train_loss_epoch"].values[0, 0] + ) for x in scvi_batch_trained_lib_size.artificial_samples_: assert ( scvi_batch_trained_lib_size.artificial_samples_[x].shape[0] == n_artificial_samples * frac_save_artificial ) - assert scvi_batch_trained_lib_size.artificial_samples_[x].shape[1] == n_genes + assert ( + scvi_batch_trained_lib_size.artificial_samples_[x].shape[1] == n_genes + ) for x in scvi_batch_trained_lib_size.artificial_embeddings_: assert ( scvi_batch_trained_lib_size.artificial_embeddings_[x].shape[0] == n_artificial_samples * frac_save_artificial ) - assert scvi_batch_trained_lib_size.artificial_embeddings_[x].shape[1] == n_latent + assert ( + scvi_batch_trained_lib_size.artificial_embeddings_[x].shape[1] + == n_latent + ) # np.savetxt(open('source.csv', 'w'), scvi_batch_trained_lib_size.artificial_samples_['source']) # np.savetxt(open('target.csv', 'w'), scvi_batch_trained_lib_size.artificial_samples_['target']) @@ -201,7 +234,12 @@ def test_training_scvi_batch_trained_lib_size( def test_KRR_scvi_trained(self, scvi_batch_trained): for x in scvi_batch_trained.artificial_samples_: - assert scvi_batch_trained.approximate_krr_regressions_[x].sample_weights_.shape[1] == n_latent + assert ( + scvi_batch_trained.approximate_krr_regressions_[ + x + ].sample_weights_.shape[1] + == n_latent + ) # def test_training_scvi_raw_trained(self, scvi_raw_trained): # assert type(scvi_raw_trained.scvi_models) == dict diff --git a/tutorials/process_data.ipynb b/tutorials/process_data.ipynb index 66920ca..8275a4a 100644 --- a/tutorials/process_data.ipynb +++ b/tutorials/process_data.ipynb @@ -18,9 +18,9 @@ "metadata": {}, "outputs": [], "source": [ - "import scanpy as sc\n", "import numpy as np\n", "import pandas as pd\n", + "import scanpy as sc\n", "\n", "# Make a data folder\n", "! 
mkdir -p ./data/" @@ -77,8 +77,8 @@ "source": [ "# Read only cancer cells\n", "kim_df = pd.read_csv(\n", - " './data/kim.txt.gz', \n", - " compression='gzip', sep='\\t', \n", + " './data/kim.txt.gz',\n", + " compression='gzip', sep='\\t',\n", " usecols = ['Index'] + list(kim_epithelial_cells)\n", ")\n", "kim_df = kim_df.set_index('Index').T\n", @@ -215,9 +215,9 @@ "source": [ "# Read only cancer cells\n", "kinker_df = pd.read_csv(\n", - " './data/kinker.txt', \n", + " './data/kinker.txt',\n", " sep='\\t',\n", - " header=[0,1,2], \n", + " header=[0,1,2],\n", " index_col=0,\n", ")\n", "kinker_df = kinker_df[kinker_lung_cancer_cell]" diff --git a/tutorials/tutorial_simple.ipynb b/tutorials/tutorial_simple.ipynb index 1543538..a770e0a 100644 --- a/tutorials/tutorial_simple.ipynb +++ b/tutorials/tutorial_simple.ipynb @@ -15,15 +15,10 @@ "metadata": {}, "outputs": [], "source": [ - "import scanpy as sc\n", - "from sobolev_alignment import SobolevAlignment\n", - "from copy import deepcopy\n", "\n", "import numpy as np\n", - "import pandas as pd\n", - "import re\n", - "import tqdm\n", - "import torch" + "import scanpy as sc\n", + "from sobolev_alignment import SobolevAlignment" ] }, { @@ -165,8 +160,8 @@ "sobolev_alignment_clf.fit(\n", " X_source=source_an,\n", " X_target=target_an,\n", - " fit_vae=False, \n", - " krr_approx=True, \n", + " fit_vae=False,\n", + " krr_approx=True,\n", " sample_artificial=True\n", ")" ] @@ -256,10 +251,10 @@ "metadata": {}, "outputs": [], "source": [ - "import rpy2\n", - "from rpy2.robjects.packages import importr\n", "import rpy2.robjects as robjects\n", "from rpy2.robjects import pandas2ri\n", + "from rpy2.robjects.packages import importr\n", + "\n", "pandas2ri.activate()\n", "\n", "importr('batchelor')"