Generate new data given labels #53

fbao-fudan · 2024-07-24T03:21:13Z

Hi Dongyuan,

Thanks for sharing the work. I have a question about using the model: can I first train model on one dataset and learn the relation between cell type and omics parameters; then I have a new list of cell types, can I use trained model to generate the gene data based on the new cell type list? I did not find this function from examples, maybe I missed that.

Thank you!

Feng

SONGDONGYUAN1994 · 2024-07-24T03:25:37Z

Hi Feng,
Sure, we have this function. Please check https://songdongyuan1994.github.io/scDesign3/docs/articles/scDesign3-introduction-vignette.html

It is not written clearly (I apologize); basically, in step-by-step mode, the final simu_new function has a parameter: new_covariate. Just input your new labels here. Please let me know if it works for you.

fbao-fudan · 2024-07-24T03:55:49Z

Thank you so much for the quick reply, I tried to make a toy model for this, where I have a sce_train for model fitting and a sce_test for just inference. I only used sce_test in the last step. It seems there was a mismatching shape issue. Can you help me check this?

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "label",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "label",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "label",
    sigma_formula = "label",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 2,
    family_use = "nb",
    new_covariate = con_data_train$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_test$dat,
    new_covariate = con_data_test$newCovariate,
    filtered_gene = con_data_test$filtered_gene
)

SONGDONGYUAN1994 · 2024-07-24T15:31:19Z

Oh I forgot this. extract_para also has the parameter new_covariate. Please also replace it with your test data. Thanks! filtered_gene in the last step should still use training data.

fbao-fudan · 2024-07-24T16:45:37Z

Oh I forgot this. extract_para also has the parameter new_covariate. Please also replace it with your test data. Thanks! filtered_gene in the last step should still use training data.

Thank you for your information. I tested as suggested.


MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 1,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$newCovariate,
    filtered_gene = con_data_train$filtered_gene
)

The code stopped at extract_para. The error message is:

error in eval(predvars, data, env): object 'label' not found
Traceback:

1. extract_para(sce = sce_train, marginal_list = MOBSC_marginal, 
 .     n_cores = 1, family_use = "nb", new_covariate = con_data_test$newCovariate, 
 .     data = con_data_train$dat)
2. suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores))
3. withCallingHandlers(expr, message = function(c) if (inherits(c, 
 .     classes)) tryInvokeRestart("muffleMessage"))
4. paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores)
5. mclapply(seq_len(n), do_one, mc.preschedule = mc.preschedule, 
 .     mc.set.seed = mc.set.seed, mc.silent = mc.silent, mc.cores = mc.cores, 
 .     mc.cleanup = mc.cleanup, affinity.list = affinity.list)
6. lapply(X = X, FUN = FUN, ...)
7. FUN(X[[i]], ...)
8. .mapply(FUN, dots, MoreArgs)
9. (function (x, y) 
 . {

But I checked con_data_test$newCovariate, and confirmed there is a column called label.
I also tested if I replaced con_data_test$newCovariate with con_data_train$newCovariate, extract_para worked fine, but the code will stop at simu_new.

fbao-fudan · 2024-07-24T16:56:44Z

I made a minimal test script here. Really appreciate your help!

library(scDesign3)
library(SingleCellExperiment)
library(dplyr)


example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))


sce_train <- example_sce[1:10,1:50 ]
sce_test <- example_sce[1:10,51:80]


set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$newCovariate,
    filtered_gene = con_data_train$filtered_gene
)

SONGDONGYUAN1994 · 2024-07-24T21:12:10Z

Thanks for this demo! I found a bug and we will update the package as soon as possible. I will @ you once we update the package.

fbao-fudan · 2024-07-24T21:32:11Z

Thanks for this demo! I found a bug and we will update the package as soon as possible. I will @ you once we update the package.

Thank you and look forward to your update.

qw130 · 2024-07-27T01:05:31Z

Hi Feng, @feng-bao-ucsf

I just updated the code. The newest code on GitHub should work.

When I was running the demo code you provided, I noticed that one parameter (new_covariate) in simu_new was not set properly, which will cause an error when you run simu_new. Please use the following code to run simu_new.

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

The parameter new_covariate in simu_new needs to be a dataframe which contains the columns for your specified covariates and a column named corr_group. con_data_test$dat will contain all the columns needed, but con_data_test$newCovariate will not contain the column corr_group if you don't modify the ncell parameter in construct_data.

Thanks again for your interest in scDesign3. Please let us know if you have any further questions!

fbao-fudan · 2024-07-27T18:10:52Z

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,

filtered_gene = con_data_train$filtered_gene

)

Thank you so much for the quick response. I tested the code using the data from figshare, and it ran perfectly. However, when I tried to run with my own data, I found there might be some issues for the extract_para step. I tried multiple datasets and the error information was the same.

Here is my code to test the data and the data:

meta_intestine.csv

data_intestine.csv

library(scDesign3)
library(SingleCellExperiment)
library(ggplot2)
library(dplyr)


data <- read.csv("data_intestine.csv", row.names = 1)
meta <- read.csv("meta_intestine.csv", row.names = 1)

meta <- meta[, c("x", "y", "label")]

colnames(meta) <- c("x", "y", "cell_type")
# Add 1 to each element in the 'label' column
meta$label <- meta$cell_type + 1

sce <- SingleCellExperiment(
    assays = list(counts = t(data)),
    colData = meta
)
sce_train <- sce[1:10, 1:50]
sce_test <- sce[1:10, 51:80]


set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

The error info is:

Error in X[, pstart[i] - 1 + 1:object$nsdf[i]] <- Xp: number of items to replace is not a multiple of replacement length
Traceback:

1. extract_para(sce = sce_train, marginal_list = MOBSC_marginal, 
 .     n_cores = 1, family_use = "nb", new_covariate = con_data_test$newCovariate, 
 .     data = con_data_train$dat)
2. suppressMessages(paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores))
3. withCallingHandlers(expr, message = function(c) if (inherits(c, 
 .     classes)) tryInvokeRestart("muffleMessage"))
4. paraFunc(mat_function, x = seq_len(dim(sce)[1])[qc_gene_idx], 
 .     y = family_use, SIMPLIFY = FALSE, mc.cores = n_cores)
5. mclapply(seq_len(n), do_one, mc.preschedule = mc.preschedule, 
 .     mc.set.seed = mc.set.seed, mc.silent = mc.silent, mc.cores = mc.cores, 
 .     mc.cleanup = mc.cleanup, affinity.list = affinity.list)
6. lapply(X = X, FUN = FUN, ...)
7. FUN(X[[i]], ...)
8. .mapply(FUN, dots, MoreArgs)
9. (function (x, y) 
 . {
 .     fit <- marginal_list[[x]]

fbao-fudan · 2024-07-31T04:15:08Z

Dear Dongyuan and Qingyang, I also tried other example datasets from the scDesign3. It seems the same error message also happened for those. Can you help check that? Really appreciate it.

library(scDesign3)
library(SingleCellExperiment)
library(dplyr)


example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581980")))
# I also tried this dataset
# example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581965")))

sce_train <- example_sce[1:10, 1:50]
sce_test <- example_sce[1:10, 51:80]

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = NULL,
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "cell_type",
    sigma_formula = "cell_type",
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)
MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

qw130 · 2024-07-31T20:45:00Z

Hi Feng,

I just uploaded the code. The error is due to the incorrect data format that results when selecting only one column from a data frame.

I tested the newest code with your script above. The data with the link (https://figshare.com/ndownloader/files/40581980) ran with no error. The data with the link (https://figshare.com/ndownloader/files/40581965) produces an error. However, it causes an error because of the training and testing data. The training data does not contain CD14+ monocyte cells, while the testing data does; thus, the GAMLSS model can't predict the parameters for an unseen cell type. Once I removed the three CD14+ monocyte cells from the testing data, the data with the link (https://figshare.com/ndownloader/files/40581965) also ran with no error.

Thanks for letting us know about this bug, and please let us know if you have further questions.

fbao-fudan · 2024-08-01T21:17:32Z

Thank you so much for the update. It works perfectly on the test datasets. I am working to extend the test to additional datasets and will close this slot shortly.

fbao-fudan · 2024-08-02T20:37:10Z

I did another test on prediction of new samples but based on spatial positions. Here is my test code:

library(scDesign3)
library(SingleCellExperiment)
library(dplyr)

example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40582019")))
# example_sce <- readRDS((url("https://figshare.com/ndownloader/files/40581977")))
sce_train <- example_sce[1:10, 1:50]
sce_test <- example_sce[1:10, 51:80]

set.seed(123)
con_data_train <- construct_data(
    sce = sce_train,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = c("spatial1", "spatial2"),
    other_covariates = NULL,
    corr_by = "1"
)

con_data_test <- construct_data(
    sce = sce_test,
    assay_use = "counts",
    celltype = "cell_type",
    pseudotime = NULL,
    spatial = c("spatial1", "spatial2"),
    other_covariates = NULL,
    corr_by = "1"
)

MOBSC_marginal <- fit_marginal(
    data = con_data_train,
    predictor = "gene",
    mu_formula = "s(spatial1, spatial2, bs = 'gp', k= 400)",
    sigma_formula =1,
    family_use = "nb",
    n_cores = 2,
    usebam = FALSE,
    parallelization = "pbmcmapply"
)

MOBSC_copula <- fit_copula(
    sce = sce_train,
    assay_use = "counts",
    marginal_list = MOBSC_marginal,
    family_use = "nb",
    copula = "gaussian",
    n_cores = 2,
    input_data = con_data_train$dat
)

MOBSC_para <- extract_para(
    sce = sce_train,
    marginal_list = MOBSC_marginal,
    n_cores = 1,
    family_use = "nb",
    new_covariate = con_data_test$newCovariate,
    data = con_data_train$dat
)

MOBSC_newcount <- simu_new(
    sce = sce_train,
    mean_mat = MOBSC_para$mean_mat,
    sigma_mat = MOBSC_para$sigma_mat,
    zero_mat = MOBSC_para$zero_mat,
    quantile_mat = NULL,
    copula_list = MOBSC_copula$copula_list,
    n_cores = 2,
    family_use = "nb",
    input_data = con_data_train$dat,
    new_covariate = con_data_test$dat,
    filtered_gene = con_data_train$filtered_gene
)

The code stops at fit_copula, with error messages:

Error in `colnames<-`(`*tmp*`, value = rownames(sce)): attempt to set 'colnames' on an object with less than two dimensions
Traceback:

1. fit_copula(sce = sce_train, assay_use = "counts", marginal_list = MOBSC_marginal, 
 .     family_use = "nb", copula = "gaussian", n_cores = 2, input_data = con_data_train$dat)
2. convert_n(sce = sce[qc_gene_idx, ], assay_use = assay_use, marginal_list = marginal_list[qc_gene_idx], 
 .     data = input_data, DT = DT, pseudo_obs = pseudo_obs, n_cores = n_cores, 
 .     family_use = family_use, epsilon = epsilon, parallelization = parallelization, 
 .     BPPARAM = BPPARAM)
3. `colnames<-`(`*tmp*`, value = rownames(sce))
4. stop("attempt to set 'colnames' on an object with less than two dimensions")

Any ideas on this error? Really appreciate your help!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Generate new data given labels #53

Generate new data given labels #53

fbao-fudan commented Jul 24, 2024

SONGDONGYUAN1994 commented Jul 24, 2024

fbao-fudan commented Jul 24, 2024 •

edited

Loading

SONGDONGYUAN1994 commented Jul 24, 2024

fbao-fudan commented Jul 24, 2024 •

edited

Loading

fbao-fudan commented Jul 24, 2024

SONGDONGYUAN1994 commented Jul 24, 2024

fbao-fudan commented Jul 24, 2024

qw130 commented Jul 27, 2024

fbao-fudan commented Jul 27, 2024 •

edited

Loading

fbao-fudan commented Jul 31, 2024

qw130 commented Jul 31, 2024

fbao-fudan commented Aug 1, 2024

fbao-fudan commented Aug 2, 2024 •

edited

Loading

Generate new data given labels #53

Generate new data given labels #53

Comments

fbao-fudan commented Jul 24, 2024

SONGDONGYUAN1994 commented Jul 24, 2024

fbao-fudan commented Jul 24, 2024 • edited Loading

SONGDONGYUAN1994 commented Jul 24, 2024

fbao-fudan commented Jul 24, 2024 • edited Loading

fbao-fudan commented Jul 24, 2024

SONGDONGYUAN1994 commented Jul 24, 2024

fbao-fudan commented Jul 24, 2024

qw130 commented Jul 27, 2024

fbao-fudan commented Jul 27, 2024 • edited Loading

fbao-fudan commented Jul 31, 2024

qw130 commented Jul 31, 2024

fbao-fudan commented Aug 1, 2024

fbao-fudan commented Aug 2, 2024 • edited Loading

fbao-fudan commented Jul 24, 2024 •

edited

Loading

fbao-fudan commented Jul 24, 2024 •

edited

Loading

fbao-fudan commented Jul 27, 2024 •

edited

Loading

fbao-fudan commented Aug 2, 2024 •

edited

Loading