From 3dad59d824c3cb810f96c45dbe6773f2f8094d65 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Wed, 28 Feb 2024 17:16:59 -0800 Subject: [PATCH 01/15] add arrow support and ability to read zarr --- R/convenience.R | 26 ++++-- R/preprocessing.R | 214 ++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 196 insertions(+), 44 deletions(-) diff --git a/R/convenience.R b/R/convenience.R index 5c9367eca..c4cbed9f6 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -193,20 +193,34 @@ LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { "centroids" = CreateCentroids(data$centroids), "segmentation" = CreateSegmentation(data$segmentations) ) + coords <- CreateFOV( coords = segmentations.data, type = c("segmentation", "centroids"), molecules = data$microns, assay = assay ) + + slot.map <- c( + `Blank Codeword` = 'BlankCodeword', + `Unassigned Codeword` = 'BlankCodeword', + `Negative Control Codeword` = 'ControlCodeword', + `Negative Control Probe` = 'ControlProbe' + ) xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) - if("Blank Codeword" %in% names(data$matrix)) - xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Blank Codeword"]]) - else - xenium.obj[["BlankCodeword"]] <- CreateAssayObject(counts = data$matrix[["Unassigned Codeword"]]) - xenium.obj[["ControlCodeword"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Codeword"]]) - xenium.obj[["ControlProbe"]] <- CreateAssayObject(counts = data$matrix[["Negative Control Probe"]]) + + if(!is.null(data$metadata)) { + Misc(xenium.obj, 'run_metadata') <- data$metadata + } + + if(!is.null(data$segmentation_method)) { + xenium.obj <- AddMetaData(xenium.obj, data$segmentation_method) + } + + for(name in intersect(names(slot.map), names(data$matrix))) { + xenium.obj[[slot.map[name]]] <- CreateAssayObject(counts = data$matrix[[name]]) + } xenium.obj[[fov]] <- coords return(xenium.obj) diff --git a/R/preprocessing.R b/R/preprocessing.R index 6f5857d0a..58561fca9 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2102,6 +2102,7 @@ ReadNanostring <- function( #' \itemize{ #' \item \dQuote{matrix}: the counts matrix #' \item \dQuote{microns}: molecule coordinates +#' \item \dQuote{segmentation_method}: molecule coordinates #' } #' @param type Type of cell spatial coordinate matrices to read; choose one #' or more of: @@ -2131,26 +2132,28 @@ ReadNanostring <- function( #' ReadXenium <- function( data.dir, - outs = c("matrix", "microns"), + outs = c("segmentation_method", "matrix", "microns"), type = "centroids", mols.qv.threshold = 20 ) { # Argument checking type <- match.arg( arg = type, - choices = c("centroids", "segmentations"), + choices = c("centroids", "segmentations", "nucleus_segmentations"), several.ok = TRUE ) outs <- match.arg( arg = outs, - choices = c("matrix", "microns"), + choices = c("segmentation_method", "matrix", "microns"), several.ok = TRUE ) outs <- c(outs, type) has_dt <- requireNamespace("data.table", quietly = TRUE) && requireNamespace("R.utils", quietly = TRUE) + has_arrow <- requireNamespace("arrow", quietly = TRUE) + has_hdf5r <- requireNamespace("hdf5r", quietly = TRUE) data <- sapply(outs, function(otype) { switch( @@ -2158,10 +2161,65 @@ ReadXenium <- function( 'matrix' = { pmtx <- progressor() pmtx(message = 'Reading counts matrix', class = 'sticky', amount = 0) - matrix <- suppressWarnings(Read10X(data.dir = file.path(data.dir, "cell_feature_matrix/"))) + + for(option in Filter(function(x) x$req, list( + list(filename = "cell_feature_matrix.h5", fn = Read10X_h5, req = has_hdf5r), + list(filename = "cell_feature_matrix", fn = Read10X, req = TRUE) + ))) { + matrix <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(matrix, "try-error")) { break } + } + pmtx(type = "finish") matrix }, + 'segmentation_method' = { + if(!requireNamespace("stars", quietly = TRUE) || !requireNamespace("jsonlite", quietly = TRUE) || !requireNamespace("gmp", quietly = TRUE)) { + warning("Reading segmentation_method requires the `stars`, `gmp` and `jsonlite` packages") + return(NULL) + } + + if(file.exists(file.path(data.dir, "cells.zarr.zip"))) { + pcents <- progressor() + pcents( + message = 'Loading cell metadata', + class = 'sticky', + amount = 0 + ) + + tempdir <- path.expand(tempdir()) + unzip(file.path(data.dir, "cells.zarr.zip"), exdir = tempdir) + zattr <- jsonlite::read_json(file.path(tempdir, '.zattrs')) + which_entry <- which(unlist(zattr$polygon_set_names) == 'cell') + + indices <- stars::read_mdim(file.path(tempdir, 'polygon_sets', which_entry - 1, 'cell_index'))$cell_index + 1 + indices[is.na(indices)] <- 1 + + ids <- stars::read_mdim(file.path(tempdir, 'cell_id'))$cell_id + ids[is.na(ids)] <- 0 + + ids <- paste0( + gsub(' ', 'a', sprintf('%8s', sapply( + strsplit(as.character(gmp::as.bigz(ids[1,]), 16), ''), + function(id) { + rawToChar(as.raw(sapply(id, function(x) { + as.numeric(charToRaw(x)) + + ifelse(is.na(suppressWarnings(as.numeric(x))), 10, 49) + }))) + } + ))), '-', ids[2,]) + + method <- stars::read_mdim(file.path(tempdir, 'polygon_sets', which_entry - 1, 'method'))$method + 1 + method[is.na(method)] <- 1 + + segmentation_method <- unlist(zattr$segmentation_methods)[method] + + pcents(type = "finish") + data.frame(segmentation_method = segmentation_method, row.names = ids) + } else { + NULL + } + }, 'centroids' = { pcents <- progressor() pcents( @@ -2169,19 +2227,32 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - if (has_dt) { - cell_info <- as.data.frame(data.table::fread(file.path(data.dir, "cells.csv.gz"))) - } else { - cell_info <- read.csv(file.path(data.dir, "cells.csv.gz")) + + col.use <- c(x_centroid = 'x', y_centroid = 'y', cell_id = 'cell') + + for(option in Filter(function(x) x$req, list( + list( + filename = "cells.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), + req = has_arrow + ), + list( + filename = "cells.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE, select = names(col.use)), + req = has_dt + ), + list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + cell_info <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(cell_info, "try-error")) { break } } - cell_centroid_df <- data.frame( - x = cell_info$x_centroid, - y = cell_info$y_centroid, - cell = cell_info$cell_id, - stringsAsFactors = FALSE - ) + + cell_info <- cell_info[, names(col.use)] + colnames(cell_info) <- col.use + pcents(type = 'finish') - cell_centroid_df + + cell_info }, 'segmentations' = { psegs <- progressor() @@ -2190,17 +2261,61 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - - # load cell boundaries - if (has_dt) { - cell_boundaries_df <- as.data.frame(data.table::fread(file.path(data.dir, "cell_boundaries.csv.gz"))) - } else { - cell_boundaries_df <- read.csv(file.path(data.dir, "cell_boundaries.csv.gz"), stringsAsFactors = FALSE) + + for(option in Filter(function(x) x$req, list( + list( + filename = "cell_boundaries.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x)), + req = has_arrow + ), + list( + filename = "cell_boundaries.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE), + req = has_dt + ), + list(filename = "cell_boundaries.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + cell_boundaries_df <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(cell_boundaries_df, "try-error")) { break } } - names(cell_boundaries_df) <- c("cell", "x", "y") + + colnames(cell_boundaries_df) <- c('cell', 'x', 'y') + psegs(type = "finish") + cell_boundaries_df }, + 'nucleus_segmentations' = { + psegs <- progressor() + psegs( + message = 'Loading nucleus segmentations', + class = 'sticky', + amount = 0 + ) + + for(option in Filter(function(x) x$req, list( + list( + filename = "nucleus_boundaries.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x)), + req = has_arrow + ), + list( + filename = "nucleus_boundaries.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE), + req = has_dt + ), + list(filename = "nucleus_boundaries.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + nucleus_boundaries_df <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(nucleus_boundaries_df, "try-error")) { break } + } + + colnames(nucleus_boundaries_df) <- c('cell', 'x', 'y') + + psegs(type = "finish") + + nucleus_boundaries_df + }, 'microns' = { pmicrons <- progressor() pmicrons( @@ -2209,28 +2324,51 @@ ReadXenium <- function( amount = 0 ) - # molecules - if (has_dt) { - tx_dt <- as.data.frame(data.table::fread(file.path(data.dir, "transcripts.csv.gz"))) - transcripts <- subset(tx_dt, qv >= mols.qv.threshold) - } else { - transcripts <- read.csv(file.path(data.dir, "transcripts.csv.gz")) - transcripts <- subset(transcripts, qv >= mols.qv.threshold) + col.use = c(x_location = 'x', y_location = 'y', feature_name = 'gene') + + for(option in Filter(function(x) x$req, list( + list( + filename = "transcripts.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), + req = has_arrow + ), + list( + filename = "transcripts.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, select = names(col.use), stringsAsFactors = FALSE), + req = has_dt + ), + list(filename = "transcripts.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + transcripts <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(transcripts, "try-error")) { break } } - - df <- - data.frame( - x = transcripts$x_location, - y = transcripts$y_location, - gene = transcripts$feature_name, - stringsAsFactors = FALSE - ) + + transcripts <- transcripts[, names(col.use)] + colnames(transcripts) <- col.use + pmicrons(type = 'finish') - df + + transcripts }, stop("Unknown Xenium input type: ", otype) ) }, USE.NAMES = TRUE) + + metadata <- file.path(data.dir, "experiment.xenium") + if(file.exists(metadata) && requireNamespace("jsonlite", quietly = TRUE)) { + meta <- jsonlite::read_json(metadata) + data$metadata <- meta[ + intersect( + names(meta), + c( + 'run_name', 'run_start_time', 'region_name', + 'preservation_method', 'panel_name', 'panel_organism', + 'panel_tissue_type', 'instrument_sw_version', + 'segmentation_stain' + ) + ) + ] + } return(data) } From b3a4c010ac10c0eec93233986431c6f4b97868cf Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Wed, 28 Feb 2024 17:24:10 -0800 Subject: [PATCH 02/15] wwip --- vignettes/seurat5_spatial_vignette_2.Rmd | 113 ++++++++++++++++------- 1 file changed, 79 insertions(+), 34 deletions(-) diff --git a/vignettes/seurat5_spatial_vignette_2.Rmd b/vignettes/seurat5_spatial_vignette_2.Rmd index f3e765f21..88395262f 100644 --- a/vignettes/seurat5_spatial_vignette_2.Rmd +++ b/vignettes/seurat5_spatial_vignette_2.Rmd @@ -6,7 +6,8 @@ output: df_print: kable date: 'Compiled: `r format(Sys.Date(), "%B %d, %Y")`' --- -*** + +------------------------------------------------------------------------ ```{r setup, include=FALSE} all_times <- list() # store the time for each chunk @@ -37,9 +38,9 @@ In this vignette, we introduce a Seurat extension to analyze new types of spatia We update the Seurat infrastructure to enable the analysis, visualization, and exploration of these exciting datasets. In this vignette, we focus on three datasets produced by different multiplexed imaging technologies, each of which is publicly available. We will be adding support for additional imaging-based technologies in the coming months. -* Vizgen MERSCOPE (Mouse Brain) -* Nanostring CosMx Spatial Molecular Imager (FFPE Human Lung) -* Akoya CODEX (Human Lymph Node) +- Vizgen MERSCOPE (Mouse Brain) +- Nanostring CosMx Spatial Molecular Imager (FFPE Human Lung) +- Akoya CODEX (Human Lymph Node) First, we load the packages necessary for this vignette. @@ -54,11 +55,11 @@ library(ggplot2) This dataset was produced using the Vizgen MERSCOPE system, which utilizes the MERFISH technology. The total dataset is available for [public download](https://info.vizgen.com/mouse-brain-data), and contains nine samples (three full coronal slices of the mouse brain, with three biological replicates per slice). The gene panel consists of 483 gene targets, representing known anonical cell type markers, nonsensory G-Protein coupled receptors (GPCRs), and Receptor Tyrosine Kinases (RTKs). In this vignette, we analyze one of the samples - slice 2, replicate 1. The median number of transcripts detected in each cell is 206. -First, we read in the dataset and create a Seurat object. +First, we read in the dataset and create a Seurat object. -We use the `LoadVizgen()` function, which we have written to read in the output of the Vizgen analysis pipeline. The resulting Seurat object contains the following information: +We use the `LoadVizgen()` function, which we have written to read in the output of the Vizgen analysis pipeline. The resulting Seurat object contains the following information: -* A count matrix, indicating the number of observed molecules for each of the 483 transcripts in each cell. This matrix is analogous to a count matrix in scRNA-seq, and is stored by default in the RNA assay of the Seurat object +- A count matrix, indicating the number of observed molecules for each of the 483 transcripts in each cell. This matrix is analogous to a count matrix in scRNA-seq, and is stored by default in the RNA assay of the Seurat object ```{r, message=FALSE, warning=FALSE} # Loading segmentations is a slow process and multi processing with the future pacakge is recommended @@ -68,32 +69,42 @@ vizgen.obj <- LoadVizgen(data.dir = "/brahms/hartmana/vignette_data/vizgen/s2r1/ The next pieces of information are specific to imaging assays, and is stored in the images slot of the resulting Seurat object:
- **Cell Centroids: The spatial coordinates marking the centroid for each cell being profiled** + +**Cell Centroids: The spatial coordinates marking the centroid for each cell being profiled** ```{r} # Get the center position of each centroid. There is one row per cell in this dataframe. head(GetTissueCoordinates(vizgen.obj[["s2r1"]][["centroids"]])) ``` +
+
- **Cell Segmentation Boundaries: The spatial coordinates that describe the polygon segmentation of each single cell** + +**Cell Segmentation Boundaries: The spatial coordinates that describe the polygon segmentation of each single cell** ```{r} # Get the coordinates for each segmentation vertice. Each cell will have a variable number of vertices describing its shape. head(GetTissueCoordinates(vizgen.obj[["s2r1"]][["segmentation"]])) ``` +
+
- **Molecule positions: The spatial coordinates for each individual molecule that was detected during the multiplexed smFISH experiment.** + +**Molecule positions: The spatial coordinates for each individual molecule that was detected during the multiplexed smFISH experiment.** ```{r} # Fetch molecules positions for Chrm1 head(FetchData(vizgen.obj[["s2r1"]][["molecules"]], vars="Chrm1")) ``` +
+ \ ## Preprocessing and unsupervised analysis + We start by performing a standard unsupervised clustering analysis, essentially first treating the dataset as an scRNA-seq experiment. We use SCTransform-based normalization, though we slightly modify the default clipping parameters to mitigate the effect of outliers that we occasionally observe in smFISH experiments. After normalization, we can run dimensional reduction and clustering. ```{r analysis, results='hide'} @@ -117,18 +128,20 @@ ImageDimPlot(vizgen.obj, fov = "s2r1", cols = "polychrome", axes = TRUE) You can also customize multiple aspect of the plot, including the color scheme, cell border widths, and size (see below).
- **Customizing spatial plots in Seurat** + +**Customizing spatial plots in Seurat** The `ImageDimPlot()` and `ImageFeaturePlot()` functions have a few parameters which you can customize individual visualizations. These include: -* alpha: Ranges from 0 to 1. Sets the transparency of within-cell coloring. -* size: determines the size of points representing cells, if centroids are being plotted -* cols: Sets the color scheme for the internal shading of each cell. Examples settings are `polychrome`, `glasbey`, `Paired`, `Set3`, and `parade`. Default is the ggplot2 color palette -* shuffle.cols: In some cases the selection of `cols` is more effective when the same colors are assigned to different clusters. Set `shuffle.cols = TRUE` to randomly shuffle the colors in the palette. -* border.size: Sets the width of the cell segmentation borders. By default, segmentations are plotted with a border size of 0.3 and centroids are plotted without border. -* border.color: Sets the color of the cell segmentation borders -* dark.background: Sets a black background color (TRUE by default) -* axes: Display +- alpha: Ranges from 0 to 1. Sets the transparency of within-cell coloring. +- size: determines the size of points representing cells, if centroids are being plotted +- cols: Sets the color scheme for the internal shading of each cell. Examples settings are `polychrome`, `glasbey`, `Paired`, `Set3`, and `parade`. Default is the ggplot2 color palette +- shuffle.cols: In some cases the selection of `cols` is more effective when the same colors are assigned to different clusters. Set `shuffle.cols = TRUE` to randomly shuffle the colors in the palette. +- border.size: Sets the width of the cell segmentation borders. By default, segmentations are plotted with a border size of 0.3 and centroids are plotted without border. +- border.color: Sets the color of the cell segmentation borders +- dark.background: Sets a black background color (TRUE by default) +- axes: Display +
Since it can be difficult to visualize the spatial localization patterns of an individual cluster when viewing them all together, we can highlight all cells that belong to a particular cluster: @@ -139,7 +152,7 @@ p2 <- ImageDimPlot(vizgen.obj, fov = "s2r1", cols = "red", cells = WhichCells(vi p1 + p2 ``` -We can find markers of individual clusters and visualize their spatial expression pattern. We can color cells based on their quantified expression of an individual gene, using `ImageFeaturePlot()`, which is analagous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. Since MERFISH images individual molecules, we can also visualize the location of individual *molecules*. +We can find markers of individual clusters and visualize their spatial expression pattern. We can color cells based on their quantified expression of an individual gene, using `ImageFeaturePlot()`, which is analagous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. Since MERFISH images individual molecules, we can also visualize the location of individual *molecules*. ```{r, fig.height=7, fig.width=12} p1 <- ImageFeaturePlot(vizgen.obj, features = "Slc17a7") @@ -160,7 +173,7 @@ p1 + p2 The updated Seurat spatial framework has the option to treat cells as individual points, or also to visualize cell boundaries (segmentations). By default, Seurat ignores cell segmentations and treats each cell as a point ('centroids'). This speeds up plotting, especially when looking at large areas, where cell boundaries are too small to visualize. -We can zoom into a region of tissue, creating a new field of view. For example, we can zoom into a region that contains the hippocampus. Once zoomed-in, we can set `DefaultBoundary()` to show cell segmentations. You can also 'simplify' the cell segmentations, reducing the number of edges in each polygon to speed up plotting. +We can zoom into a region of tissue, creating a new field of view. For example, we can zoom into a region that contains the hippocampus. Once zoomed-in, we can set `DefaultBoundary()` to show cell segmentations. You can also 'simplify' the cell segmentations, reducing the number of edges in each polygon to speed up plotting. ```{r, fig.height=5, fig.width=14} # create a Crop @@ -187,18 +200,41 @@ p1 + p2 + p3 ```
- **What is the tol parameter?** -The tol parameter determines how simplified the resulting segmentations are. A higher value of tol will reduce the number of vertices more drastically which will speed up plotting, but some segmentation detail will be lost. See https://rgeos.r-forge.r-project.org/reference/topo-unary-gSimplify.html for examples using different values for tol. +**What is the tol parameter?** + +The tol parameter determines how simplified the resulting segmentations are. A higher value of tol will reduce the number of vertices more drastically which will speed up plotting, but some segmentation detail will be lost. See for examples using different values for tol.
We can visualize individual molecules plotted at higher resolution after zooming-in + ```{r, fig.height=8, fig.width=8} # Since there is nothing behind the segmentations, alpha will slightly mute colors ImageDimPlot(vizgen.obj, fov = "hippo", molecules = rownames(markers.14)[1:4], cols = "polychrome", mols.size = 1, alpha = 0.5, mols.cols = c("red", "blue", "yellow", "green")) ``` +# Human Lung: 10x Genomics Xenium In Situ + +This dataset is a preview of the Xenium multimodal cell segmentation solution using a development version of the assay user guide and analysis software. It uses the [Xenium Human Multi-Tissue and Cancer Panel](https://www.10xgenomics.com/support/in-situ-gene-expression/documentation/steps/panel-design/pre-designed-xenium-gene-expression-panels) (377 genes) which was pre-designed by 10x Genomics. In this vignette, we will demonstrate how to load Xenium data for analysis and visualization using Seurat and, in particular, how to parse and visualize cell metadata. Note that this vignette requires the use of some optional dependencies in Seurat, namely `stars`, `jsonlite` and `gmp` in order to read data from `.zarr` files. + +This uses the full Xenium output bundle available from the [FFPE Human Lung Cancer with Xenium Multimodal Cell Segmentation Preview Data](https://www.10xgenomics.com/datasets/preview-data-ffpe-human-lung-cancer-with-xenium-multimodal-cell-segmentation-1-standard) page, which can be downloaded as described below. + +```{bash, eval=FALSE} +wget https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_humanLung_Cancer_FFPE/Xenium_V1_humanLung_Cancer_FFPE_outs.zip +unzip Xenium_V1_humanLung_Cancer_FFPE_outs.zip +``` + +We will first load in the dataset and create the Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. Note that this dataset is moderate size. Installing `arrow` will permit you to load the data from Parquet files, which is much more efficient than from csv. + +```{r, results='hide'} +path <- "~/yard/data/Xenium_V1_humanLung_Cancer_FFPE_outs" +# Load the Xenium data +xenium.obj <- LoadXenium(path, fov = "fov") +# remove cells with 0 counts +xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) +``` + # Mouse Brain: 10x Genomics Xenium In Situ In this section we'll analyze data produced by the Xenium platform. The vignette demonstrates how to load the per-transcript location data, cell x gene matrix, cell segmentation, and cell centroid information available in the Xenium outputs. The resulting Seurat object will contain the gene expression profile of each cell, the centroid and boundary of each cell, and the location of each individual detected transcript. The per-cell gene expression profiles are similar to standard single-cell RNA-seq and can be analyzed using the same tools. @@ -223,11 +259,13 @@ xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) Spatial information is loaded into slots of the Seurat object, labelled by the name of "field of view" (FOV) being loaded. Initially all the data is loaded into the FOV named `fov`. Later, we will make a cropped FOV that zooms into a region of interest. Standard QC plots provided by Seurat are available via the `Xenium` assay. Here are violin plots of genes per cell (`nFeature_Xenium`) and transcript counts per cell (`nCount_Xenium`) + ```{r vlnplot.xenium} VlnPlot(xenium.obj, features = c("nFeature_Xenium", "nCount_Xenium"), ncol = 2, pt.size = 0) ``` Next, we plot the positions of the pan-inhibitory neuron marker Gad1, inhibitory neuron sub-type markers Pvalb, and Sst, and astrocyte marker Gfap on the tissue using `ImageDimPlot()`. + ```{r p2.xenium, fig.width=10, fig.height=8} ImageDimPlot(xenium.obj, fov = "fov", molecules = c("Gad1", "Sst", "Pvalb", "Gfap"), nmols = 20000) ``` @@ -238,11 +276,13 @@ ggsave(filename = "../output/images/spatial_vignette_2.jpg", height = 5, width = ``` Here we visualize the expression level of some key layer marker genes at the per-cell level using `ImageFeaturePlot()` which is analogous to the `FeaturePlot()` function for visualizing expression on a 2D embedding. We manually adjust the `max.cutoff` for each gene to roughly the 90th percentile (which can be specified with `max.cutoff='q90'`) of it's count distribution to improve contrast. + ```{r mat.xenium, message=FALSE, warning=FALSE, fig.width=12, fig.height=12} ImageFeaturePlot(xenium.obj, features = c("Cux2", "Rorb", "Bcl11b", "Foxp2"), max.cutoff = c(25, 35, 12, 10), size = 0.75, cols = c("white", "red")) ``` We can zoom in on a chosen area with the `Crop()` function. Once zoomed-in, we can visualize cell segmentation boundaries along with individual molecules. + ```{r cropping.xenium, message=FALSE, warning=FALSE, fig.width=10, fig.height=8} cropped.coords <- Crop(xenium.obj[["fov"]], x = c(1200, 2900), y = c(3750, 4550), coords = "plot") xenium.obj[["zoom"]] <- cropped.coords @@ -255,6 +295,7 @@ ImageDimPlot(xenium.obj, fov = "zoom", ``` Next, we use SCTransform for normalization followed by standard dimensionality reduction and clustering. This step takes about 5 minutes from start to finish. + ```{r unsupervised.xenium, results='hide'} xenium.obj <- SCTransform(xenium.obj, assay = "Xenium") xenium.obj <- RunPCA(xenium.obj, npcs = 30, features = rownames(xenium.obj)) @@ -264,22 +305,24 @@ xenium.obj <- FindClusters(xenium.obj, resolution = 0.3) ``` We can then visualize the results of the clustering by coloring each cell according to its cluster either in UMAP space with `DimPlot()` or overlaid on the image with `ImageDimPlot()`. + ```{r umap.xenium, fig.width=10, fig.height=7} DimPlot(xenium.obj) ``` We can visualize the expression level of the markers we looked at earlier on the UMAP coordinates. + ```{r features.xenium, fig.width=8, fig.height=10} FeaturePlot(xenium.obj, features = c("Cux2", "Bcl11b", "Foxp2", "Gad1", "Sst", "Gfap")) ``` We can now use `ImageDimPlot()` to color the cell positions colored by the cluster labels determined in the previous step. + ```{r clusters.xenium, fig.width=13, fig.height=13} ImageDimPlot(xenium.obj, cols = "polychrome", size = 0.75) ``` -Using the positional information of each cell, we compute spatial niches. -We use a cortex reference from the the Allen Brain Institute to annotate cells, so we first crop the dataset to the cortex. The Allen Brain reference can be installed [here](https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1). +Using the positional information of each cell, we compute spatial niches. We use a cortex reference from the the Allen Brain Institute to annotate cells, so we first crop the dataset to the cortex. The Allen Brain reference can be installed [here](https://www.dropbox.com/s/cuowvm4vrf65pvq/allen_cortex.rds?dl=1). Below, we use Slc17a7 expression to help determine the cortical region. @@ -355,7 +398,7 @@ keep.cells <- Cells(xenium.obj)[!is.na(xenium.obj$predicted.celltype)] xenium.obj <- subset(xenium.obj, cells = keep.cells) ``` -While the previous analyses consider each cell independently, spatial data enables cells to be defined not just by their neighborhood, but also by their broader spatial context. In Seurat v5, we introduce support for 'niche' analysis of spatial data, which demarcates regions of tissue ('niches'), each of which is defined by a different composition of spatially adjacent cell types. Inspired by methods in [Goltsev et al, Cell 2018](https://www.sciencedirect.com/science/article/pii/S0092867418309048) and [He et al, NBT 2022](https://www.nature.com/articles/s41587-022-01483-z), we consider the 'local neighborhood' for each cell - consisting of its `k.neighbor` spatially closest neighbors, and count the occurrences of each cell type present in this neighborhood. We then use k-means clustering to group cells that have similar neighborhoods together, into spatial niches. +While the previous analyses consider each cell independently, spatial data enables cells to be defined not just by their neighborhood, but also by their broader spatial context. In Seurat v5, we introduce support for 'niche' analysis of spatial data, which demarcates regions of tissue ('niches'), each of which is defined by a different composition of spatially adjacent cell types. Inspired by methods in [Goltsev et al, Cell 2018](https://www.sciencedirect.com/science/article/pii/S0092867418309048) and [He et al, NBT 2022](https://www.nature.com/articles/s41587-022-01483-z), we consider the 'local neighborhood' for each cell - consisting of its `k.neighbor` spatially closest neighbors, and count the occurrences of each cell type present in this neighborhood. We then use k-means clustering to group cells that have similar neighborhoods together, into spatial niches. We call the `BuildNicheAssay` function from within Seurat to construct a new assay called `niche` containing the cell type composition spatially neighboring each cell. A metadata column called `niches` is also returned, which contains cluster assignments based on the niche assay. @@ -402,11 +445,11 @@ table(xenium.obj$predicted.celltype, xenium.obj$niches) # Human Lung: Nanostring CosMx Spatial Molecular Imager -This dataset was produced using Nanostring CosMx Spatial Molecular Imager (SMI). The CosMX SMI performs multiplexed single molecule profiling, can profile both RNA and protein targets, and can be applied directly to FFPE tissues. The dataset represents 8 FFPE samples taken from 5 non-small-cell lung cancer (NSCLC) tissues, and is available for [public download](https://www.nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/). The gene panel consists of 960 transcripts. +This dataset was produced using Nanostring CosMx Spatial Molecular Imager (SMI). The CosMX SMI performs multiplexed single molecule profiling, can profile both RNA and protein targets, and can be applied directly to FFPE tissues. The dataset represents 8 FFPE samples taken from 5 non-small-cell lung cancer (NSCLC) tissues, and is available for [public download](https://www.nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/). The gene panel consists of 960 transcripts. -In this vignette, we load one of 8 samples (lung 5, replicate 1). We use the `LoadNanostring()` function, which parses the outputs available on the public download site. Note that the coordinates for the cell boundaries were provided by Nanostring by request, and are available for download [here](https://www.dropbox.com/s/hl3peavrx92bluy/Lung5_Rep1-polygons.csv?dl=0). +In this vignette, we load one of 8 samples (lung 5, replicate 1). We use the `LoadNanostring()` function, which parses the outputs available on the public download site. Note that the coordinates for the cell boundaries were provided by Nanostring by request, and are available for download [here](https://www.dropbox.com/s/hl3peavrx92bluy/Lung5_Rep1-polygons.csv?dl=0). -For this dataset, instead of performing unsupervised analysis, we map the Nanostring profiles to our Azimuth Healthy Human Lung reference, which was defined by scRNA-seq. We used Azimuth version 0.4.3 with the [human lung](https://azimuth.hubmapconsortium.org/references/#Human%20-%20Lung%20v1) reference version 1.0.0. You can download the precomputed results [here](https://seurat.nygenome.org/vignette_data/spatial_vignette_2/nanostring_data.Rds), which include annotations, prediction scores, and a UMAP visualization. The median number of detected transcripts/cell is 249, which does create uncertainty for the annotation process. +For this dataset, instead of performing unsupervised analysis, we map the Nanostring profiles to our Azimuth Healthy Human Lung reference, which was defined by scRNA-seq. We used Azimuth version 0.4.3 with the [human lung](https://azimuth.hubmapconsortium.org/references/#Human%20-%20Lung%20v1) reference version 1.0.0. You can download the precomputed results [here](https://seurat.nygenome.org/vignette_data/spatial_vignette_2/nanostring_data.Rds), which include annotations, prediction scores, and a UMAP visualization. The median number of detected transcripts/cell is 249, which does create uncertainty for the annotation process. ```{r load} nano.obj <- LoadNanostring(data.dir = "/brahms/hartmana/vignette_data/nanostring/lung5_rep1", fov="lung5.rep1") @@ -427,7 +470,7 @@ nano.obj <- SCTransform(nano.obj, assay = "Nanostring", clip.range = c(-10, 10), head(slot(object = nano.obj, name = "meta.data")[2:5]) ``` -We can visualize the Nanostring cells and annotations, projected onto the reference-defined UMAP. Note that for this NSCLC sample, tumor samples are annotated as 'basal', which is the closest cell type match in the healthy reference. +We can visualize the Nanostring cells and annotations, projected onto the reference-defined UMAP. Note that for this NSCLC sample, tumor samples are annotated as 'basal', which is the closest cell type match in the healthy reference. ```{r, fig.width=9, fig.height=4} DimPlot(nano.obj) @@ -491,8 +534,7 @@ ImageDimPlot(nano.obj, fov = "zoom1", cols = "polychrome", alpha = 0.3, molecule This dataset was produced using Akoya CODEX system. The CODEX system performs multiplexed spatially-resolved protein profiling, iteratively visualizing antibody-binding events. The dataset here represents a tissue section from a human lymph node, and was generated by the University of Florida as part of the Human Biomolecular Atlas Program (HuBMAP). More information about the sample and experiment is available [here](https://portal.hubmapconsortium.org/browse/dataset/c95d9373d698faf60a66ffdc27499fe1). The protein panel in this dataset consists of 28 markers, and protein intensities were quantified as part of the Akoya processor pipeline, which outputs a CSV file providing the intensity of each marker in each cell, as well as the cell coordinates. The file is available for public download via Globus [here](https://app.globus.org/file-manager?origin_id=af603d86-eab9-4eec-bb1d-9d26556741bb&origin_path=%2Fc95d9373d698faf60a66ffdc27499fe1%2Fdrv_CX_20-008_lymphnode_n10_reg001%2Fprocessed_2020-12-2320-008LNn10r001%2Fsegm%2Fsegm-1%2Ffcs%2Fcompensated%2F). - -First, we load in the data of a HuBMAP dataset using the `LoadAkoya()` function in Seurat: +First, we load in the data of a HuBMAP dataset using the `LoadAkoya()` function in Seurat: ```{r} codex.obj <- LoadAkoya( @@ -524,7 +566,7 @@ DimPlot(codex.obj, label = TRUE, label.box = TRUE) + NoLegend() ImageDimPlot(codex.obj, cols = "parade") ``` -The expression patters of individual markers clearly denote different cell types and spatial structures, including Lyve1 (lymphatic endothelial cells), CD34 (blood endothelial cells), and CD21 (B cells). As expected, endothelial cells group together into vessels, and B cells are key components of specialized microstructures known as germinal zones. You can read more about protein markers in this dataset, as well as cellular networks in human lynmphatic tissues, in this [preprint](https://www.biorxiv.org/content/10.1101/2021.10.20.465151v1.full). +The expression patters of individual markers clearly denote different cell types and spatial structures, including Lyve1 (lymphatic endothelial cells), CD34 (blood endothelial cells), and CD21 (B cells). As expected, endothelial cells group together into vessels, and B cells are key components of specialized microstructures known as germinal zones. You can read more about protein markers in this dataset, as well as cellular networks in human lynmphatic tissues, in this [preprint](https://www.biorxiv.org/content/10.1101/2021.10.20.465151v1.full). ```{r, fig.width=9, fig.height=8} p1 <- ImageFeaturePlot(codex.obj, fov = "HBM754.WKLP.262", features = c("CD34", "CD21", "Lyve1"), min.cutoff = "q10", max.cutoff = "q90") @@ -535,10 +577,13 @@ p1 + p2 Each of these datasets represents an opportunity to learn organizing principles that govern the spatial localization of different cell types. Stay tuned for future updates to Seurat enabling further exploration and characterization of the relationship between spatial position and molecular state.
- **Session Info** + +**Session Info** + ```{r} sessionInfo() ``` +
```{r save.times, include=FALSE} From d55be3ab6a2196fb1e7183d6b3da6793bea52db6 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Thu, 29 Feb 2024 16:02:21 -0800 Subject: [PATCH 03/15] clean up the API a bit --- R/convenience.R | 76 +++++++++++++++++++++++++++++++++++++++-------- R/preprocessing.R | 4 ++- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/R/convenience.R b/R/convenience.R index c4cbed9f6..4e06343bc 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -175,31 +175,81 @@ LoadVizgen <- function(data.dir, fov, assay = 'Vizgen', z = 3L) { #' @param data.dir Path to folder containing Nanostring SMI outputs #' @param fov FOV name #' @param assay Assay name +#' @param mols.qv.threshold Remove transcript molecules with +#' a QV less than this threshold. QV >= 20 is the standard threshold +#' used to construct the cell x gene count matrix. +#' @param cell.centroids Whether or not to load cell centroids +#' @param molecule.coordinates Whether or not to load molecule pixel coordinates +#' @param segmentations One of "cell", "nucleus" or NULL (to load either cell +#' segmentations, nucleus segmentations or neither) #' #' @importFrom SeuratObject Cells CreateCentroids CreateFOV -#' CreateSegmentation CreateSeuratObject +#' CreateSegmentation CreateSeuratObject CreateMolecules #' #' @export #' #' @rdname ReadXenium #' -LoadXenium <- function(data.dir, fov = 'fov', assay = 'Xenium') { +LoadXenium <- function( + data.dir, + fov = 'fov', + assay = 'Xenium', + mols.qv.threshold = 20, + cell.centroids = TRUE, + molecule.coordinates = TRUE, + segmentations = NULL +) { + if(!is.null(segmentations) && !(segmentations %in% c('nucleus', 'cell'))) { + stop('segmentations must be NULL or one of "nucleus", "cell"') + } + + if(!cell.centroids && is.null(segmentations)) { + stop( + "Must load either centroids or cell/nucleus segmentations" + ) + } + data <- ReadXenium( data.dir = data.dir, - type = c("centroids", "segmentations"), + type = c("centroids", "segmentations", "nucleus_segmentations")[ + c(cell.centroids, isTRUE(segmentations == 'cell'), isTRUE(segmentations == 'nucleus')) + ], + outs = c("segmentation_method", "matrix", "microns")[ + c(cell.centroids || isTRUE(segmentations != 'nucleus'), TRUE, molecule.coordinates && (cell.centroids || !is.null(segmentations))) + ], + mols.qv.threshold = mols.qv.threshold ) + + segmentations <- intersect(c("segmentations", "nucleus_segmentations"), names(data)) - segmentations.data <- list( - "centroids" = CreateCentroids(data$centroids), - "segmentation" = CreateSegmentation(data$segmentations) - ) + segmentations.data <- Filter(Negate(is.null), list( + centroids = if(is.null(data$centroids)) { + NULL + } else { + CreateCentroids(data$centroids) + }, + segmentations = if(length(segmentations) > 0) { + CreateSegmentation( + data[[segmentations]] + ) + } else { + NULL + } + )) - coords <- CreateFOV( - coords = segmentations.data, - type = c("segmentation", "centroids"), - molecules = data$microns, - assay = assay - ) + coords <- if(length(segmentations.data) > 0) { + CreateFOV( + segmentations.data, + assay = assay, + molecules = if(is.null(data$microns)) { + NULL + } else { + CreateMolecules(data$microns) + } + ) + } else { + NULL + } slot.map <- c( `Blank Codeword` = 'BlankCodeword', diff --git a/R/preprocessing.R b/R/preprocessing.R index 58561fca9..44d02f1a8 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2102,13 +2102,15 @@ ReadNanostring <- function( #' \itemize{ #' \item \dQuote{matrix}: the counts matrix #' \item \dQuote{microns}: molecule coordinates -#' \item \dQuote{segmentation_method}: molecule coordinates +#' \item \dQuote{segmentation_method}: cell segmentation method (for runs which +#' use multi-modal segmentation) #' } #' @param type Type of cell spatial coordinate matrices to read; choose one #' or more of: #' \itemize{ #' \item \dQuote{centroids}: cell centroids in pixel coordinate space #' \item \dQuote{segmentations}: cell segmentations in pixel coordinate space +#' \item \dQuote{nucleus_segmentations}: nucleus segmentations in pixel coordinate space #' } #' @param mols.qv.threshold Remove transcript molecules with #' a QV less than this threshold. QV >= 20 is the standard threshold From 87396254d59ae9f8cdcfdfcbc11b62ff2bf4c110 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Fri, 1 Mar 2024 15:26:45 -0800 Subject: [PATCH 04/15] bulk out vignette --- R/convenience.R | 6 +- R/preprocessing.R | 34 ++++++++--- vignettes/seurat5_spatial_vignette_2.Rmd | 78 +++++++++++++++++++++--- 3 files changed, 100 insertions(+), 18 deletions(-) diff --git a/R/convenience.R b/R/convenience.R index 4e06343bc..0170fad31 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -197,7 +197,8 @@ LoadXenium <- function( mols.qv.threshold = 20, cell.centroids = TRUE, molecule.coordinates = TRUE, - segmentations = NULL + segmentations = NULL, + flip.xy = FALSE ) { if(!is.null(segmentations) && !(segmentations %in% c('nucleus', 'cell'))) { stop('segmentations must be NULL or one of "nucleus", "cell"') @@ -217,7 +218,8 @@ LoadXenium <- function( outs = c("segmentation_method", "matrix", "microns")[ c(cell.centroids || isTRUE(segmentations != 'nucleus'), TRUE, molecule.coordinates && (cell.centroids || !is.null(segmentations))) ], - mols.qv.threshold = mols.qv.threshold + mols.qv.threshold = mols.qv.threshold, + flip.xy = flip.xy ) segmentations <- intersect(c("segmentations", "nucleus_segmentations"), names(data)) diff --git a/R/preprocessing.R b/R/preprocessing.R index 44d02f1a8..4f6171830 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2136,7 +2136,8 @@ ReadXenium <- function( data.dir, outs = c("segmentation_method", "matrix", "microns"), type = "centroids", - mols.qv.threshold = 20 + mols.qv.threshold = 20, + flip.xy = F ) { # Argument checking type <- match.arg( @@ -2230,7 +2231,11 @@ ReadXenium <- function( amount = 0 ) - col.use <- c(x_centroid = 'x', y_centroid = 'y', cell_id = 'cell') + col.use <- c( + x_centroid = letters[24 + flip.xy], + y_centroid = letters[25 - flip.xy], + cell_id = 'cell' + ) for(option in Filter(function(x) x$req, list( list( @@ -2281,7 +2286,11 @@ ReadXenium <- function( if(!inherits(cell_boundaries_df, "try-error")) { break } } - colnames(cell_boundaries_df) <- c('cell', 'x', 'y') + colnames(cell_boundaries_df) <- c( + 'cell', + letters[24 + flip.xy], + letters[25 - flip.xy] + ) psegs(type = "finish") @@ -2312,7 +2321,11 @@ ReadXenium <- function( if(!inherits(nucleus_boundaries_df, "try-error")) { break } } - colnames(nucleus_boundaries_df) <- c('cell', 'x', 'y') + colnames(nucleus_boundaries_df) <- c( + 'cell', + letters[24 + flip.xy], + letters[25 - flip.xy] + ) psegs(type = "finish") @@ -2326,7 +2339,11 @@ ReadXenium <- function( amount = 0 ) - col.use = c(x_location = 'x', y_location = 'y', feature_name = 'gene') + col.use = c( + x_location = letters[24+flip.xy], + y_location = letters[25-flip.xy], + feature_name = 'gene' + ) for(option in Filter(function(x) x$req, list( list( @@ -2363,10 +2380,9 @@ ReadXenium <- function( intersect( names(meta), c( - 'run_name', 'run_start_time', 'region_name', - 'preservation_method', 'panel_name', 'panel_organism', - 'panel_tissue_type', 'instrument_sw_version', - 'segmentation_stain' + 'run_start_time', 'preservation_method', 'panel_name', + 'panel_organism', 'panel_tissue_type', + 'instrument_sw_version', 'segmentation_stain' ) ) ] diff --git a/vignettes/seurat5_spatial_vignette_2.Rmd b/vignettes/seurat5_spatial_vignette_2.Rmd index 88395262f..d63f6b4cf 100644 --- a/vignettes/seurat5_spatial_vignette_2.Rmd +++ b/vignettes/seurat5_spatial_vignette_2.Rmd @@ -218,23 +218,87 @@ ImageDimPlot(vizgen.obj, fov = "hippo", molecules = rownames(markers.14)[1:4], c This dataset is a preview of the Xenium multimodal cell segmentation solution using a development version of the assay user guide and analysis software. It uses the [Xenium Human Multi-Tissue and Cancer Panel](https://www.10xgenomics.com/support/in-situ-gene-expression/documentation/steps/panel-design/pre-designed-xenium-gene-expression-panels) (377 genes) which was pre-designed by 10x Genomics. In this vignette, we will demonstrate how to load Xenium data for analysis and visualization using Seurat and, in particular, how to parse and visualize cell metadata. Note that this vignette requires the use of some optional dependencies in Seurat, namely `stars`, `jsonlite` and `gmp` in order to read data from `.zarr` files. -This uses the full Xenium output bundle available from the [FFPE Human Lung Cancer with Xenium Multimodal Cell Segmentation Preview Data](https://www.10xgenomics.com/datasets/preview-data-ffpe-human-lung-cancer-with-xenium-multimodal-cell-segmentation-1-standard) page, which can be downloaded as described below. +This uses the full Xenium output bundle available from the [FFPE Human Lung Cancer with Xenium Multimodal Cell Segmentation Preview Data](https://www.10xgenomics.com/datasets/preview-data-ffpe-human-lung-cancer-with-xenium-multimodal-cell-segmentation-1-standard) page, which can be downloaded as described below (note that this file is \~7 GB). ```{bash, eval=FALSE} wget https://cf.10xgenomics.com/samples/xenium/2.0.0/Xenium_V1_humanLung_Cancer_FFPE/Xenium_V1_humanLung_Cancer_FFPE_outs.zip unzip Xenium_V1_humanLung_Cancer_FFPE_outs.zip ``` -We will first load in the dataset and create the Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. Note that this dataset is moderate size. Installing `arrow` will permit you to load the data from Parquet files, which is much more efficient than from csv. +We will first load in the dataset and create the Seurat object. We will flip the x/y coordinates for more convenient plotting. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. Installing `arrow` will permit you to load the data from Parquet files, which is much more efficient than from csv. + +By default, the subcellular coordinates of each Q20 transcript will be loaded, as well as the cell centroids, which can commonly take up more than 1 GB of RAM. ```{r, results='hide'} -path <- "~/yard/data/Xenium_V1_humanLung_Cancer_FFPE_outs" -# Load the Xenium data -xenium.obj <- LoadXenium(path, fov = "fov") +path <- "/brahms/hartmana/vignette_data/Xenium_V1_humanLung_Cancer_FFPE_outs" +# Load the Xenium data, including cell segmentations +xenium.obj <- LoadXenium(path, fov = "fov", segmentations = "cell", flip.xy = T) # remove cells with 0 counts xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) ``` +This dataset uses Xenium multimodal segmentation, which involves custom deep learning models trained on Xenium In Situ data. After nuclei segmentation with DAPI, the algorithm uses three methods to segment cells. The segmentation results for each cell are prioritized in this order: + +1. **Cell boundary stain:** This is the most reliable method. Antibodies target epithelial markers (CD45) and immune markers (pan-lymphocyte: ATP1A1, E-Cadherin). It can split nuclei and define cells missing a nucleus. Nuclei that overlap with anucleate cells are assigned to the cell + +2. **Expansion from the nucleus to the cell interior stain edge:** This method requires both segmented nuclei and the interior stain (18S rRNA marker) + +3. **Nuclear expansion:** For cases where cells that do not have boundary or interior stains, segment cells with a nuclear (DAPI) expansion distance of 5 µm or until another cell boundary is encountered + +We can directly visualize cells which were segmented according to each method. + +```{r} +ImageDimPlot(xenium.obj, fov = "fov", dark.background = F, group.by = "segmentation_method", cols = c('#ffabc3', '#a9a900', '#a9ceff')) +``` + +It is also possible to load and visualize the unsupervised cluster annotations computed by the Xenium Onboard Analysis pipeline, which are stored in the `analysis` folder of an output bundle. + +```{r} +where <- tempdir() +untar(file.path(data.dir, 'analysis.tar.gz'), exdir = where) + +graph_clusters <- read.csv(file.path(where, 'analysis', 'clustering', 'gene_expression_graphclust', 'clusters.csv'), row.names = 'Barcode') + +# Store the graph-based clusters in the metadata +xenium.obj <- AddMetaData(xenium.obj, graph_clusters) + +ImageDimPlot(xenium.obj, fov = "fov", dark.background = F, group.by = "Cluster") +``` + +Differential expression results from Xenium Onboard Analysis can also be loaded in a similar fashion. + +```{r} +diff_exp <- read.csv(file.path(where, 'analysis', 'diffexp', 'gene_expression_graphclust', 'differential_expression.csv')) + +diff_exp <- melt(diff_exp, id.vars = c("Feature.ID", "Feature.Name")) + +colnames(diff_exp)[1:2] <- c('ensembl_id', 'gene_name') +diff_exp$cluster <- unlist(lapply(strsplit(as.character(diff_exp$variable), '.', fixed = T), '[[', 2)) +diff_exp$measure <- factor(gsub('Cluster\\.\\d+\\.', '', as.character(diff_exp$variable)), c('Mean.Counts', 'Log2.fold.change', 'Adjusted.p.value'), c('mean_count', 'log2_fc', 'p_adj')) +diff_exp$variable <- NULL + +diff_exp <- dcast(diff_exp, ensembl_id + gene_name + cluster ~ measure) + +significant_de <- subset(diff_exp, p_adj <= 0.05) +significant_de <- significant_de[order(significant_de$mean_count, decreasing = T), ] +significant_de[!duplicated(significant_de$cluster), ] +``` + +We will zoom in to visualize cell segmentations and expression of a select few marker genes. + +```{r} +cropped.coords <- Crop(xenium.obj[["fov"]], x = c(6700, 7400), y = c(1500, 2000), coords = "plot") +xenium.obj[["zoom"]] <- cropped.coords +# visualize cropped area with cell segmentations & selected molecules +DefaultBoundary(xenium.obj[["zoom"]]) <- "segmentation" +ImageDimPlot(xenium.obj, fov = "zoom", group.by = "Cluster", + axes = TRUE, border.color = "white", border.size = 0.1, + cols = "polychrome", coord.fixed = FALSE, + molecules = c("SNTN", "MALL", "MS4A1", "IL7R", "CYP2B6"), nmols = 10000, mols.cols = RColorBrewer::brewer.pal(5, "Set3"), alpha = 0.4) +``` + +Lots of valuable data is output directly in each run, allowing for rapid interrogation of the biology. In the following vignette, we will see how we can use standard Seurat workflows to do more involved secondary analysis on Xenium data. + # Mouse Brain: 10x Genomics Xenium In Situ In this section we'll analyze data produced by the Xenium platform. The vignette demonstrates how to load the per-transcript location data, cell x gene matrix, cell segmentation, and cell centroid information available in the Xenium outputs. The resulting Seurat object will contain the gene expression profile of each cell, the centroid and boundary of each cell, and the location of each individual detected transcript. The per-cell gene expression profiles are similar to standard single-cell RNA-seq and can be analyzed using the same tools. @@ -246,12 +310,12 @@ wget https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_Co unzip Xenium_V1_FF_Mouse_Brain_Coronal_Subset_CTX_HP_outs.zip ``` -First we read in the dataset and create a Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. This step should take about a minute. +First we read in the dataset and create a Seurat object. Provide the path to the data folder for a Xenium run as the input path. The RNA data is stored in the `Xenium` assay of the Seurat object. This step should take about a minute (you can improve this by installing `arrow` and `hdf5r`). ```{r load.xenium, results='hide'} path <- "/brahms/hartmana/vignette_data/xenium_tiny_subset" # Load the Xenium data -xenium.obj <- LoadXenium(path, fov = "fov") +xenium.obj <- LoadXenium(path, fov = "fov", segmentations = "cell") # remove cells with 0 counts xenium.obj <- subset(xenium.obj, subset = nCount_Xenium > 0) ``` From e0ad8f1d3e8c60235e40b10b7c99a422fe7ad391 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Mon, 18 Mar 2024 16:13:31 -0700 Subject: [PATCH 05/15] cleaner errors for missing files/arrow --- R/preprocessing.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/R/preprocessing.R b/R/preprocessing.R index 4f6171830..0631eacac 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2173,6 +2173,10 @@ ReadXenium <- function( if(!inherits(matrix, "try-error")) { break } } + if(!exists('matrix') || inherits(matrix, "try-error")) { + stop("Xenium outputs were incomplete: missing cell_feature_matrix") + } + pmtx(type = "finish") matrix }, @@ -2254,6 +2258,10 @@ ReadXenium <- function( if(!inherits(cell_info, "try-error")) { break } } + if(!exists('cell_info') || inherits(cell_info, "try-error")) { + stop("Xenium outputs were incomplete: missing cells") + } + cell_info <- cell_info[, names(col.use)] colnames(cell_info) <- col.use @@ -2286,6 +2294,10 @@ ReadXenium <- function( if(!inherits(cell_boundaries_df, "try-error")) { break } } + if(!exists('cell_boundaries_df') || inherits(cell_boundaries_df, "try-error")) { + stop("Xenium outputs were incomplete: missing cell_boundaries") + } + colnames(cell_boundaries_df) <- c( 'cell', letters[24 + flip.xy], @@ -2321,6 +2333,10 @@ ReadXenium <- function( if(!inherits(nucleus_boundaries_df, "try-error")) { break } } + if(!exists('cell_info') || inherits(cell_info, "try-error")) { + stop("Xenium outputs were incomplete: missing nucleus_boundaries") + } + colnames(nucleus_boundaries_df) <- c( 'cell', letters[24 + flip.xy], @@ -2362,6 +2378,15 @@ ReadXenium <- function( if(!inherits(transcripts, "try-error")) { break } } + if(!exists('transcripts') || inherits(transcripts, "try-error")) { + hint <- "" + if(file.exists(file.path(data.dir, "transcripts.parquet"))) { + hint <- " Xenium outputs no longer include `transcripts.csv.gz`. Instead, please install `arrow` to read transcripts.parquet" + } + + stop(paste0("Xenium outputs were incomplete: missing transcripts.", hint)) + } + transcripts <- transcripts[, names(col.use)] colnames(transcripts) <- col.use From ae0616936142f05a82aabeaed66bee71d67aa046 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Thu, 2 May 2024 12:03:20 -0700 Subject: [PATCH 06/15] doc: update some documentation (#7) --- R/convenience.R | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/R/convenience.R b/R/convenience.R index 0170fad31..d32d732e8 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -172,7 +172,7 @@ LoadVizgen <- function(data.dir, fov, assay = 'Vizgen', z = 3L) { #' @return \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object #' -#' @param data.dir Path to folder containing Nanostring SMI outputs +#' @param data.dir Path to folder containing Xenium outputs #' @param fov FOV name #' @param assay Assay name #' @param mols.qv.threshold Remove transcript molecules with @@ -182,6 +182,8 @@ LoadVizgen <- function(data.dir, fov, assay = 'Vizgen', z = 3L) { #' @param molecule.coordinates Whether or not to load molecule pixel coordinates #' @param segmentations One of "cell", "nucleus" or NULL (to load either cell #' segmentations, nucleus segmentations or neither) +#' @param flip.xy Whether or not to flip the x/y coordinates of the Xenium outputs +#' to match what is displayed in Xenium Explorer, or fit on your screen better. #' #' @importFrom SeuratObject Cells CreateCentroids CreateFOV #' CreateSegmentation CreateSeuratObject CreateMolecules @@ -203,13 +205,13 @@ LoadXenium <- function( if(!is.null(segmentations) && !(segmentations %in% c('nucleus', 'cell'))) { stop('segmentations must be NULL or one of "nucleus", "cell"') } - + if(!cell.centroids && is.null(segmentations)) { stop( "Must load either centroids or cell/nucleus segmentations" ) } - + data <- ReadXenium( data.dir = data.dir, type = c("centroids", "segmentations", "nucleus_segmentations")[ @@ -221,7 +223,7 @@ LoadXenium <- function( mols.qv.threshold = mols.qv.threshold, flip.xy = flip.xy ) - + segmentations <- intersect(c("segmentations", "nucleus_segmentations"), names(data)) segmentations.data <- Filter(Negate(is.null), list( @@ -238,7 +240,7 @@ LoadXenium <- function( NULL } )) - + coords <- if(length(segmentations.data) > 0) { CreateFOV( segmentations.data, @@ -252,7 +254,7 @@ LoadXenium <- function( } else { NULL } - + slot.map <- c( `Blank Codeword` = 'BlankCodeword', `Unassigned Codeword` = 'BlankCodeword', @@ -261,15 +263,15 @@ LoadXenium <- function( ) xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) - + if(!is.null(data$metadata)) { Misc(xenium.obj, 'run_metadata') <- data$metadata } - + if(!is.null(data$segmentation_method)) { xenium.obj <- AddMetaData(xenium.obj, data$segmentation_method) } - + for(name in intersect(names(slot.map), names(data$matrix))) { xenium.obj[[slot.map[name]]] <- CreateAssayObject(counts = data$matrix[[name]]) } From a9ca7f0f60b15e06448e53d9e81edfdf0d53a3ea Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Mon, 24 Jun 2024 09:44:54 -0700 Subject: [PATCH 07/15] fix: load genomic controls when present --- R/convenience.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/convenience.R b/R/convenience.R index d32d732e8..c402cfc6f 100644 --- a/R/convenience.R +++ b/R/convenience.R @@ -259,7 +259,8 @@ LoadXenium <- function( `Blank Codeword` = 'BlankCodeword', `Unassigned Codeword` = 'BlankCodeword', `Negative Control Codeword` = 'ControlCodeword', - `Negative Control Probe` = 'ControlProbe' + `Negative Control Probe` = 'ControlProbe', + `Genomic Control` = 'GenomicControl' ) xenium.obj <- CreateSeuratObject(counts = data$matrix[["Gene Expression"]], assay = assay) From 41adcb48fc7e13a6359a98def41c726863d1ea0c Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Mon, 16 Sep 2024 14:59:25 -0700 Subject: [PATCH 08/15] Typo --- R/preprocessing.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 4a311d9ae..26203b185 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2575,7 +2575,7 @@ ReadXenium <- function( }, stop("Unknown Xenium input type: ", otype) ) - }, SIMPLIFY = FALSE, USE.NAMES = TRUE) + }, simplify = FALSE, USE.NAMES = TRUE) metadata <- file.path(data.dir, "experiment.xenium") if(file.exists(metadata) && requireNamespace("jsonlite", quietly = TRUE)) { From 23a20374a849513982fd85c58ff363b089f56d6d Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Wed, 18 Sep 2024 10:28:54 -0700 Subject: [PATCH 09/15] fix: copypasta on nucleus_boundaries --- R/preprocessing.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 26203b185..9c91cb4f2 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2512,7 +2512,7 @@ ReadXenium <- function( if(!inherits(nucleus_boundaries_df, "try-error")) { break } } - if(!exists('cell_info') || inherits(cell_info, "try-error")) { + if(!exists('nucleus_boundaries_df') || inherits(nucleus_boundaries_df, "try-error")) { stop("Xenium outputs were incomplete: missing nucleus_boundaries") } From 7454eb0488c1c352fa992360fda0c819322f39a9 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Wed, 18 Sep 2024 10:43:47 -0700 Subject: [PATCH 10/15] fix: don't load segmentation method on old datasets --- R/preprocessing.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 9c91cb4f2..899e6254c 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2376,12 +2376,18 @@ ReadXenium <- function( tempdir <- path.expand(tempdir()) unzip(file.path(data.dir, "cells.zarr.zip"), exdir = tempdir) zattr <- jsonlite::read_json(file.path(tempdir, '.zattrs')) + + # Segmentation method only available in datasets versioned 6.0+ + if(zattr$major_version < 6) { + return(NULL) + } + which_entry <- which(unlist(zattr$polygon_set_names) == 'cell') - indices <- stars::read_mdim(file.path(tempdir, 'polygon_sets', which_entry - 1, 'cell_index'))$cell_index + 1 + indices <- stars::read_mdim(file.path(tempdir, "polygon_sets", which_entry - 1, "cell_index"))$cell_index + 1 indices[is.na(indices)] <- 1 - ids <- stars::read_mdim(file.path(tempdir, 'cell_id'))$cell_id + ids <- stars::read_mdim(file.path(tempdir, "cell_id"))$cell_id ids[is.na(ids)] <- 0 ids <- paste0( From fb0435499317929cc625020aaab5fe5dc5e2028a Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Wed, 18 Sep 2024 11:02:32 -0700 Subject: [PATCH 11/15] fix: read cells.parquet or csv.gz instead of zarr.zip --- R/preprocessing.R | 80 ++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 899e6254c..1692097ad 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2360,56 +2360,44 @@ ReadXenium <- function( matrix }, 'segmentation_method' = { - if(!requireNamespace("stars", quietly = TRUE) || !requireNamespace("jsonlite", quietly = TRUE) || !requireNamespace("gmp", quietly = TRUE)) { - warning("Reading segmentation_method requires the `stars`, `gmp` and `jsonlite` packages") - return(NULL) + pcents <- progressor() + pcents( + message = 'Loading cell metadata', + class = 'sticky', + amount = 0 + ) + + col.use <- c( + cell_id = 'cell', + segmentation_method = 'segmentation_method' + ) + + for(option in Filter(function(x) x$req, list( + list( + filename = "cells.parquet", + fn = function(x) as.data.frame(arrow::read_parquet(x, col_select = names(col.use))), + req = has_arrow + ), + list( + filename = "cells.csv.gz", + fn = function(x) data.table::fread(x, data.table = FALSE, stringsAsFactors = FALSE, select = names(col.use)), + req = has_dt + ), + list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) + ))) { + cell_info <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(cell_info, "try-error")) { break } } - if(file.exists(file.path(data.dir, "cells.zarr.zip"))) { - pcents <- progressor() - pcents( - message = 'Loading cell metadata', - class = 'sticky', - amount = 0 - ) - - tempdir <- path.expand(tempdir()) - unzip(file.path(data.dir, "cells.zarr.zip"), exdir = tempdir) - zattr <- jsonlite::read_json(file.path(tempdir, '.zattrs')) - - # Segmentation method only available in datasets versioned 6.0+ - if(zattr$major_version < 6) { - return(NULL) - } - - which_entry <- which(unlist(zattr$polygon_set_names) == 'cell') - - indices <- stars::read_mdim(file.path(tempdir, "polygon_sets", which_entry - 1, "cell_index"))$cell_index + 1 - indices[is.na(indices)] <- 1 - - ids <- stars::read_mdim(file.path(tempdir, "cell_id"))$cell_id - ids[is.na(ids)] <- 0 - - ids <- paste0( - gsub(' ', 'a', sprintf('%8s', sapply( - strsplit(as.character(gmp::as.bigz(ids[1,]), 16), ''), - function(id) { - rawToChar(as.raw(sapply(id, function(x) { - as.numeric(charToRaw(x)) + - ifelse(is.na(suppressWarnings(as.numeric(x))), 10, 49) - }))) - } - ))), '-', ids[2,]) - - method <- stars::read_mdim(file.path(tempdir, 'polygon_sets', which_entry - 1, 'method'))$method + 1 - method[is.na(method)] <- 1 + if(!exists('cell_info') || inherits(cell_info, "try-error")) { + NULL + } else { + cell_info <- cell_info[, names(col.use)] + colnames(cell_info) <- col.use - segmentation_method <- unlist(zattr$segmentation_methods)[method] + pcents(type = 'finish') - pcents(type = "finish") - data.frame(segmentation_method = segmentation_method, row.names = ids) - } else { - NULL + data.frame(segmentation_method = cell_info$segmentation_method, row.names = cell_info$cell_id) } }, 'centroids' = { From efcb50785833cd572c08c014762d2d7e153dd938 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Mon, 23 Sep 2024 16:41:02 -0700 Subject: [PATCH 12/15] fix: parse arrow binary type --- R/preprocessing.R | 130 +++++++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 54 deletions(-) diff --git a/R/preprocessing.R b/R/preprocessing.R index 1692097ad..40a8c6144 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -503,7 +503,7 @@ GetResidual <- function( #' tissue #' @param to.upper Converts all feature names to upper case. Can be useful when #' analyses require comparisons between human and mouse gene names for example. -#' @param image \code{VisiumV1}/\code{VisiumV2} instance(s) - if a vector is +#' @param image \code{VisiumV1}/\code{VisiumV2} instance(s) - if a vector is #' passed in it should be co-indexed with \code{`bin.size`} #' @param ... Arguments passed to \code{\link{Read10X_h5}} #' @@ -658,7 +658,7 @@ Load10X_Spatial <- function ( #' #' @export #' @concept preprocessing -#' +#' Read10X_probe_metadata <- function( data.dir, filename = 'raw_probe_bc_matrix.h5' @@ -1210,7 +1210,7 @@ Read10X_h5 <- function(filename, use.names = TRUE, unique.features = TRUE) { #' \code{scalefactors_json.json} and \code{tissue_positions_list.csv} #' @param image.name PNG file to read in #' @param assay Name of associated assay -#' @param slice Name for the image, used to populate the instance's key +#' @param slice Name for the image, used to populate the instance's key #' @param filter.matrix Filter spot/feature matrix to only include spots that #' have been determined to be over tissue #' @@ -1283,29 +1283,29 @@ Read10X_Image <- function( Read10X_Coordinates <- function(filename, filter.matrix) { # output columns names col.names <- c("barcodes", "tissue", "row", "col", "imagerow", "imagecol") - + # if the coordinate mappings are in a parquet file if (tools::file_ext(filename) == "parquet") { # `arrow` must be installed to read parquet files if (!requireNamespace("arrow", quietly = TRUE)) { stop("Please install arrow to read parquet files") } - + # read in coordinates and conver the resulting tibble into a data.frame coordinates <- as.data.frame(arrow::read_parquet(filename)) # normalize column names for consistency with other datatypes input.col.names <- c( - "barcode", - "in_tissue", - "array_row", - "array_col", - "pxl_row_in_fullres", + "barcode", + "in_tissue", + "array_row", + "array_col", + "pxl_row_in_fullres", "pxl_col_in_fullres" ) col.map <- stats::setNames(col.names, input.col.names) colnames(coordinates) <- ifelse( - colnames(coordinates) %in% names(col.map), - col.map[colnames(coordinates)], + colnames(coordinates) %in% names(col.map), + col.map[colnames(coordinates)], colnames(coordinates) ) @@ -2337,13 +2337,25 @@ ReadXenium <- function( has_arrow <- requireNamespace("arrow", quietly = TRUE) has_hdf5r <- requireNamespace("hdf5r", quietly = TRUE) + binary_to_string <- function(arrow_binary) { + if(typeof(arrow_binary) == 'list') { + unlist( + lapply( + arrow_binary, function(x) rawToChar(as.raw(strtoi(x, 16L))) + ) + ) + } else { + arrow_binary + } + } + data <- sapply(outs, function(otype) { switch( EXPR = otype, 'matrix' = { pmtx <- progressor() pmtx(message = 'Reading counts matrix', class = 'sticky', amount = 0) - + for(option in Filter(function(x) x$req, list( list(filename = "cell_feature_matrix.h5", fn = Read10X_h5, req = has_hdf5r), list(filename = "cell_feature_matrix", fn = Read10X, req = TRUE) @@ -2351,27 +2363,27 @@ ReadXenium <- function( matrix <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) if(!inherits(matrix, "try-error")) { break } } - + if(!exists('matrix') || inherits(matrix, "try-error")) { stop("Xenium outputs were incomplete: missing cell_feature_matrix") } - + pmtx(type = "finish") matrix }, 'segmentation_method' = { - pcents <- progressor() - pcents( + psegs <- progressor() + psegs( message = 'Loading cell metadata', class = 'sticky', amount = 0 ) - + col.use <- c( cell_id = 'cell', segmentation_method = 'segmentation_method' ) - + for(option in Filter(function(x) x$req, list( list( filename = "cells.parquet", @@ -2385,19 +2397,22 @@ ReadXenium <- function( ), list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) ))) { - cell_info <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) - if(!inherits(cell_info, "try-error")) { break } + cell_seg <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + if(!inherits(cell_seg, "try-error")) { break } } - - if(!exists('cell_info') || inherits(cell_info, "try-error")) { + + if(!exists('cell_seg') || inherits(cell_seg, "try-error") || length(intersect(names(col.use), colnames(cell_seg))) != 2) { + warning('cells did not contain a segmentation_method column. Skipping...') NULL } else { - cell_info <- cell_info[, names(col.use)] - colnames(cell_info) <- col.use - - pcents(type = 'finish') - - data.frame(segmentation_method = cell_info$segmentation_method, row.names = cell_info$cell_id) + cell_seg <- cell_seg[, names(col.use)] + colnames(cell_seg) <- col.use + + cell_seg$cell <- binary_to_string(cell_seg$cell) + + psegs(type = 'finish') + + data.frame(segmentation_method = cell_seg$segmentation_method, row.names = cell_seg$cell) } }, 'centroids' = { @@ -2407,13 +2422,13 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + col.use <- c( x_centroid = letters[24 + flip.xy], y_centroid = letters[25 - flip.xy], cell_id = 'cell' ) - + for(option in Filter(function(x) x$req, list( list( filename = "cells.parquet", @@ -2430,16 +2445,18 @@ ReadXenium <- function( cell_info <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) if(!inherits(cell_info, "try-error")) { break } } - + if(!exists('cell_info') || inherits(cell_info, "try-error")) { stop("Xenium outputs were incomplete: missing cells") } - + + cell_info$cell_id <- binary_to_string(cell_info$cell_id) + cell_info <- cell_info[, names(col.use)] colnames(cell_info) <- col.use - + pcents(type = 'finish') - + cell_info }, 'segmentations' = { @@ -2449,7 +2466,7 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + for(option in Filter(function(x) x$req, list( list( filename = "cell_boundaries.parquet", @@ -2466,19 +2483,21 @@ ReadXenium <- function( cell_boundaries_df <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) if(!inherits(cell_boundaries_df, "try-error")) { break } } - + if(!exists('cell_boundaries_df') || inherits(cell_boundaries_df, "try-error")) { stop("Xenium outputs were incomplete: missing cell_boundaries") } - + colnames(cell_boundaries_df) <- c( 'cell', letters[24 + flip.xy], letters[25 - flip.xy] ) - + + cell_boundaries_df$cell <- binary_to_string(cell_boundaries_df$cell) + psegs(type = "finish") - + cell_boundaries_df }, 'nucleus_segmentations' = { @@ -2488,7 +2507,7 @@ ReadXenium <- function( class = 'sticky', amount = 0 ) - + for(option in Filter(function(x) x$req, list( list( filename = "nucleus_boundaries.parquet", @@ -2505,19 +2524,20 @@ ReadXenium <- function( nucleus_boundaries_df <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) if(!inherits(nucleus_boundaries_df, "try-error")) { break } } - + if(!exists('nucleus_boundaries_df') || inherits(nucleus_boundaries_df, "try-error")) { stop("Xenium outputs were incomplete: missing nucleus_boundaries") } - + colnames(nucleus_boundaries_df) <- c( 'cell', letters[24 + flip.xy], letters[25 - flip.xy] ) - + nucleus_boundaries_df$cell <- binary_to_string(nucleus_boundaries_df$cell) + psegs(type = "finish") - + nucleus_boundaries_df }, 'microns' = { @@ -2533,7 +2553,7 @@ ReadXenium <- function( y_location = letters[25-flip.xy], feature_name = 'gene' ) - + for(option in Filter(function(x) x$req, list( list( filename = "transcripts.parquet", @@ -2550,27 +2570,29 @@ ReadXenium <- function( transcripts <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) if(!inherits(transcripts, "try-error")) { break } } - + if(!exists('transcripts') || inherits(transcripts, "try-error")) { hint <- "" if(file.exists(file.path(data.dir, "transcripts.parquet"))) { - hint <- " Xenium outputs no longer include `transcripts.csv.gz`. Instead, please install `arrow` to read transcripts.parquet" + hint <- ". Xenium outputs no longer include `transcripts.csv.gz`. Instead, please install `arrow` to read transcripts.parquet" } - - stop(paste0("Xenium outputs were incomplete: missing transcripts.", hint)) + + stop(paste0("Xenium outputs were incomplete: missing transcripts", hint)) } - + transcripts <- transcripts[, names(col.use)] colnames(transcripts) <- col.use - + + transcripts$gene <- binary_to_string(transcripts$gene) + pmicrons(type = 'finish') - + transcripts }, stop("Unknown Xenium input type: ", otype) ) }, simplify = FALSE, USE.NAMES = TRUE) - + metadata <- file.path(data.dir, "experiment.xenium") if(file.exists(metadata) && requireNamespace("jsonlite", quietly = TRUE)) { meta <- jsonlite::read_json(metadata) From d362ba023767c8b5338140945de8dc8b83eb9753 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Tue, 24 Sep 2024 11:01:32 -0700 Subject: [PATCH 13/15] docs: update vignette --- vignettes/seurat5_spatial_vignette_2.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/seurat5_spatial_vignette_2.Rmd b/vignettes/seurat5_spatial_vignette_2.Rmd index d63f6b4cf..d334aaec3 100644 --- a/vignettes/seurat5_spatial_vignette_2.Rmd +++ b/vignettes/seurat5_spatial_vignette_2.Rmd @@ -216,7 +216,7 @@ ImageDimPlot(vizgen.obj, fov = "hippo", molecules = rownames(markers.14)[1:4], c # Human Lung: 10x Genomics Xenium In Situ -This dataset is a preview of the Xenium multimodal cell segmentation solution using a development version of the assay user guide and analysis software. It uses the [Xenium Human Multi-Tissue and Cancer Panel](https://www.10xgenomics.com/support/in-situ-gene-expression/documentation/steps/panel-design/pre-designed-xenium-gene-expression-panels) (377 genes) which was pre-designed by 10x Genomics. In this vignette, we will demonstrate how to load Xenium data for analysis and visualization using Seurat and, in particular, how to parse and visualize cell metadata. Note that this vignette requires the use of some optional dependencies in Seurat, namely `stars`, `jsonlite` and `gmp` in order to read data from `.zarr` files. +This dataset is a preview of the Xenium multimodal cell segmentation solution using a development version of the assay user guide and analysis software. It uses the [Xenium Human Multi-Tissue and Cancer Panel](https://www.10xgenomics.com/support/in-situ-gene-expression/documentation/steps/panel-design/pre-designed-xenium-gene-expression-panels) (377 genes) which was pre-designed by 10x Genomics. In this vignette, we will demonstrate how to load Xenium data for analysis and visualization using Seurat and, in particular, how to parse and visualize cell metadata. This uses the full Xenium output bundle available from the [FFPE Human Lung Cancer with Xenium Multimodal Cell Segmentation Preview Data](https://www.10xgenomics.com/datasets/preview-data-ffpe-human-lung-cancer-with-xenium-multimodal-cell-segmentation-1-standard) page, which can be downloaded as described below (note that this file is \~7 GB). From 0c536c79ae9d700a14558a32259aeb86beb8b140 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Wed, 25 Sep 2024 15:47:49 -0700 Subject: [PATCH 14/15] doc: update NEWS and roxygenize --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 6 ++++++ man/Load10X_Spatial.Rd | 2 +- man/ReadXenium.Rd | 37 ++++++++++++++++++++++++++++++------- 5 files changed, 39 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ddaf72db8..c56a92a1d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -114,7 +114,7 @@ Collate: 'sketching.R' 'tree.R' 'utilities.R' -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Encoding: UTF-8 Suggests: ape, diff --git a/NAMESPACE b/NAMESPACE index 005325e25..3f4704f61 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -546,6 +546,7 @@ importFrom(SeuratObject,CreateAssayObject) importFrom(SeuratObject,CreateCentroids) importFrom(SeuratObject,CreateDimReducObject) importFrom(SeuratObject,CreateFOV) +importFrom(SeuratObject,CreateMolecules) importFrom(SeuratObject,CreateSegmentation) importFrom(SeuratObject,CreateSeuratObject) importFrom(SeuratObject,DefaultAssay) diff --git a/NEWS.md b/NEWS.md index d3b17bb2e..2da9bba79 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,12 @@ # Unreleased ## Changes +- Surfaced more fine-grained control over what parts of a Xenium experiment are loaded in `LoadXenium` +- Added ability to load Xenium nucleus segmentation masks +- Updated `LoadXenium` to also read some run metadata (run start time, preservation method, panel used, organism, tissue type, instrument software version and stain kit used) into `misc` slot +- Updated `ReadXenium` to load cell_feature_matrix.h5 when present in favor of the MEX format files +- Added ability to read Xenium `segmentation_method` directly into `meta.data` +- Updated `ReadXenium` to load .parquet files using `arrow` instead of .csv.gz files to support XOA 3.0 - Fixed `LoadXenium` to accommodate datasets without "Blank Codeword" or "Unassigned Codeword" matrices - Fixed `ReadXenium` to properly parse multiple molecular outputs at once ([#8265](https://github.com/satijalab/seurat/issues/8265)) - Fixed `RunPCA` to avoid converting `BPCells` matrices into dense matrices - significantly reduces the function's memory usage when running on `BPCells` matrices diff --git a/man/Load10X_Spatial.Rd b/man/Load10X_Spatial.Rd index dd7a031b0..9bb2bbb5d 100644 --- a/man/Load10X_Spatial.Rd +++ b/man/Load10X_Spatial.Rd @@ -34,7 +34,7 @@ tissue} \item{to.upper}{Converts all feature names to upper case. Can be useful when analyses require comparisons between human and mouse gene names for example.} -\item{image}{\code{VisiumV1}/\code{VisiumV2} instance(s) - if a vector is +\item{image}{\code{VisiumV1}/\code{VisiumV2} instance(s) - if a vector is passed in it should be co-indexed with \code{`bin.size`}} \item{...}{Arguments passed to \code{\link{Read10X_h5}}} diff --git a/man/ReadXenium.Rd b/man/ReadXenium.Rd index df86aa864..dbe85e398 100644 --- a/man/ReadXenium.Rd +++ b/man/ReadXenium.Rd @@ -5,13 +5,23 @@ \alias{ReadXenium} \title{Read and Load 10x Genomics Xenium in-situ data} \usage{ -LoadXenium(data.dir, fov = "fov", assay = "Xenium") +LoadXenium( + data.dir, + fov = "fov", + assay = "Xenium", + mols.qv.threshold = 20, + cell.centroids = TRUE, + molecule.coordinates = TRUE, + segmentations = NULL, + flip.xy = FALSE +) ReadXenium( data.dir, - outs = c("matrix", "microns"), + outs = c("segmentation_method", "matrix", "microns"), type = "centroids", - mols.qv.threshold = 20 + mols.qv.threshold = 20, + flip.xy = F ) } \arguments{ @@ -22,10 +32,26 @@ default filenames} \item{assay}{Assay name} +\item{mols.qv.threshold}{Remove transcript molecules with +a QV less than this threshold. QV >= 20 is the standard threshold +used to construct the cell x gene count matrix.} + +\item{cell.centroids}{Whether or not to load cell centroids} + +\item{molecule.coordinates}{Whether or not to load molecule pixel coordinates} + +\item{segmentations}{One of "cell", "nucleus" or NULL (to load either cell +segmentations, nucleus segmentations or neither)} + +\item{flip.xy}{Whether or not to flip the x/y coordinates of the Xenium outputs +to match what is displayed in Xenium Explorer, or fit on your screen better.} + \item{outs}{Types of molecular outputs to read; choose one or more of: \itemize{ \item \dQuote{matrix}: the counts matrix \item \dQuote{microns}: molecule coordinates + \item \dQuote{segmentation_method}: cell segmentation method (for runs which + use multi-modal segmentation) }} \item{type}{Type of cell spatial coordinate matrices to read; choose one @@ -33,11 +59,8 @@ or more of: \itemize{ \item \dQuote{centroids}: cell centroids in pixel coordinate space \item \dQuote{segmentations}: cell segmentations in pixel coordinate space + \item \dQuote{nucleus_segmentations}: nucleus segmentations in pixel coordinate space }} - -\item{mols.qv.threshold}{Remove transcript molecules with -a QV less than this threshold. QV >= 20 is the standard threshold -used to construct the cell x gene count matrix.} } \value{ \code{LoadXenium}: A \code{\link[SeuratObject]{Seurat}} object From db8bd41ea83f53c155a8376a098cdac94d94b5e5 Mon Sep 17 00:00:00 2001 From: Jordan Sicherman Date: Sun, 29 Sep 2024 15:40:29 -0700 Subject: [PATCH 15/15] fix: better error handling --- DESCRIPTION | 4 ++-- R/preprocessing.R | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e1e441a1f..f3a885a91 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: Seurat -Version: 5.1.0.9005 -Date: 2024-09-12 +Version: 5.1.0.9006 +Date: 2024-09-29 Title: Tools for Single Cell Genomics Description: A toolkit for quality control, analysis, and exploration of single cell RNA sequencing data. 'Seurat' aims to enable users to identify and interpret sources of heterogeneity from single cell transcriptomic measurements, and to integrate diverse types of single cell data. See Satija R, Farrell J, Gennert D, et al (2015) , Macosko E, Basu A, Satija R, et al (2015) , Stuart T, Butler A, et al (2019) , and Hao, Hao, et al (2020) for more details. Authors@R: c( diff --git a/R/preprocessing.R b/R/preprocessing.R index 40a8c6144..8d7b01eef 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -2397,12 +2397,12 @@ ReadXenium <- function( ), list(filename = "cells.csv.gz", fn = function(x) read.csv(x, stringsAsFactors = FALSE), req = TRUE) ))) { - cell_seg <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename)))) + cell_seg <- try(suppressWarnings(option$fn(file.path(data.dir, option$filename))), silent = TRUE) if(!inherits(cell_seg, "try-error")) { break } } if(!exists('cell_seg') || inherits(cell_seg, "try-error") || length(intersect(names(col.use), colnames(cell_seg))) != 2) { - warning('cells did not contain a segmentation_method column. Skipping...') + warning('cells did not contain a segmentation_method column. Skipping...', call. = FALSE, immediate. = TRUE) NULL } else { cell_seg <- cell_seg[, names(col.use)]