From 4602846fa41ec41e801a86b3a232dd9eff65f55d Mon Sep 17 00:00:00 2001
From: Oren Ben-Kiki <oren@ben-kiki.org>
Date: Wed, 29 May 2024 08:18:07 +0300
Subject: [PATCH] Cleanup type annotations.

---
 docs/v0.1.0/.documenter-siteinfo.json |  2 +-
 docs/v0.1.0/anndata_format.html       |  2 +-
 docs/v0.1.0/search_index.js           |  2 +-
 docs/v0.1.0/spheres.html              |  1 +
 improved_spheres.auto.log             | 24 ++++++++++++++++++++++++
 src/anndata_format.jl                 | 10 +++++-----
 src/spheres.jl                        |  5 +++--
 7 files changed, 36 insertions(+), 10 deletions(-)
 create mode 100644 improved_spheres.auto.log
diff --git a/docs/v0.1.0/.documenter-siteinfo.json b/docs/v0.1.0/.documenter-siteinfo.json
index 1896974..21c5d9a 100644
--- a/docs/v0.1.0/.documenter-siteinfo.json
+++ b/docs/v0.1.0/.documenter-siteinfo.json
@@ -1 +1 @@
-{"documenter":{"julia_version":"1.10.0","generation_timestamp":"2024-05-23T13:18:35","documenter_version":"1.4.1"}}
\ No newline at end of file
+{"documenter":{"julia_version":"1.10.0","generation_timestamp":"2024-05-29T08:17:40","documenter_version":"1.4.1"}}
\ No newline at end of file
diff --git a/docs/v0.1.0/anndata_format.html b/docs/v0.1.0/anndata_format.html
index 38f8f2e..f2ed1b7 100644
--- a/docs/v0.1.0/anndata_format.html
+++ b/docs/v0.1.0/anndata_format.html
@@ -175,7 +175,7 @@ <h1 id="AnnData-Format">
     type_property::Maybe{AbstractString} = nothing,
     rename_type::Maybe{AbstractString} = &quot;type&quot;,
     type_colors_csv::Maybe{AbstractString} = nothing,
-    type_properties::Maybe{AbstractStringSet} = nothing,
+    type_properties::Maybe{AbstractSet{&lt;:AbstractString}} = nothing,
     properties_defaults::Maybe{Dict} = nothing,
 )::Nothing
 </code>
diff --git a/docs/v0.1.0/search_index.js b/docs/v0.1.0/search_index.js
index 8975ca3..558eae8 100644
--- a/docs/v0.1.0/search_index.js
+++ b/docs/v0.1.0/search_index.js
@@ -1,3 +1,3 @@
 var documenterSearchIndex = {"docs":
-[{"location":"identify_genes.html#Identify-Genes","page":"Identify Genes","title":"Identify Genes","text":"","category":"section"},{"location":"identify_genes.html","page":"Identify Genes","title":"Identify Genes","text":"Metacells.IdentifyGenes\nMetacells.IdentifyGenes.identify_marker_genes!\nMetacells.IdentifyGenes.identify_correlated_genes!","category":"page"},{"location":"identify_genes.html#Metacells.IdentifyGenes","page":"Identify Genes","title":"Metacells.IdentifyGenes","text":"Identify special genes.\n\n\n\n\n\n","category":"module"},{"location":"identify_genes.html#Metacells.IdentifyGenes.identify_marker_genes!","page":"Identify Genes","title":"Metacells.IdentifyGenes.identify_marker_genes!","text":"function identify_marker_genes!(\n    daf::DafWriter;\n    gene_fraction_regularization::AbstractFloat = 1e-5,\n    min_gene_range_fold::AbstractFloat = 2.0,\n    noisy_gene_fold::AbstractFloat = 1.0,\n    min_max_marker_gene_fraction::AbstractFloat = 1e-4,\n    overwrite::Bool = false,\n)::Nothing\n\nIdentify the genes that distinguish at least one metacell from the rest. Such genes are called \"marker\" genes as they (potentially) mark specific cell states. If overwrite, will overwrite an existing is_marker mask.\n\nCompute the minimal and maximal expression level of each gene.\nSelect the genes whose fold factor (log2 of maximal over minimal value, using the gene_fraction_regularization is at least min_marker_gene_range_fold. For is_noisy genes, we require an additional noisy_gene_fold.\nIdentify the genes whose maximal expression is at least min_max_marker_gene_fraction.\n\nCONTRACT\n\n\n\n\n\n","category":"function"},{"location":"identify_genes.html#Metacells.IdentifyGenes.identify_correlated_genes!","page":"Identify Genes","title":"Metacells.IdentifyGenes.identify_correlated_genes!","text":"function identify_correlated_genes!(\n    daf::DafWriter;\n    gene_fraction_regularization::AbstractFloat = 1e-5,\n    min_gene_correlation::AbstractFloat = 0.5,\n    overwrite::Bool = false,\n)::Nothing\n\nIdentify genes that are correlated with other gene(s). Such genes are good candidates for looking for groups of genes that act together. If overwrite, will overwrite an existing is_correlated mask.\n\nCompute the log base 2 of the genes expression in each metacell (using the gene_fraction_regularization).\nCorrelate this between all the pairs of genes.\nFind the maximal absolute correlation for each gene (that is, strong anti-correlation also counts).\nIdentify the genes which have at least one gene with a correlation of at least min_gene_correlation.\n\nCONTRACT\n\n\n\n\n\n","category":"function"},{"location":"identify_genes.html#Index","page":"Identify Genes","title":"Index","text":"","category":"section"},{"location":"identify_genes.html","page":"Identify Genes","title":"Identify Genes","text":"Pages = [\"identify_genes.md\"]","category":"page"},{"location":"anndata_format.html#AnnData-Format","page":"AnnData Format","title":"AnnData Format","text":"","category":"section"},{"location":"anndata_format.html","page":"AnnData Format","title":"AnnData Format","text":"Metacells.AnnDataFormat\nMetacells.AnnDataFormat.import_h5ads!\nMetacells.AnnDataFormat.CopyAnnData","category":"page"},{"location":"anndata_format.html#Metacells.AnnDataFormat","page":"AnnData Format","title":"Metacells.AnnDataFormat","text":"Import and export metacells data from/to h5ad files. This allows moving data between the old Python/C++ based AnnData world and the new Julia based Daf world.\n\n\n\n\n\n","category":"module"},{"location":"anndata_format.html#Metacells.AnnDataFormat.import_h5ads!","page":"AnnData Format","title":"Metacells.AnnDataFormat.import_h5ads!","text":"function import_h5ads!(\n    destination::DafWriter;\n    raw_cells_h5ad::Maybe{AbstractString} = nothing,\n    clean_cells_h5ad::AbstractString,\n    metacells_h5ad::AbstractString,\n    copy_clean_data::Maybe{CopyAnnData} = nothing,\n    type_property::Maybe{AbstractString} = nothing,\n    rename_type::Maybe{AbstractString} = \"type\",\n    type_colors_csv::Maybe{AbstractString} = nothing,\n    type_properties::Maybe{AbstractStringSet} = nothing,\n    properties_defaults::Maybe{Dict} = nothing,\n)::Nothing\n\nImport an AnnData based metacells dataset into a Daf destination data set. Ideally, the input must include clean_cells_h5ad and the metacells_h5ad computed for them, and optionally also the raw_cells_h5ad including the excluded cells and genes.\n\nIf type annotations were assigned to the metacells, then the name of the type_property should be specified. This can be further enhanced by specifying a type_colors_csv file mapping type names to colors. This should be a comma or tab separated file containing at least two columns, one named \"color\" and one with the same name as the type_property. For consistency, by default the type_property is renamed to the value of rename_type (by default, \"type\"). You can disable this by setting rename_type to nothing. We also call reconstruct_axis! to build the type axis; you can therefore specify an empty_type name, which will be converted to the empty string, to match the Daf convention of \"no value\" for string data, and specify an explicit set of type_properties (by default, any per-metacell property that has the same value for all metacells of each type will be converted to a type property) and properties_defaults.\n\nThis will mostly just read all the specified h5ad files and copy the data into the destination, with the following changes to match the Daf capabilities and conventions:\n\nThe X matrix of the cells is renamed to UMIs, and the X matrix of the metacells is renamed to fraction.\nMatrices and vectors of counts (UMIs, zeros) or module indices are converted to an unsigned type.\nThe __name__ scalar is not copied.\nThe excluded_gene and excluded_cell masks are not copied. Instead, if raw_cells_h5ad is specified, an is_excluded mask is created for both cells and genes, marking these that exist only in the raw_cells_h5ad and not in clean_cells_h5ad and metacells_h5ad.\nThe full_gene_index is not copied.\nThe properly_sampled_gene mask is renamed to the per-gene is_properly_sampled mask.\nThe bursty_lonely_gene mask is renamed to the per-gene is_bursty_lonely mask.\nThe lateral_gene mask is renamed to the per-gene is_lateral mask.\nThe noisy_gene mask is renamed to the per-gene is_noisy mask.\nThe rare_gene mask is renamed to the per-gene is_rare mask.\nThe rare_gene_module has 1 added to it (that is, \"no module\" is 0 in Daf) and is renamed to rare_module.\nThe lateral_genes_module has 1 added to it (that is, \"no module\" is 0 in Daf) and is renamed to lateral_module.\nThe marker_gene mask is renamed to the per-gene is_marker mask.\nThe selected_gene mask is renamed to the per-gene is_selected mask.\nThe ignored_gene mask is renamed to the per-gene is_ignored mask.\nThe ignored_gene_of_<type> masks are converted to an is_ignored mask per-gene-per-type.\nThe projected_noisy_gene mask is renamed to the per-gene is_projected_noisy mask.\nThe atlas_gene, atlas_lateral_gene, atlas_noisy_gene, atlas_marker_gene masks are renamed to the is_atlas, is_atlas_lateral, is_atlas_noisy and is_atlas_marker per-gene masks.\nThe essential_gene_of_<type> masks are converted to an is_essential mask per-gene-per-type.\nThe atlas_essential_gene_of_<type> masks are converted to an is_atlas_essential mask per-gene-per-type.\nThe fitted_gene_of_<type> masks are converted to an is_fitted mask per-gene-per-type.\nThe fitted mask per-gene-per-metacell is renamed to is_fitted.\nThe misfit mask per-gene-per-metacell is renamed to is_misfit.\nThe essential mask per-gene-per-metacell is renamed to is_essential.\nThe full_cell_index is not copied.\nThe properly_sampled_cell mask is renamed to the per-cell is_properly_sampled mask.\nThe rare_cell mask is renamed to the per-cell is_rare mask.\nThe cells_rare_gene_module has 1 added to it (that is, \"no module\" is 0 in Daf) and is renamed to rare_gene_module.\nThe per-cell dissolve mask is renamed to is_dissolved.\nThe per-cell metacell integer annotation is not copied, and the metacell_name string annotation is renamed to metacell.\nThe per-cell most_similar integer annotation is not copied, and the most_similar_name string annotation is renamed to metacell.most_similar.\nThe rare_metacell mask is renamed to the per-metacell is_rare mask.\nThe per-metacell metacells_level is renamed to level.\nThe per-metacell similar mask is renamed to is_similar.\n\nnote: Note\nThere is much duplication of data between the three h5ad files (in particular, per-gene data). Data in raw_cells_h5ad will override data in clean_cells_h5ad, which will override data in metacells_h5ad.\n\nData that exists only in clean_cells_h5ad poses a question when being copied into the full data set, which includes the full raw set of cells and genes. If copy_clean_data is nothing (the default), this is simply an error. Otherwise, data that is listed in copy_clean_data is copied using the specified name and the default value is applied to the raw-only genes or cells.\n\nnote: Note\nIt is common to call reconstruct_axis! on the result (e.g., if the cells were collected from a set of batches).\n\n\n\n\n\n","category":"function"},{"location":"anndata_format.html#Metacells.AnnDataFormat.CopyAnnData","page":"AnnData Format","title":"Metacells.AnnDataFormat.CopyAnnData","text":"Specify how to copy data from AnnData to Daf. The key is simply a vector or matrix name (ignoring axes), and the value is either nothing to ignore the data, or a tuple with the name of the destination Daf property and an optional value to use for missing entries (raw-only cells and/or genes).\n\n\n\n\n\n","category":"type"},{"location":"anndata_format.html#Index","page":"AnnData Format","title":"Index","text":"","category":"section"},{"location":"anndata_format.html","page":"AnnData Format","title":"AnnData Format","text":"Pages = [\"anndata_format.md\"]","category":"page"},{"location":"spheres.html#Spheres","page":"Spheres","title":"Spheres","text":"","category":"section"},{"location":"spheres.html","page":"Spheres","title":"Spheres","text":"Metacells.Spheres\nMetacells.Spheres.compute_spheres!","category":"page"},{"location":"spheres.html#Metacells.Spheres","page":"Spheres","title":"Metacells.Spheres","text":"Given a set of raw metacells, partition them into spheres such that all metacells in the same sphere are within some (fold factor) radius of each other. The centroids of these spheres can serve as a representation of the cell state manifold which is less sensitive to oversampling of common cell states. Group these spheres in overlapping neighborhoods of \"similar\" spheres for further analysis.\n\n\n\n\n\n","category":"module"},{"location":"spheres.html#Metacells.Spheres.compute_spheres!","page":"Spheres","title":"Metacells.Spheres.compute_spheres!","text":"function compute_spheres!(\n    daf::DafWriter;\n    min_significant_gene_UMIs::Integer = 40,\n    gene_fraction_regularization::AbstractFloat = 1e-5,\n    confidence::AbstractFloat = 0.9,\n    max_sphere_diameter::AbstractFloat = 2.0,\n    max_neighborhood_diameter::AbstractFloat = 2.0,\n    noisy_gene_fold::AbstractFloat = 1.0,\n    max_deviant_genes_fraction::AbstractFloat = 0.01,\n    overwrite::Bool = false,\n)::Nothing\n\nPartition raw metacells into distinct spheres, and spheres into overlapping neighborhoods.\n\nInitial spheres and neighborhoods are computed in a first round, and then refined in a series of followup rounds.\nIn each round, we compute a distance between each two metacells. This is based on the fold factor between the expression level of each (relevant) gene in the metacells. The fold factor is the absolute value of the difference in the log (base 2) of the fraction of the gene in the metacells. This log is computed with the gene_fraction_regularization (by default, 1e-5). Since the fraction of the gene is a random variable, we decrease the high fraction and increase the low fraction by a factor based on the confidence of the test (by default, 0.9), assuming a multinomial distribution. In addition, if the sum of the total UMIs of the gene in both metacells is less than min_significant_gene_UMIs (by default, 40), we ignore this fold factor as insignificant. Finally, for noisy genes, we reduce the fold factor by noisy_gene_fold. In the first round, we simply count the number of genes whose fold factor is more than max_sphere_diameter (for computing spheres) and max_sphere_diameter + max_neighborhood_diameter (for computing neighborhoods). In the followup rounds, we use the maximal gene fold, for genes that are correlated in the vicinity of the metacells (see below).\nWe use hierarchical clustering to partition the metacells to distinct spheres, such that the maximal distance between any metacells in the sphere is bounded. In the first round, this bound is the max_deviant_genes_fraction out of the total number of genes. In the followup rounds, this is the max_sphere_diameter.\nFor each sphere, we compute a main neighborhood of other spheres such that the maximal distance between any metacells in the neighborhood is bounded. In the first round, this bound is again the maximal number of deviant genes (this time, using the increased fold distance computed above). In the followup rounds, this is the max_sphere_diameter plus the max_neighborhood_diameter. These neighborhoods may overlap. The main neighborhoods of different spheres may even be identical.\nFor each sphere, we compute the set of genes which have at least the min_gene_correlation with some other gene(s) in its main neighborhood. We restrict the correlated set of genes of each metacell to be the intersection of this set with the set from its sphere in the previous round.\nIf the new sets of correlated genes are identical to the previous round, we are done. Otherwise we repeat the round, using the more restricted sets of correlated genes.\n\nIf overwrite is set, the results will replace any previously computed spheres and neighborhoods.\n\nCONTRACT\n\n\n\n\n\n","category":"function"},{"location":"spheres.html#Index","page":"Spheres","title":"Index","text":"","category":"section"},{"location":"spheres.html","page":"Spheres","title":"Spheres","text":"Pages = [\"spheres.md\"]","category":"page"},{"location":"index.html#Metacells","page":"Metacells","title":"Metacells","text":"","category":"section"},{"location":"index.html","page":"Metacells","title":"Metacells","text":"Metacells.Metacells","category":"page"},{"location":"index.html#Metacells.Metacells","page":"Metacells","title":"Metacells.Metacells","text":"The Metacells.jl package provides computational services for the metacells package, using Daf to hold the data. In the future, we'll ideally migrate all of the metacellspackage computations to this package, converting the Python package to a thin wrapper, and provide a similar thin R wrapper to provide metacell analysis from R as well. For now,Metacells.jlonly provides a subset of the features of the Pythonmetacellspackage, which requires users to convert data fromAnnData(for the old features) toDaf (to the new features).\n\n\n\n\n\n","category":"module"},{"location":"index.html#Index","page":"Metacells","title":"Index","text":"","category":"section"},{"location":"index.html","page":"Metacells","title":"Metacells","text":"","category":"page"}]
+[{"location":"identify_genes.html#Identify-Genes","page":"Identify Genes","title":"Identify Genes","text":"","category":"section"},{"location":"identify_genes.html","page":"Identify Genes","title":"Identify Genes","text":"Metacells.IdentifyGenes\nMetacells.IdentifyGenes.identify_marker_genes!\nMetacells.IdentifyGenes.identify_correlated_genes!","category":"page"},{"location":"identify_genes.html#Metacells.IdentifyGenes","page":"Identify Genes","title":"Metacells.IdentifyGenes","text":"Identify special genes.\n\n\n\n\n\n","category":"module"},{"location":"identify_genes.html#Metacells.IdentifyGenes.identify_marker_genes!","page":"Identify Genes","title":"Metacells.IdentifyGenes.identify_marker_genes!","text":"function identify_marker_genes!(\n    daf::DafWriter;\n    gene_fraction_regularization::AbstractFloat = 1e-5,\n    min_gene_range_fold::AbstractFloat = 2.0,\n    noisy_gene_fold::AbstractFloat = 1.0,\n    min_max_marker_gene_fraction::AbstractFloat = 1e-4,\n    overwrite::Bool = false,\n)::Nothing\n\nIdentify the genes that distinguish at least one metacell from the rest. Such genes are called \"marker\" genes as they (potentially) mark specific cell states. If overwrite, will overwrite an existing is_marker mask.\n\nCompute the minimal and maximal expression level of each gene.\nSelect the genes whose fold factor (log2 of maximal over minimal value, using the gene_fraction_regularization is at least min_marker_gene_range_fold. For is_noisy genes, we require an additional noisy_gene_fold.\nIdentify the genes whose maximal expression is at least min_max_marker_gene_fraction.\n\nCONTRACT\n\n\n\n\n\n","category":"function"},{"location":"identify_genes.html#Metacells.IdentifyGenes.identify_correlated_genes!","page":"Identify Genes","title":"Metacells.IdentifyGenes.identify_correlated_genes!","text":"function identify_correlated_genes!(\n    daf::DafWriter;\n    gene_fraction_regularization::AbstractFloat = 1e-5,\n    min_gene_correlation::AbstractFloat = 0.5,\n    overwrite::Bool = false,\n)::Nothing\n\nIdentify genes that are correlated with other gene(s). Such genes are good candidates for looking for groups of genes that act together. If overwrite, will overwrite an existing is_correlated mask.\n\nCompute the log base 2 of the genes expression in each metacell (using the gene_fraction_regularization).\nCorrelate this between all the pairs of genes.\nFind the maximal absolute correlation for each gene (that is, strong anti-correlation also counts).\nIdentify the genes which have at least one gene with a correlation of at least min_gene_correlation.\n\nCONTRACT\n\n\n\n\n\n","category":"function"},{"location":"identify_genes.html#Index","page":"Identify Genes","title":"Index","text":"","category":"section"},{"location":"identify_genes.html","page":"Identify Genes","title":"Identify Genes","text":"Pages = [\"identify_genes.md\"]","category":"page"},{"location":"anndata_format.html#AnnData-Format","page":"AnnData Format","title":"AnnData Format","text":"","category":"section"},{"location":"anndata_format.html","page":"AnnData Format","title":"AnnData Format","text":"Metacells.AnnDataFormat\nMetacells.AnnDataFormat.import_h5ads!\nMetacells.AnnDataFormat.CopyAnnData","category":"page"},{"location":"anndata_format.html#Metacells.AnnDataFormat","page":"AnnData Format","title":"Metacells.AnnDataFormat","text":"Import and export metacells data from/to h5ad files. This allows moving data between the old Python/C++ based AnnData world and the new Julia based Daf world.\n\n\n\n\n\n","category":"module"},{"location":"anndata_format.html#Metacells.AnnDataFormat.import_h5ads!","page":"AnnData Format","title":"Metacells.AnnDataFormat.import_h5ads!","text":"function import_h5ads!(\n    destination::DafWriter;\n    raw_cells_h5ad::Maybe{AbstractString} = nothing,\n    clean_cells_h5ad::AbstractString,\n    metacells_h5ad::AbstractString,\n    copy_clean_data::Maybe{CopyAnnData} = nothing,\n    type_property::Maybe{AbstractString} = nothing,\n    rename_type::Maybe{AbstractString} = \"type\",\n    type_colors_csv::Maybe{AbstractString} = nothing,\n    type_properties::Maybe{AbstractSet{<:AbstractString}} = nothing,\n    properties_defaults::Maybe{Dict} = nothing,\n)::Nothing\n\nImport an AnnData based metacells dataset into a Daf destination data set. Ideally, the input must include clean_cells_h5ad and the metacells_h5ad computed for them, and optionally also the raw_cells_h5ad including the excluded cells and genes.\n\nIf type annotations were assigned to the metacells, then the name of the type_property should be specified. This can be further enhanced by specifying a type_colors_csv file mapping type names to colors. This should be a comma or tab separated file containing at least two columns, one named \"color\" and one with the same name as the type_property. For consistency, by default the type_property is renamed to the value of rename_type (by default, \"type\"). You can disable this by setting rename_type to nothing. We also call reconstruct_axis! to build the type axis; you can therefore specify an empty_type name, which will be converted to the empty string, to match the Daf convention of \"no value\" for string data, and specify an explicit set of type_properties (by default, any per-metacell property that has the same value for all metacells of each type will be converted to a type property) and properties_defaults.\n\nThis will mostly just read all the specified h5ad files and copy the data into the destination, with the following changes to match the Daf capabilities and conventions:\n\nThe X matrix of the cells is renamed to UMIs, and the X matrix of the metacells is renamed to fraction.\nMatrices and vectors of counts (UMIs, zeros) or module indices are converted to an unsigned type.\nThe __name__ scalar is not copied.\nThe excluded_gene and excluded_cell masks are not copied. Instead, if raw_cells_h5ad is specified, an is_excluded mask is created for both cells and genes, marking these that exist only in the raw_cells_h5ad and not in clean_cells_h5ad and metacells_h5ad.\nThe full_gene_index is not copied.\nThe properly_sampled_gene mask is renamed to the per-gene is_properly_sampled mask.\nThe bursty_lonely_gene mask is renamed to the per-gene is_bursty_lonely mask.\nThe lateral_gene mask is renamed to the per-gene is_lateral mask.\nThe noisy_gene mask is renamed to the per-gene is_noisy mask.\nThe rare_gene mask is renamed to the per-gene is_rare mask.\nThe rare_gene_module has 1 added to it (that is, \"no module\" is 0 in Daf) and is renamed to rare_module.\nThe lateral_genes_module has 1 added to it (that is, \"no module\" is 0 in Daf) and is renamed to lateral_module.\nThe marker_gene mask is renamed to the per-gene is_marker mask.\nThe selected_gene mask is renamed to the per-gene is_selected mask.\nThe ignored_gene mask is renamed to the per-gene is_ignored mask.\nThe ignored_gene_of_<type> masks are converted to an is_ignored mask per-gene-per-type.\nThe projected_noisy_gene mask is renamed to the per-gene is_projected_noisy mask.\nThe atlas_gene, atlas_lateral_gene, atlas_noisy_gene, atlas_marker_gene masks are renamed to the is_atlas, is_atlas_lateral, is_atlas_noisy and is_atlas_marker per-gene masks.\nThe essential_gene_of_<type> masks are converted to an is_essential mask per-gene-per-type.\nThe atlas_essential_gene_of_<type> masks are converted to an is_atlas_essential mask per-gene-per-type.\nThe fitted_gene_of_<type> masks are converted to an is_fitted mask per-gene-per-type.\nThe fitted mask per-gene-per-metacell is renamed to is_fitted.\nThe misfit mask per-gene-per-metacell is renamed to is_misfit.\nThe essential mask per-gene-per-metacell is renamed to is_essential.\nThe full_cell_index is not copied.\nThe properly_sampled_cell mask is renamed to the per-cell is_properly_sampled mask.\nThe rare_cell mask is renamed to the per-cell is_rare mask.\nThe cells_rare_gene_module has 1 added to it (that is, \"no module\" is 0 in Daf) and is renamed to rare_gene_module.\nThe per-cell dissolve mask is renamed to is_dissolved.\nThe per-cell metacell integer annotation is not copied, and the metacell_name string annotation is renamed to metacell.\nThe per-cell most_similar integer annotation is not copied, and the most_similar_name string annotation is renamed to metacell.most_similar.\nThe rare_metacell mask is renamed to the per-metacell is_rare mask.\nThe per-metacell metacells_level is renamed to level.\nThe per-metacell similar mask is renamed to is_similar.\n\nnote: Note\nThere is much duplication of data between the three h5ad files (in particular, per-gene data). Data in raw_cells_h5ad will override data in clean_cells_h5ad, which will override data in metacells_h5ad.\n\nData that exists only in clean_cells_h5ad poses a question when being copied into the full data set, which includes the full raw set of cells and genes. If copy_clean_data is nothing (the default), this is simply an error. Otherwise, data that is listed in copy_clean_data is copied using the specified name and the default value is applied to the raw-only genes or cells.\n\nnote: Note\nIt is common to call reconstruct_axis! on the result (e.g., if the cells were collected from a set of batches).\n\n\n\n\n\n","category":"function"},{"location":"anndata_format.html#Metacells.AnnDataFormat.CopyAnnData","page":"AnnData Format","title":"Metacells.AnnDataFormat.CopyAnnData","text":"Specify how to copy data from AnnData to Daf. The key is simply a vector or matrix name (ignoring axes), and the value is either nothing to ignore the data, or a tuple with the name of the destination Daf property and an optional value to use for missing entries (raw-only cells and/or genes).\n\n\n\n\n\n","category":"type"},{"location":"anndata_format.html#Index","page":"AnnData Format","title":"Index","text":"","category":"section"},{"location":"anndata_format.html","page":"AnnData Format","title":"AnnData Format","text":"Pages = [\"anndata_format.md\"]","category":"page"},{"location":"spheres.html#Spheres","page":"Spheres","title":"Spheres","text":"","category":"section"},{"location":"spheres.html","page":"Spheres","title":"Spheres","text":"Metacells.Spheres\nMetacells.Spheres.compute_spheres!","category":"page"},{"location":"spheres.html#Metacells.Spheres","page":"Spheres","title":"Metacells.Spheres","text":"Given a set of raw metacells, partition them into spheres such that all metacells in the same sphere are within some (fold factor) radius of each other. The centroids of these spheres can serve as a representation of the cell state manifold which is less sensitive to oversampling of common cell states. Group these spheres in overlapping neighborhoods of \"similar\" spheres for further analysis.\n\n\n\n\n\n","category":"module"},{"location":"spheres.html#Metacells.Spheres.compute_spheres!","page":"Spheres","title":"Metacells.Spheres.compute_spheres!","text":"function compute_spheres!(\n    daf::DafWriter;\n    min_significant_gene_UMIs::Integer = 40,\n    gene_fraction_regularization::AbstractFloat = 1e-5,\n    confidence::AbstractFloat = 0.9,\n    max_sphere_diameter::AbstractFloat = 2.0,\n    max_neighborhood_diameter::AbstractFloat = 2.0,\n    noisy_gene_fold::AbstractFloat = 1.0,\n    min_gene_correlation::AbstractFloat = 0.5,\n    max_deviant_genes_fraction::AbstractFloat = 0.01,\n    overwrite::Bool = false,\n)::Nothing\n\nPartition raw metacells into distinct spheres, and spheres into overlapping neighborhoods.\n\nInitial spheres and neighborhoods are computed in a first round, and then refined in a series of followup rounds.\nIn each round, we compute a distance between each two metacells. This is based on the fold factor between the expression level of each (relevant) gene in the metacells. The fold factor is the absolute value of the difference in the log (base 2) of the fraction of the gene in the metacells. This log is computed with the gene_fraction_regularization (by default, 1e-5). Since the fraction of the gene is a random variable, we decrease the high fraction and increase the low fraction by a factor based on the confidence of the test (by default, 0.9), assuming a multinomial distribution. In addition, if the sum of the total UMIs of the gene in both metacells is less than min_significant_gene_UMIs (by default, 40), we ignore this fold factor as insignificant. Finally, for noisy genes, we reduce the fold factor by noisy_gene_fold. In the first round, we simply count the number of genes whose fold factor is more than max_sphere_diameter (for computing spheres) and max_sphere_diameter + max_neighborhood_diameter (for computing neighborhoods). In the followup rounds, we use the maximal gene fold, for genes that are correlated in the vicinity of the metacells (see below).\nWe use hierarchical clustering to partition the metacells to distinct spheres, such that the maximal distance between any metacells in the sphere is bounded. In the first round, this bound is the max_deviant_genes_fraction out of the total number of genes. In the followup rounds, this is the max_sphere_diameter.\nFor each sphere, we compute a main neighborhood of other spheres such that the maximal distance between any metacells in the neighborhood is bounded. In the first round, this bound is again the maximal number of deviant genes (this time, using the increased fold distance computed above). In the followup rounds, this is the max_sphere_diameter plus the max_neighborhood_diameter. These neighborhoods may overlap. The main neighborhoods of different spheres may even be identical.\nFor each sphere, we compute the set of genes which have at least the min_gene_correlation with some other gene(s) in its main neighborhood. We restrict the correlated set of genes of each metacell to be the intersection of this set with the set from its sphere in the previous round.\nIf the new sets of correlated genes are identical to the previous round, we are done. Otherwise we repeat the round, using the more restricted sets of correlated genes.\n\nIf overwrite is set, the results will replace any previously computed spheres and neighborhoods.\n\nCONTRACT\n\n\n\n\n\n","category":"function"},{"location":"spheres.html#Index","page":"Spheres","title":"Index","text":"","category":"section"},{"location":"spheres.html","page":"Spheres","title":"Spheres","text":"Pages = [\"spheres.md\"]","category":"page"},{"location":"index.html#Metacells","page":"Metacells","title":"Metacells","text":"","category":"section"},{"location":"index.html","page":"Metacells","title":"Metacells","text":"Metacells.Metacells","category":"page"},{"location":"index.html#Metacells.Metacells","page":"Metacells","title":"Metacells.Metacells","text":"The Metacells.jl package provides computational services for the metacells package, using Daf to hold the data. In the future, we'll ideally migrate all of the metacellspackage computations to this package, converting the Python package to a thin wrapper, and provide a similar thin R wrapper to provide metacell analysis from R as well. For now,Metacells.jlonly provides a subset of the features of the Pythonmetacellspackage, which requires users to convert data fromAnnData(for the old features) toDaf (to the new features).\n\n\n\n\n\n","category":"module"},{"location":"index.html#Index","page":"Metacells","title":"Index","text":"","category":"section"},{"location":"index.html","page":"Metacells","title":"Metacells","text":"","category":"page"}]
 }
diff --git a/docs/v0.1.0/spheres.html b/docs/v0.1.0/spheres.html
index fa3fe20..c257de4 100644
--- a/docs/v0.1.0/spheres.html
+++ b/docs/v0.1.0/spheres.html
@@ -166,6 +166,7 @@ <h1 id="Spheres">
     max_sphere_diameter::AbstractFloat = 2.0,
     max_neighborhood_diameter::AbstractFloat = 2.0,
     noisy_gene_fold::AbstractFloat = 1.0,
+    min_gene_correlation::AbstractFloat = 0.5,
     max_deviant_genes_fraction::AbstractFloat = 0.01,
     overwrite::Bool = false,
 )::Nothing
diff --git a/improved_spheres.auto.log b/improved_spheres.auto.log
new file mode 100644
index 0000000..3824752
--- /dev/null
+++ b/improved_spheres.auto.log
@@ -0,0 +1,24 @@
+ERROR: SystemError: opening file "/Users/obk/projects/Metacells.jl/scripts/improved_spheres.jl": No such file or directory
+Stacktrace:
+  [1] systemerror(p::String, errno::Int32; extrainfo::Nothing)
+    @ Base ./error.jl:176
+  [2] systemerror
+    @ Base ./error.jl:175 [inlined]
+  [3] open(fname::String; lock::Bool, read::Nothing, write::Nothing, create::Nothing, truncate::Nothing, append::Nothing)
+    @ Base ./iostream.jl:293
+  [4] open
+    @ Base ./iostream.jl:275 [inlined]
+  [5] open(f::Base.var"#433#434"{String}, args::String; kwargs::@Kwargs{})
+    @ Base ./io.jl:394
+  [6] open
+    @ Base ./io.jl:393 [inlined]
+  [7] read
+    @ Base ./io.jl:486 [inlined]
+  [8] _include(mapexpr::Function, mod::Module, _path::String)
+    @ Base ./loading.jl:2126
+  [9] include(mod::Module, _path::String)
+    @ Base ./Base.jl:495
+ [10] exec_options(opts::Base.JLOptions)
+    @ Base ./client.jl:318
+ [11] _start()
+    @ Base ./client.jl:552
diff --git a/src/anndata_format.jl b/src/anndata_format.jl
index e52c194..d9d749b 100644
--- a/src/anndata_format.jl
+++ b/src/anndata_format.jl
@@ -102,7 +102,7 @@ METACELLS_SQUARE_DATA = CopyAnnData(["obs_outgoing_weights" => ("outgoing_weight
         type_property::Maybe{AbstractString} = nothing,
         rename_type::Maybe{AbstractString} = "type",
         type_colors_csv::Maybe{AbstractString} = nothing,
-        type_properties::Maybe{AbstractStringSet} = nothing,
+        type_properties::Maybe{AbstractSet{<:AbstractString}} = nothing,
         properties_defaults::Maybe{Dict} = nothing,
     )::Nothing
 
@@ -186,7 +186,7 @@ to the raw-only genes or cells.
     rename_type::Maybe{AbstractString} = "type",
     empty_type::Maybe{AbstractString} = nothing,
     type_colors_csv::Maybe{AbstractString} = nothing,
-    type_properties::Maybe{AbstractStringSet} = nothing,
+    type_properties::Maybe{AbstractSet{<:AbstractString}} = nothing,
     properties_defaults::Maybe{Dict} = nothing,
 )::Nothing
     metacells_daf =  # NOJET
@@ -279,7 +279,7 @@ function import_metacells(
     rename_type::Maybe{AbstractString},
     empty_type::Maybe{AbstractString},
     type_colors_csv::Maybe{AbstractString},
-    type_properties::Maybe{AbstractStringSet},
+    type_properties::Maybe{AbstractSet{<:AbstractString}},
     properties_defaults::Maybe{Dict},
 )::Nothing
     copy_axis!(; destination = destination, source = metacells_daf, axis = "metacell")
@@ -325,7 +325,7 @@ function import_metacell_types(
     rename_type::Maybe{AbstractString},
     empty_type::Maybe{AbstractString},
     type_colors_csv::Maybe{AbstractString},
-    type_properties::Maybe{AbstractStringSet},
+    type_properties::Maybe{AbstractSet{<:AbstractString}},
     properties_defaults::Maybe{Dict},
 )::Nothing
     if rename_type === nothing
@@ -510,7 +510,7 @@ function import_mask_matrix(
     destination::DafWriter,
     source::DafReader,
     rename_type::AbstractString,
-    type_names::AbstractStringVector,
+    type_names::AbstractVector{<:AbstractString},
     prefix::AbstractString,
 )::Nothing
     any_exist = false
diff --git a/src/spheres.jl b/src/spheres.jl
index 78675b2..41afcc8 100644
--- a/src/spheres.jl
+++ b/src/spheres.jl
@@ -29,6 +29,7 @@ using Statistics
         max_sphere_diameter::AbstractFloat = 2.0,
         max_neighborhood_diameter::AbstractFloat = 2.0,
         noisy_gene_fold::AbstractFloat = 1.0,
+        min_gene_correlation::AbstractFloat = 0.5,
         max_deviant_genes_fraction::AbstractFloat = 0.01,
         overwrite::Bool = false,
     )::Nothing
@@ -254,8 +255,8 @@ end
 
 @logged function write_data(  # untested
     daf::DafWriter;
-    sphere_names::AbstractStringVector,
-    neighborhood_names::AbstractStringVector,
+    sphere_names::AbstractVector{<:AbstractString},
+    neighborhood_names::AbstractVector{<:AbstractString},
     spheres_of_metacells::Vector{UInt32},
     main_neighborhoods_of_spheres::Vector{UInt32},
     is_member_of_spheres_in_neighborhoods::AbstractMatrix{Bool},