Skip to content

Commit

Permalink
Fix types, optimize, get rid of spheres data.
Browse files Browse the repository at this point in the history
  • Loading branch information
orenbenkiki committed Apr 10, 2024
1 parent b2022e2 commit 2e76969
Show file tree
Hide file tree
Showing 8 changed files with 94 additions and 100 deletions.
4 changes: 2 additions & 2 deletions deps/test.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
set -e -o pipefail
deps/clean.sh
JULIA_NUM_THREADS=4 julia --color=no --code-coverage=tracefile.info deps/test.jl "$@" \
rm -rf tracefile.info src/*.cov src/*/*.cov test/*.cov
JULIA_DEBUG="" JULIA_NUM_THREADS=4 julia --color=no --code-coverage=tracefile.info deps/test.jl "$@" \
|| (deps/clean.sh && false)
5 changes: 5 additions & 0 deletions deps/untested_lines.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
#!/bin/bash
set -e -o pipefail
if [ `echo */*.cov` = '*/*.cov' ]
then
echo "No coverage!"
exit 0
fi
grep -H -n '.' */*.cov \
| sed 's/\.[0-9][0-9]*\.cov:\([0-9][0-9]*\): [ ]*\([^ ]*\) /`\1`\2`/' \
| sort -t '`' -k '1,1' -k '2n,2' \
Expand Down
2 changes: 1 addition & 1 deletion docs/v0.1.0/.documenter-siteinfo.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"documenter":{"julia_version":"1.10.2","generation_timestamp":"2024-04-08T10:15:27","documenter_version":"1.3.0"}}
{"documenter":{"julia_version":"1.10.2","generation_timestamp":"2024-04-10T08:30:47","documenter_version":"1.3.0"}}
2 changes: 2 additions & 0 deletions docs/v0.1.0/anndata_format.html
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ <h1 id="AnnData-Format">
<code>fraction
</code>.
</li>
<li>Matrices and vectors of counts (UMIs, zeros) or module indices are converted to an unsigned type.
</li>
<li>The
<code>__name__
</code> scalar is not copied.
Expand Down
2 changes: 1 addition & 1 deletion docs/v0.1.0/search_index.js

Large diffs are not rendered by default.

6 changes: 0 additions & 6 deletions docs/v0.1.0/spheres.html
Original file line number Diff line number Diff line change
Expand Up @@ -189,12 +189,6 @@ <h1 id="Neighborhoods">
</code>, or 8x).
</p>
</li>
<li>
<p>Compute the centroid (geomean) of each gene&#39;s fraction of each sphere, using the same
<code>gene_fraction_regularization
</code> as above.
</p>
</li>
</ol>
<p>CONTRACT
</p>
Expand Down
62 changes: 42 additions & 20 deletions src/anndata_format.jl
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,29 @@ GENE_VECTORS_DATA = CopyData([
"atlas_marker_gene" => ("is_atlas_marker", false),
"atlas_noisy_gene" => ("is_atlas_noisy", false),
"bursty_lonely_gene" => ("is_bursty_lonely", false),
"correction_factor" => ("correction_factor", 0.0),
"correction_factor" => ("correction_factor", Float32(0.0)),
"excluded_gene" => nothing,
"full_gene_index" => nothing,
"fitted" => ("is_fitted", false),
"ignored_gene" => ("is_ignored", false),
"lateral_gene" => ("is_lateral", false),
"lateral_genes_module" => ("lateral_module", 0),
"lateral_genes_module" => ("lateral_module", UInt32(0)),
"marker_gene" => ("is_marker", false),
"noisy_gene" => ("is_noisy", false),
"projected_noisy_gene" => ("is_projected_noisy", false),
"properly_sampled_gene" => ("is_properly_sampled", false),
"rare_gene" => ("is_rare", false),
"rare_gene_module" => ("rare_module", 0),
"selected_gene" => ("is_selected", false),
"significant_inner_folds_count" => ("significant_inner_folds_count", 0),
"significant_inner_folds_count" => ("significant_inner_folds_count", UInt32(0)),
])

CELL_VECTORS_DATA = CopyData([
"cell_type" => ("projected_type", ""),
"cells_rare_gene_module" => ("rare_gene_module", 0),
"cells_rare_gene_module" => ("rare_gene_module", UInt32(0)),
"dissolve" => ("is_dissolved", false),
"excluded_cell" => nothing,
"excluded_umis" => ("excluded_UMIs", 0),
"excluded_umis" => ("excluded_UMIs", UInt32(0)),
"full_cell_index" => nothing,
"metacell" => nothing,
"metacell_name" => ("metacell", ""),
Expand All @@ -59,27 +59,29 @@ METACELL_VECTORS_DATA = CopyData([
"rare_metacell" => ("is_rare", nothing),
"similar" => ("is_similar", nothing),
"total_umis" => ("total_UMIs", nothing),
"__zero" => ("total_UMIs", nothing),
"__zeros_downsample_umis" => ("__zeros_downsample_UMIs", nothing),
])

CELLS_MATRICES_DATA = CopyData([
"deviant_fold" => ("deviant_fold", 0.0),
"inner_fold" => ("inner_fold", 0.0),
"inner_stdev_log" => ("inner_stdev_log", 0.0),
"UMIs" => ("UMIs", 0.0),
"deviant_fold" => ("deviant_fold", Float32(0.0)),
"inner_fold" => ("inner_fold", Float32(0.0)),
"inner_stdev_log" => ("inner_stdev_log", Float32(0.0)),
"UMIs" => ("UMIs", UInt32(0)),
])

METACELLS_MATRICES_DATA = CopyData([
"corrected_fraction" => ("corrected_fraction", 0.0),
"corrected_fraction" => ("corrected_fraction", Float32(0.0)),
"essential" => ("is_essential", false),
"fitted" => ("is_fitted", false),
"fraction" => ("fraction", false),
"inner_fold" => ("inner_fold", 0.0),
"inner_stdev_log" => ("inner_stdev_log", 0.0),
"fraction" => ("fraction", Float32(0.0)),
"inner_fold" => ("inner_fold", Float32(0.0)),
"inner_stdev_log" => ("inner_stdev_log", Float32(0.0)),
"misfit" => ("is_misfit", false),
"projected_fold" => ("projected_fold", 0.0),
"projected_fraction" => ("projected_fraction", 0.0),
"total_umis" => ("total_UMIs", 0.0),
"zeros" => ("zeros", 0.0),
"projected_fold" => ("projected_fold", Float32(0.0)),
"projected_fraction" => ("projected_fraction", Float32(0.0)),
"total_umis" => ("total_UMIs", UInt32(0)),
"zeros" => ("zeros", UInt32(0)),
])

METACELLS_SQUARE_DATA = CopyData(["obs_outgoing_weights" => ("outgoing_weights", nothing)])
Expand Down Expand Up @@ -114,6 +116,7 @@ This will mostly just read all the specified `h5ad` files and copy the data into
changes to match the ``Daf`` capabilities and conventions:
- The `X` matrix of the cells is renamed to `UMIs`, and the `X` matrix of the metacells is renamed to `fraction`.
- Matrices and vectors of counts (UMIs, zeros) or module indices are converted to an unsigned type.
- The `__name__` scalar is not copied.
- The `excluded_gene` and `excluded_cell` masks are not copied. Instead, if `raw_cells_h5ad` is specified, an
`is_excluded` mask is created for both cells and genes, marking these that exist only in the `raw_cells_h5ad` and
Expand Down Expand Up @@ -372,12 +375,22 @@ function copy_vectors(
vector = [name in valid_names ? name : "" for name in vector]
set_vector!(source, axis, vector_name, vector; overwrite = true)
end
copy_vector!(; # NOJET

if empty !== nothing
dtype = typeof(empty)
elseif contains(rename, "UMIs")
dtype = UInt32
else
dtype = nothing
end

copy_vector!(; # NOJET # NOLINT
destination = destination,
source = source,
axis = axis,
name = vector_name,
rename = rename,
dtype = dtype,
empty = empty,
)
end
Expand Down Expand Up @@ -414,13 +427,22 @@ function copy_matrices(
if data !== nothing
rename, empty = data
if !has_matrix(destination, rows_axis, columns_axis, rename; relayout = true)
copy_matrix!(; # NOJET
if empty !== nothing
dtype = typeof(empty)
elseif contains(rename, "UMIs")
dtype = UInt32
else
dtype = nothing
end

copy_matrix!(; # NOJET # NOLINT
destination = destination,
source = source,
rows_axis = rows_axis,
columns_axis = columns_axis,
name = matrix_name,
rename = rename,
dtype = dtype,
empty = empty,
relayout = true,
)
Expand Down Expand Up @@ -457,7 +479,7 @@ function copy_mask_matrix(
mask_matrix::SparseMatrixCSC{Bool} = hcat(mask_vectors...) # NOJET
mask_name = "is_$(prefix)"
set_matrix!(source, "gene", "type", mask_name, mask_matrix; relayout = false)
return copy_matrix!(;
return copy_matrix!(; # NOJET
destination = destination,
source = source,
rows_axis = "gene",
Expand Down
111 changes: 41 additions & 70 deletions src/spheres.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ Partition raw metacells into distinct spheres, using a variant of union-find:
3. Pass on all metacell pairs, ordered by increasing distance. If the metacells belong to different spheres, merge
both spheres into a single one if the maximal distance between the metacells of the combined sphere is no more than
`max_sphere_fold_factor` (by default, `3.0`, or 8x).
4. Compute the centroid (geomean) of each gene's fraction of each sphere, using the same
`gene_fraction_regularization` as above.
CONTRACT
"""
Expand All @@ -53,13 +51,6 @@ CONTRACT
"The total number of UMIs used to estimate the fraction of each gene in each metacell.",
),
("metacell", "sphere") => (GuaranteedOutput, AbstractString, "The unique sphere each metacell belongs to."),
("sphere", "gene", "fraction") =>
(GuaranteedOutput, AbstractFloat, "The centroid fraction of each gene in each sphere."),
("sphere", "gene", "total_UMIs") => (
GuaranteedOutput,
AbstractFloat,
"The total number of UMIs used to estimate the fraction of each gene in each sphere.",
),
],
) function compute_spheres!(
daf::DafWriter;
Expand All @@ -70,8 +61,11 @@ CONTRACT
@assert max_sphere_fold_factor > 0
@assert gene_fraction_regularization > 0

genes_metacells_log_fraction = daf["/ gene / metacell : fraction % Log base 2 eps $(gene_fraction_regularization)"]
genes_metacells_total_UMIs = get_matrix(daf, "gene", "metacell", "total_UMIs")
@debug "read..."

genes_metacells_log_fraction =
daf["/ gene / metacell : fraction % Log base 2 eps $(gene_fraction_regularization)"].array
genes_metacells_total_UMIs = get_matrix(daf, "gene", "metacell", "total_UMIs").array

check_efficient_action("compute_spheres", Columns, "genes_metacells_log_fraction", genes_metacells_log_fraction)
check_efficient_action("compute_spheres", Columns, "genes_metacells_total_UMIs", genes_metacells_total_UMIs)
Expand All @@ -94,52 +88,59 @@ CONTRACT

metacells_of_spheres = collect_group_members(spheres_of_metacells)

@debug "group..."

sphere_names = group_names(daf, "metacell", metacells_of_spheres; prefix = "S")
add_axis!(daf, "sphere", sphere_names)
set_vector!(daf, "metacell", "sphere", sphere_names[spheres_of_metacells])

genes_spheres_fraction, genes_spheres_total_UMIs = compute_spheres_data(
gene_fraction_regularization,
metacells_of_spheres,
genes_metacells_log_fraction,
genes_metacells_total_UMIs,
)
set_matrix!(daf, "gene", "sphere", "fraction", genes_spheres_fraction)
set_matrix!(daf, "gene", "sphere", "total_UMIs", genes_spheres_total_UMIs)

return nothing
end

function compute_sorted_distance_pairs( # untested
@logged function compute_sorted_distance_pairs( # untested
daf::DafReader,
max_sphere_fold_factor::AbstractFloat,
min_significant_gene_UMIs::Unsigned,
genes_metacells_log_fraction::AbstractMatrix{<:AbstractFloat},
genes_metacells_total_UMIs::AbstractMatrix{<:Unsigned},
)::Vector{Tuple{Float32, UInt32, UInt32}}
distance_pairs = Vector{Tuple{Float32, UInt32, UInt32}}()

n_metacells = axis_length(daf, "metacell")
for first_metacell in 1:n_metacells
for second_metacell in 1:(first_metacell - 1)
distance = compute_distance(
first_metacell,
second_metacell,
min_significant_gene_UMIs,
genes_metacells_log_fraction,
genes_metacells_total_UMIs,
)
if distance <= max_sphere_fold_factor
push!(distance_pairs, (distance, first_metacell, second_metacell))
n_pairs = div(n_metacells * (n_metacells - 1), 2)

distance_pairs = Vector{Tuple{Float32, UInt32, UInt32}}(undef, n_pairs)
next_pair_index = Atomic{Int64}(1)

@debug "collect close distance pairs..."
@threads for base_metacell in reverse(2:n_metacells)
@views genes_log_fraction_of_base_metacell = genes_metacells_log_fraction[:, base_metacell]
@views genes_log_fraction_of_other_metacells = genes_metacells_log_fraction[:, 1:(base_metacell - 1)]
@views genes_total_UMIs_of_base_metacell = genes_metacells_total_UMIs[:, base_metacell]
@views genes_total_UMIs_of_other_metacells = genes_metacells_total_UMIs[:, 1:(base_metacell - 1)]

fold_factors_of_other_metacells =
abs.(genes_log_fraction_of_other_metacells .- genes_log_fraction_of_base_metacell) .*
(genes_total_UMIs_of_other_metacells .+ genes_total_UMIs_of_base_metacell .>= min_significant_gene_UMIs)
distances_of_other_metacells = maximum(fold_factors_of_other_metacells; dims = 1) # NOJET
is_close_of_other_metacells = distances_of_other_metacells .<= max_sphere_fold_factor
n_close = sum(is_close_of_other_metacells)
pair_index = atomic_add!(next_pair_index, n_close)
for (other_metacell, is_close) in enumerate(is_close_of_other_metacells)
if is_close
distance = distances_of_other_metacells[other_metacell]
distance_pairs[pair_index] = (distance, base_metacell, other_metacell)
pair_index += 1
end
end
end

n_close = next_pair_index[] - 1
resize!(distance_pairs, n_close)
@debug "collected $(n_close) close distance pairs, sort..."
sort!(distance_pairs)
return distance_pairs
end

function compute_sphere_of_metacells( # untested
@logged function compute_sphere_of_metacells( # untested
sorted_distance_pairs::Vector{Tuple{Float32, UInt32, UInt32}},
max_sphere_fold_factor::AbstractFloat,
min_significant_gene_UMIs::Unsigned,
Expand Down Expand Up @@ -218,15 +219,14 @@ function compute_distance( # untested
)::Float32
@views genes_log_fraction_of_first_metacell = genes_metacells_log_fraction[:, first_metacell] # NOJET
@views genes_log_fraction_of_second_metacell = genes_metacells_log_fraction[:, second_metacell]
fold_factors = abs.(genes_log_fraction_of_first_metacell .- genes_log_fraction_of_second_metacell)

@views genes_total_UMIs_of_first_metacell = genes_metacells_total_UMIs[:, first_metacell] # NOJET
@views genes_total_UMIs_of_second_metacell = genes_metacells_total_UMIs[:, second_metacell]
significant_fold_factors_mask =
(genes_total_UMIs_of_first_metacell .+ genes_total_UMIs_of_second_metacell) .>= min_significant_gene_UMIs

@views distance = max(fold_factors[significant_fold_factors_mask])
return distance
return maximum(
abs.(
genes_log_fraction_of_first_metacell .- genes_log_fraction_of_second_metacell #
)[genes_total_UMIs_of_first_metacell .+ genes_total_UMIs_of_second_metacell .>= min_significant_gene_UMIs],
)
end

function merge_spheres( # untested
Expand All @@ -243,33 +243,4 @@ function merge_spheres( # untested
return nothing
end

function compute_spheres_data( # untested
gene_fraction_regularization::AbstractFloat,
metacells_of_spheres::AbstractVector{<:AbstractVector{<:Integer}},
genes_metacells_log_fraction::AbstractMatrix{F},
genes_metacells_total_UMIs::AbstractMatrix{T},
)::Tuple{Matrix{F}, Matrix{T}} where {F <: AbstractFloat, T <: Unsigned}
n_genes = size(genes_metacells_log_fraction, 1)
n_spheres = length(metacells_of_spheres)

genes_spheres_fraction = Matrix{F}(undef, (n_genes, n_spheres))
genes_spheres_total_UMIs = Matrix{T}(undef, (n_genes, n_spheres))

check_efficient_action("compute_spheres_data", Columns, "genes_spheres_fraction", genes_spheres_fraction)
check_efficient_action("compute_spheres_data", Columns, "genes_spheres_total_UMIs", genes_spheres_total_UMIs)

@threads for (sphere, metacells_of_sphere) in enumerate(metacells_of_spheres)
@views genes_sphere_metacells_total_UMIs = genes_metacells_total_UMIs[:, metacells_of_sphere]
@views sphere_total_UMIs_of_genes = genes_spheres_total_UMIs[:, sphere]
sphere_total_UMIs_of_genes .= sum(genes_sphere_metacells_total_UMIs; dims = 2)

@views genes_sphere_metacells_log_fraction = genes_metacells_log_fraction[:, metacells_of_sphere]
@views sphere_fraction_of_genes = genes_spheres_fraction[:, sphere]
sphere_fraction_of_genes .= # NOJET
2 .^ mean(genes_sphere_metacells_log_fraction; dims = 2) - gene_fraction_regularization
end

return (genes_spheres_fraction, genes_spheres_total_UMIs)
end

end # module

0 comments on commit 2e76969

Please sign in to comment.