Skip to content

Commit

Permalink
Better support for sparse data.
Browse files Browse the repository at this point in the history
  • Loading branch information
orenbenkiki committed Mar 29, 2024
1 parent 2f1433c commit b4a6e9d
Show file tree
Hide file tree
Showing 25 changed files with 383 additions and 176 deletions.
2 changes: 1 addition & 1 deletion docs/v0.1.0/.documenter-siteinfo.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"documenter":{"julia_version":"1.10.2","generation_timestamp":"2024-03-28T10:11:10","documenter_version":"1.3.0"}}
{"documenter":{"julia_version":"1.10.2","generation_timestamp":"2024-03-29T14:38:37","documenter_version":"1.3.0"}}
10 changes: 10 additions & 0 deletions docs/v0.1.0/copies.html
Original file line number Diff line number Diff line change
Expand Up @@ -464,6 +464,16 @@ <h1 id="Copies">
<code>empty
</code> must be specified to fill the missing values. If a source axis contains entries that do not exist in the target, they are discarded (not copied).
</p>
<div class="admonition is-info">
<header class="admonition-header">Note
</header>
<div class="admonition-body">
<p>When copying a matrix from a subset to a superset, if the
<code>empty
</code> value is zero, then we create a sparse matrix in the destination. However, currently we create a temporary dense matrix for this; this is inefficient and should be replaced by a more efficient method.
</p>
</div>
</div>
</div>
</section>
</article>
Expand Down
12 changes: 10 additions & 2 deletions docs/v0.1.0/data.html
Original file line number Diff line number Diff line change
Expand Up @@ -1555,7 +1555,7 @@ <h3 id="Creating-properties">
name::AbstractString,
eltype::Type{T},
nnz::StorageInteger,
indtype::Type{I};
indtype::Maybe{Type{I}} = nothing;
[overwrite::Bool = false]
)::Any where {T &lt;: StorageNumber, I &lt;: StorageInteger}
</code>
Expand All @@ -1574,6 +1574,10 @@ <h3 id="Creating-properties">
<code>fill
</code>, and return the result.
</p>
<p>If
<code>indtype
</code> is not specified, it is chosen automatically to be the smallest unsigned integer type needed for the vector.
</p>
<p>The returned vector will be uninitialized; the caller is expected to
<code>fill
</code> its
Expand Down Expand Up @@ -1715,7 +1719,7 @@ <h3 id="Creating-properties">
name::AbstractString,
eltype::Type{T},
nnz::StorageInteger,
intdype::Type{I};
indtype::Maybe{Type{I}} = nothing;
[overwrite::Bool = false]
)::Any where {T &lt;: StorageNumber, I &lt;: StorageInteger}
</code>
Expand All @@ -1738,6 +1742,10 @@ <h3 id="Creating-properties">
<code>fill
</code>, and return the result.
</p>
<p>If
<code>indtype
</code> is not specified, it is chosen automatically to be the smallest unsigned integer type needed for the matrix.
</p>
<p>The returned matrix will be uninitialized; the caller is expected to
<code>fill
</code> its
Expand Down
12 changes: 12 additions & 0 deletions docs/v0.1.0/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -1914,6 +1914,18 @@ <h1 id="Index">
</a>
</li>
<li>
<a href="storage_types.html#Daf.StorageTypes.sparse_matrix_csc">
<code>Daf.StorageTypes.sparse_matrix_csc
</code>
</a>
</li>
<li>
<a href="storage_types.html#Daf.StorageTypes.sparse_vector">
<code>Daf.StorageTypes.sparse_vector
</code>
</a>
</li>
<li>
<a href="tokens.html#Daf.Tokens.decode_expression">
<code>Daf.Tokens.decode_expression
</code>
Expand Down
Binary file modified docs/v0.1.0/objects.inv
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/v0.1.0/search_index.js

Large diffs are not rendered by default.

82 changes: 82 additions & 0 deletions docs/v0.1.0/storage_types.html
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,18 @@
</span>
</a>
</li>
<li class="toplevel">
<a class="tocitem" href="#Constructors">
<span>Constructors
</span>
</a>
</li>
<li class="toplevel">
<a class="tocitem" href="#Storable-types-2">
<span>Storable types
</span>
</a>
</li>
<li>
<a class="tocitem" href="#Index">
<span>Index
Expand Down Expand Up @@ -535,6 +547,64 @@ <h1 id="Storable-types">
</div>
</section>
</article>
<h1 id="Constructors">
<a class="docs-heading-anchor" href="#Constructors">Constructors
</a>
<a id="Constructors-1">
</a>
<a class="docs-heading-anchor-permalink" href="#Constructors" title="Permalink">
</a>
</h1>
<h1 id="Storable-types-2">
<a class="docs-heading-anchor" href="#Storable-types-2">Storable types
</a>
<a class="docs-heading-anchor-permalink" href="#Storable-types-2" title="Permalink">
</a>
</h1>
<article class="docstring">
<header>
<a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring">
</a>
<a class="docstring-binding" id="Daf.StorageTypes.sparse_vector" href="#Daf.StorageTypes.sparse_vector">
<code>Daf.StorageTypes.sparse_vector
</code>
</a>
<span class="docstring-category">Function
</span>
</header>
<section>
<div>
<pre>
<code class="language-julia hljs">function sparse_vector(dense::StorageVector)::SparseVector
</code>
</pre>
<p>Create a sparse vector using the smallest unsigned integer type needed for this size of vector.
</p>
</div>
</section>
</article>
<article class="docstring">
<header>
<a class="docstring-article-toggle-button fa-solid fa-chevron-down" href="javascript:;" title="Collapse docstring">
</a>
<a class="docstring-binding" id="Daf.StorageTypes.sparse_matrix_csc" href="#Daf.StorageTypes.sparse_matrix_csc">
<code>Daf.StorageTypes.sparse_matrix_csc
</code>
</a>
<span class="docstring-category">Function
</span>
</header>
<section>
<div>
<pre>
<code class="language-julia hljs">function sparse_matrix_csc(dense::StorageMatrix)::SparseMatrixCSC
</code>
</pre>
<p>Create a sparse matrix using the smallest unsigned integer type needed for this size of matrix.
</p>
</div>
</section>
</article>
<h2 id="Index">
<a class="docs-heading-anchor" href="#Index">Index
</a>
Expand Down Expand Up @@ -604,6 +674,18 @@ <h2 id="Index">
</code>
</a>
</li>
<li>
<a href="storage_types.html#Daf.StorageTypes.sparse_matrix_csc">
<code>Daf.StorageTypes.sparse_matrix_csc
</code>
</a>
</li>
<li>
<a href="storage_types.html#Daf.StorageTypes.sparse_vector">
<code>Daf.StorageTypes.sparse_vector
</code>
</a>
</li>
</ul>
</article>
<nav class="docs-footer">
Expand Down
44 changes: 33 additions & 11 deletions src/anndata_format.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,11 @@ using Daf.Formats
using Daf.Generic
using Daf.MatrixLayouts
using Daf.MemoryFormat
using Daf.MemoryFormat
using Daf.StorageTypes
using DataFrames
using HDF5
using Muon
using SparseArrays

import Daf.Data.require_matrix
import Daf.Formats
Expand Down Expand Up @@ -212,8 +212,15 @@ function verify_is_supported_type(
property::AbstractString,
unsupported_handler::AbnormalHandler,
)::Nothing
if value isa StorageMatrix && !(value isa Muon.TransposedDataset) && major_axis(value) == nothing
report_unsupported(name, unsupported_handler, "type not in row/column-major layout: $(typeof(value))\n") # untested
if value isa StorageMatrix &&
major_axis(value) == nothing &&
!(value isa Muon.TransposedDataset) &&
!(value isa Muon.SparseDataset)
report_unsupported( # untested
name,
unsupported_handler,
"type not in row/column-major layout: $(typeof(value))\n" * "of the property: $(property)\n",
)
end
if value isa CategoricalArray
return nothing # untested
Expand All @@ -222,7 +229,9 @@ function verify_is_supported_type(
report_unsupported(
name,
unsupported_handler,
"unsupported type for $(property): $(typeof(value))\nsupported type is: $(supported_type)\n",
"unsupported type: $(typeof(value))\n" *
"of the property: $(property)\n" *
"supported type is: $(supported_type)\n",
)
end
return nothing
Expand Down Expand Up @@ -287,13 +296,10 @@ function copy_supported_vectors(frame::DataFrame, memory::MemoryDaf, axis::Abstr
for column in names(frame)
vector = frame[!, column]
if vector isa CategoricalVector
vector = [ # untested
if value === missing
""
else
string(value)
end for value in vector
]
vector = [value === missing ? "" : string(value) for value in vector] # untested
end
if vector isa BitVector
vector = Vector{Bool}(vector) # untested
end
if vector isa StorageVector
set_vector!(memory, axis, column, vector)
Expand Down Expand Up @@ -330,6 +336,22 @@ function copy_supported_matrix( # untested
return nothing
end

# Copy a `Muon.SparseDataset` into `memory` by delegating to the other
# `copy_supported_matrix` methods.
#
# The dataset is first materialized with `read`. When the dataset's `csr` flag
# is set, the transpose of the loaded matrix is forwarded and the two axis
# names are swapped to match; otherwise the loaded matrix is forwarded as-is
# with the axes in the given order.
function copy_supported_matrix( # untested
    matrix::Muon.SparseDataset,
    memory::MemoryDaf,
    rows_axis::AbstractString,
    columns_axis::AbstractString,
    name::AbstractString,
)::Nothing
    loaded = read(matrix)

    if !matrix.csr
        copy_supported_matrix(loaded, memory, rows_axis, columns_axis, name)
        return nothing
    end

    # Transposing swaps the roles of the rows and columns, so the axis names
    # are passed in the opposite order.
    copy_supported_matrix(transpose(loaded), memory, columns_axis, rows_axis, name)
    return nothing
end

function copy_supported_matrix(
matrix::Muon.TransposedDataset,
memory::MemoryDaf,
Expand Down
31 changes: 5 additions & 26 deletions src/concat.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ using SparseArrays
import Daf.Data.require_axis
import Daf.Data.require_no_axis
import Daf.Data.require_no_matrix
import Daf.StorageTypes.indtype_for_size

"""
A vector of pairs where the key is a [`DataKey`](@ref) and the value is [`MergeAction`](@ref). Similarly to
Expand Down Expand Up @@ -577,17 +578,8 @@ function concatenate_axis_sparse_vectors(
overwrite::Bool,
)::Nothing
nnz_offsets, nnz_sizes, total_nnz = nnz_arrays(vectors)
indtype = indtype_for_size(concatenated_axis_size)

empty_sparse_vector!(
destination,
axis,
vector_property,
dtype,
total_nnz,
indtype;
overwrite = overwrite,
) do nzind, nzval
empty_sparse_vector!(destination, axis, vector_property, dtype, total_nnz; overwrite = overwrite) do nzind, nzval
@threads for index in 1:length(vectors)
offset = offsets[index]
nnz_offset = nnz_offsets[index]
Expand Down Expand Up @@ -700,16 +692,14 @@ function concatenate_axis_sparse_matrices(
overwrite::Bool,
)::Nothing
nnz_offsets, nnz_sizes, total_nnz = nnz_arrays(matrices)
indtype = indtype_for_size(concatenated_axis_size)

empty_sparse_matrix!(
destination,
other_axis,
axis,
matrix_property,
dtype,
total_nnz,
indtype;
total_nnz;
overwrite = overwrite,
) do colptr, rowval, nzval
@threads for index in 1:length(matrices)
Expand Down Expand Up @@ -1008,16 +998,14 @@ function concatenate_merge_sparse_vector(
overwrite::Bool,
)::Nothing
nnz_offsets, nnz_sizes, total_nnz = nnz_arrays(vectors)
indtype = indtype_for_size(nrows * length(vectors))

empty_sparse_matrix!(
destination,
axis,
dataset_axis,
vector_property,
dtype,
total_nnz,
indtype;
total_nnz;
overwrite = overwrite,
) do colptr, rowval, nzval
colptr[1] == 1
Expand Down Expand Up @@ -1285,7 +1273,7 @@ function sparsify_vectors(
@assert length(vector) == size
vector = vector.array
if !(vector isa SparseVector)
vector = SparseVector(vector)
vector = sparse_vector(vector)
end
sparse_vectors[index] = vector
end
Expand Down Expand Up @@ -1339,13 +1327,4 @@ function dtype_for_size(size::Integer, types::NTuple{N, Type})::Type where {N}
return types[end]
end

# Pick an index type for a container of the given `size`.
#
# Scans every entry of `UNSIGNED_TYPES` except the last, in order, and returns
# the first one whose `typemax` is at least `size`. If none qualifies, the last
# entry is returned unconditionally.
# NOTE(review): presumably `UNSIGNED_TYPES` is ordered from narrowest to widest,
# which would make the result the smallest sufficient type - confirm at its
# definition.
function indtype_for_size(size::Integer)::Type
    for position in 1:(length(UNSIGNED_TYPES) - 1)
        candidate = UNSIGNED_TYPES[position]
        if typemax(candidate) >= size
            return candidate
        end
    end
    return UNSIGNED_TYPES[end] # untested
end

end # module
Loading

0 comments on commit b4a6e9d

Please sign in to comment.