diff --git a/CHANGELOG.md b/CHANGELOG.md index b8b9ef4..b2f783a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.6] - 2023-12-15 + +### Added + +* `local_outlier_factor!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Supports finding neighbors in a low dimensional space (e.g. after PCA or UMAP), but computing distances in a high dimensional space (e.g. after normalization). +* `local_outlier_factor_projection!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Only points in the `base` data set are considered as neighbors. + +### Fixed + +* `knn_adjacency_matrix` - kwarg `make_symmetric` must now be specified by the caller. + + ## [0.3.5] - 2023-12-12 ### Added diff --git a/Project.toml b/Project.toml index e800d81..3cfbd12 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "SingleCellProjections" uuid = "03d38035-ed2f-4a36-82eb-797f1727ab2e" authors = ["Rasmus Henningsson "] -version = "0.3.5" +version = "0.3.6" [deps] AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" diff --git a/src/SingleCellProjections.jl b/src/SingleCellProjections.jl index 1e19dc8..96aac5e 100644 --- a/src/SingleCellProjections.jl +++ b/src/SingleCellProjections.jl @@ -40,6 +40,8 @@ export var_to_obs_table, var_counts_fraction!, pseudobulk, + local_outlier_factor!, + local_outlier_factor_projection!, ftest!, ftest, ftest_table, @@ -89,6 +91,13 @@ include("sctransformsparse.jl") include("implicitsvd.jl") +include("lowrank.jl") +include("projectionmodels.jl") +include("datamatrix.jl") +include("subset_expression.jl") + +include("adjacency_matrices.jl") + include("barnes_hut.jl") include("force_layout.jl") include("embed.jl") @@ -97,11 +106,6 @@ include("h5ad.jl") include("mannwhitney.jl") -include("lowrank.jl") -include("projectionmodels.jl") -include("datamatrix.jl") -include("subset_expression.jl") - include("filter.jl") include("load.jl") include("transform.jl") @@ -113,6 +117,8 @@ include("statistical_tests.jl") include("counts_fraction.jl") include("pseudobulk.jl") +include("local_outlier_factor.jl") + include("precompile.jl") @static if !isdefined(Base, :get_extension) diff --git a/src/adjacency_matrices.jl b/src/adjacency_matrices.jl new file mode 100644 index 0000000..11956f2 --- /dev/null +++ b/src/adjacency_matrices.jl @@ -0,0 +1,114 @@ +function knn_adjacency_matrix(X; k, make_symmetric) + N = size(X,2) + k = min(k,N-1) + tree = KDTree(X) + indices,_ = knn(tree, X, k+1) + + I = zeros(Int, k*N) + J = zeros(Int, k*N) + destInd = 1 + for (j,ind) in enumerate(indices) + skip = 0 + for a in 1:k + ind[a]==j && (skip=1) + I[destInd] = ind[a+skip] + J[destInd] = j + destInd += 1 + end + end + adj = sparse(I,J,true,N,N) + if make_symmetric + adj = adj.|adj' + end + adj +end + +function knn_adjacency_matrix(X, Y; k) + # TODO: Avoid some code duplication by merging this with other functions? (knn_adjacency_matrix and embed_points) + N1 = size(X,2) + N2 = size(Y,2) + @assert size(X,1) == size(Y,1) + k = min(N1,k) # cannot have more neighbors than points! + + tree = KDTree(X) # Choose BallTree if dimension is higher? + indices,dists = knn(tree, Y, k) # TODO: use threads? + + # N1xN2 sparse adjacency matrix between two matrices + I = reduce(vcat,indices) + J = repeat(1:N2; inner=k) + + sparse(I,J,true,N1,N2) +end + + +function knn_adjacency_matrix(data::DataMatrix; kwargs...) + adj = knn_adjacency_matrix(obs_coordinates(data.matrix); kwargs...) + obs = copy(data.obs) + DataMatrix(adj, obs, obs; var_id_cols=data.obs_id_cols, data.obs_id_cols) +end + +function knn_adjacency_matrix(X::DataMatrix, Y::DataMatrix; kwargs...) + adj = knn_adjacency_matrix(obs_coordinates(X.matrix), obs_coordinates(Y.matrix); kwargs...) + DataMatrix(adj, copy(X.obs), copy(Y.obs); var_id_cols=X.obs_id_cols, Y.obs_id_cols) +end + + + + + +""" + adjacency_distances(adj, X, Y=X) + +For each structural non-zero in `adj`, compute the Euclidean distance between the point in +the DataMatrix `Y` and the point in the DataMatrix `X`. + +Can be useful when `adj` is created using e.g. a lower dimensional representation and we +want to know the distances in the original, high dimensional space. + +At the moment all points in `Y` are required to have the same number of neighbors in `X`, +for computation reasons. +""" +function adjacency_distances(adj::DataMatrix, X::DataMatrix, Y::DataMatrix=X) + table_cols_equal(adj.var, X.obs; cols=X.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.") + table_cols_equal(adj.obs, Y.obs; cols=Y.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.") + D = _adjacency_distances(adj.matrix, X, Y) + DataMatrix(D, copy(adj.var), copy(adj.obs); adj.var_id_cols, adj.obs_id_cols) +end + + +function _adjacency_distances(adj, X::DataMatrix, Y::DataMatrix=X) + N1,N2 = size(adj) + @assert size(X,2)==N1 + @assert size(Y,2)==N2 + + nnbors = vec(sum(!iszero, adj; dims=1)) + @assert all(isequal(nnbors[1]), nnbors) "All points in Y must have the same number of neighbors in X" + k = nnbors[1] + @assert k>0 + + DX2 = compute(DiagGram(X.matrix)) + DY2 = compute(DiagGram(Y.matrix)) + + I,J,_ = findnz(adj) + dists = zeros(length(I)) # output + + for i in 1:k + Is = I[i:k:end] + # Js = J[i:k:end] # guaranteed to be 1:N2 + + # Xs = X[:,Is] # Doesn't work since DataMatrix doesn't allow duplicate IDs + # Temporary workaround - TODO: fix proper interface? + Xs = DataMatrix(_subsetmatrix(X.matrix,:,Is), X.var, DataFrame(id=1:length(Is)); var_id_cols=X.var_id_cols) + + + # Ys = Y[:,Js] # guaranteed to be equal to Y + + DXYs = compute(Diag(matrixproduct(Xs.matrix',Y.matrix))) + + DX2s = DX2[Is] + + dists[i:k:end] .= sqrt.(max.(0.0, DX2s .+ DY2 .- 2DXYs)) + end + + sparse(I,J,dists,N1,N2) +end diff --git a/src/force_layout.jl b/src/force_layout.jl index 473a890..b26a871 100644 --- a/src/force_layout.jl +++ b/src/force_layout.jl @@ -1,31 +1,3 @@ -function knn_adjacency_matrix(X; k, make_symmetric=true) - N = size(X,2) - k = min(k,N-1) - tree = KDTree(X) - indices,_ = knn(tree, X, k+1) - - I = zeros(Int, k*N) - J = zeros(Int, k*N) - destInd = 1 - for (j,ind) in enumerate(indices) - skip = 0 - for a in 1:k - ind[a]==j && (skip=1) - I[destInd] = ind[a+skip] - J[destInd] = j - destInd += 1 - end - end - adj = sparse(I,J,true,N,N) - if make_symmetric - adj = adj.|adj' - end - adj -end - - - - _randinit(::Val{ndim}, rng, N::Int, scale) where ndim = scale.*randn(rng, SVector{ndim,Float64}, N) diff --git a/src/local_outlier_factor.jl b/src/local_outlier_factor.jl new file mode 100644 index 0000000..4e6a8de --- /dev/null +++ b/src/local_outlier_factor.jl @@ -0,0 +1,174 @@ +function local_reachability_density(dists, kdists) + N1,N2 = size(dists) + @assert length(kdists) == N1 + lrd = zeros(N2) + + R = rowvals(dists) + V = nonzeros(dists) + for j=1:N2 + rd_sum = 0.0 + for k in nzrange(dists,j) + i = R[k] + d = V[k] + rd = max(kdists[i], d) # reachability distance from point j to point i + rd_sum += rd + end + lrd[j] = length(nzrange(dists,j))/rd_sum + end + + lrd +end + + +# adj can be either the distance matrix or the adjacency matrix - since only the sparsity pattern is considered +function _local_outlier_factor(adj::AbstractSparseMatrix, lrdX::AbstractVector, lrdY::AbstractVector=lrdX) + N1,N2 = size(adj) + @assert length(lrdX) == N1 + @assert length(lrdY) == N2 + + lof = zeros(N2) + + R = rowvals(adj) + for j=1:N2 + lrdX_sum = 0.0 + for k in nzrange(adj,j) + i = R[k] + lrdX_sum += lrdX[i] + end + lof[j] = lrdX_sum/(length(nzrange(adj,j)) * lrdY[j]) + end + + lof +end + + + + + +""" + local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF") + +Compute the Local Outlier Factor (LOF) for each observation in `data`, and add as column to +`data.obs` with the name `col`. + +When working with projected `DataMatrices`, use [`local_outlier_factor_projection!`](@ref) +instead. + +NB: This function might be very slow for high values of `k`. + +First, the `k` nearest neighbors are found for each observation in `data`. +Then, the Local Outlier Factor is computed by considering the distance between the +neighbors, but this time in the `full` DataMatrix. Thus `full` must have the same +observations as are present in `data`. + +A LOF value smaller than or close to one is means that the observation is an inlier, but a +LOF value much larger than one means that the observation is an inlier. + +By specifiying `full=data`, this is coincides with the standard definition for Local Outlier +Factor. +However, it is perhaps more useful to find neighbors in a dimension reduced space (after +e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space +(typically after normalization). +This way, an observation is concidered an outlier if the reduction to a lower dimensional +space didn't properly represent the neighborhood of the observation. + +!!! note + The interface is not yet fully decided and subject to change. + +# Examples + +Compute the Local Outlier Factor, with nearest neighbors based only on `reduced`, but later +using distances in `full` for the actual LOF computation. + +```julia +julia> reduced = svd(normalized; nsv=10) + +julia> local_outlier_factor!(reduced, normalized; k=10) +``` + +See also: [`local_outlier_factor_projection!`](@ref) +""" +function local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF") + adj = knn_adjacency_matrix(data; k, make_symmetric=false) + dists = adjacency_distances(adj, full) + kdists = vec(maximum(dists.matrix; dims=1)) + lrd = local_reachability_density(dists.matrix, kdists); + data.obs[:,col] = _local_outlier_factor(dists.matrix, lrd) + data +end + +""" + local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix, base::DataMatrix, base_full::DataMatrix; k=10, col="LOF_projection") + +Compute the Local Outlier Factor (LOF) for each observation in `data`, and add as column to +`data.obs` with the name `col`. + +Use `local_outlier_factor_projection!` if you are working with projected data, and +[`local_outlier_factor!`](@ref) otherwise. + +Parameters: + +* `data` - A `DataMatrix` for which we compute LOF for each observation. Expected to be a `DataMatrix` projected onto `base`, so that the `data` and `base` use the same coordinate system. +* `full` - A `DataMatrix` with the same observations as `data`, used to compute distances in the LOF computation. Expected to be a `DataMatrix` projected onto `base_full`, so that the `full` and `base_full` use the same coordinate system. +* `base` - The base `DataMatrix`. +* `base_full` - The base `DataMatrix`. +* `k` - The number of nearest neighbors to use. NB: This function might be very slow for high values of `k`. + +First, for each observation in `data`, the `k` nearest neighbors in `base` are found. +Then, the distance to each neighbor is computed using `full` and `base_full`. +Thus `full` must have the same observations as are present in `data`, and `base_full` must +have the same observations as `base`. + +A LOF value smaller than or close to one is means that the observation is an inlier, but a +LOF value much larger than one means that the observation is an inlier. + +By specifiying `full=data` and `base_full=base`, this is coincides with the standard +definition for Local Outlier Factor. +However, it is perhaps more useful to find neighbors in a dimension reduced space (after +e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space +(typically after normalization). +This way, an observation is concidered an outlier if the reduction to a lower dimensional +space didn't properly represent the neighborhood of the observation. + +!!! note + The interface is not yet fully decided and subject to change. + + +# Examples + +Compute the Local Outlier Factor (LOF) for each observation in a data set `reduced`, which +has been projected onto `base_reduced`. + +The nearest neighbors are computed between observations in `reduced` and `base_reduced`, but +the distances in the actual LOF computation are between the same observations in `normalized` and `base_normalized`. + +```julia +julia> base_reduced = svd(base_normalized; nsv=10) + +julia> normalized = project(counts, base_normalized); + +julia> reduced = project(normalized, base_reduced); + +julia> local_outlier_factor!(reduced, normalized, base_reduced, base_normalized; k=10) +``` + +See also: [`local_outlier_factor!`](@ref) +""" +function local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix, + base::DataMatrix, base_full::DataMatrix; + k=10, + col="LOF_projection") + adj_X = knn_adjacency_matrix(base; k, make_symmetric=false) + adj_XY = knn_adjacency_matrix(base, data; k) + + dists_X = adjacency_distances(adj_X, base_full) + dists_XY = adjacency_distances(adj_XY, base_full, full) + + kdists = vec(maximum(dists_X.matrix; dims=1)) + + lrd_X = local_reachability_density(dists_X.matrix, kdists); + lrd_XY = local_reachability_density(dists_XY.matrix, kdists); + + data.obs[:,col] = _local_outlier_factor(dists_XY.matrix, lrd_X, lrd_XY) + data +end \ No newline at end of file diff --git a/src/reduce.jl b/src/reduce.jl index 2bad886..1d10e0d 100644 --- a/src/reduce.jl +++ b/src/reduce.jl @@ -148,7 +148,7 @@ julia> force_layout(data; ndim=3, k=100) function force_layout(data::DataMatrix; ndim=3, k=nothing, adj=nothing, kprojection=10, obs=:copy, adj_out=nothing, kwargs...) @assert isnothing(k) != isnothing(adj) "Must specify exactly one of k and adj." if k !== nothing - adj = knn_adjacency_matrix(obs_coordinates(data); k) + adj = knn_adjacency_matrix(obs_coordinates(data); k, make_symmetric=true) end adj_out !== nothing && (adj_out[] = adj)