Local Outlier Factor (#15)

Local Outlier Factor Implementation of Local Outlier Factor (LOF). Nearest neighbors can be defined using e.g. dimension reduced data, while the distances actually used for the LOF computations can be in the high dimensional space, i.e. before dimension reduction. Also supports projected data sets, where only neighbors in the base data set are considered. Unit tests are missing.
BioJulia · Dec 15, 2023 · 5e23d92 · 5e23d92 · rasmushenningsson · Dec 15, 2023
1 parent 9de4ecd
commit 5e23d92
Show file tree

Hide file tree

Showing 7 changed files with 313 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.6] - 2023-12-15
+
+### Added
+
+* `local_outlier_factor!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Supports finding neighbors in a low dimensional space (e.g. after PCA or UMAP), but computing distances in a high dimensional space (e.g. after normalization).
+* `local_outlier_factor_projection!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Only points in the `base` data set are considered as neighbors.
+
+### Fixed
+
+* `knn_adjacency_matrix` - kwarg `make_symmetric` must now be specified by the caller.
+
+
 ## [0.3.5] - 2023-12-12
 
 ### Added

diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "SingleCellProjections"
 uuid = "03d38035-ed2f-4a36-82eb-797f1727ab2e"
 authors = ["Rasmus Henningsson <[email protected]>"]
-version = "0.3.5"
+version = "0.3.6"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"

diff --git a/src/SingleCellProjections.jl b/src/SingleCellProjections.jl
@@ -40,6 +40,8 @@ export
 	var_to_obs_table,
 	var_counts_fraction!,
 	pseudobulk,
+	local_outlier_factor!,
+	local_outlier_factor_projection!,
 	ftest!,
 	ftest,
 	ftest_table,
@@ -89,6 +91,13 @@ include("sctransformsparse.jl")
 
 include("implicitsvd.jl")
 
+include("lowrank.jl")
+include("projectionmodels.jl")
+include("datamatrix.jl")
+include("subset_expression.jl")
+
+include("adjacency_matrices.jl")
+
 include("barnes_hut.jl")
 include("force_layout.jl")
 include("embed.jl")
@@ -97,11 +106,6 @@ include("h5ad.jl")
 
 include("mannwhitney.jl")
 
-include("lowrank.jl")
-include("projectionmodels.jl")
-include("datamatrix.jl")
-include("subset_expression.jl")
-
 include("filter.jl")
 include("load.jl")
 include("transform.jl")
@@ -113,6 +117,8 @@ include("statistical_tests.jl")
 include("counts_fraction.jl")
 include("pseudobulk.jl")
 
+include("local_outlier_factor.jl")
+
 include("precompile.jl")
 
 @static if !isdefined(Base, :get_extension)

diff --git a/src/adjacency_matrices.jl b/src/adjacency_matrices.jl
@@ -0,0 +1,114 @@
+function knn_adjacency_matrix(X; k, make_symmetric)
+	N = size(X,2)
+	k = min(k,N-1)
+	tree = KDTree(X)
+	indices,_ = knn(tree, X, k+1)
+
+	I = zeros(Int, k*N)
+	J = zeros(Int, k*N)
+	destInd = 1
+	for (j,ind) in enumerate(indices)
+		skip = 0
+		for a in 1:k
+			ind[a]==j && (skip=1)
+			I[destInd] = ind[a+skip]
+			J[destInd] = j
+			destInd += 1
+		end
+	end
+	adj = sparse(I,J,true,N,N)
+	if make_symmetric
+		adj = adj.|adj'
+	end
+	adj
+end
+
+function knn_adjacency_matrix(X, Y; k)
+	# TODO: Avoid some code duplication by merging this with other functions? (knn_adjacency_matrix and embed_points)
+	N1 = size(X,2)
+	N2 = size(Y,2)
+	@assert size(X,1) == size(Y,1)
+	k = min(N1,k) # cannot have more neighbors than points!
+
+	tree = KDTree(X) # Choose BallTree if dimension is higher?
+	indices,dists = knn(tree, Y, k) # TODO: use threads?
+
+	# N1xN2 sparse adjacency matrix between two matrices
+	I = reduce(vcat,indices)
+	J = repeat(1:N2; inner=k)
+
+	sparse(I,J,true,N1,N2)
+end
+
+
+function knn_adjacency_matrix(data::DataMatrix; kwargs...)
+	adj = knn_adjacency_matrix(obs_coordinates(data.matrix); kwargs...)
+	obs = copy(data.obs)
+	DataMatrix(adj, obs, obs; var_id_cols=data.obs_id_cols, data.obs_id_cols)
+end
+
+function knn_adjacency_matrix(X::DataMatrix, Y::DataMatrix; kwargs...)
+	adj = knn_adjacency_matrix(obs_coordinates(X.matrix), obs_coordinates(Y.matrix); kwargs...)
+	DataMatrix(adj, copy(X.obs), copy(Y.obs); var_id_cols=X.obs_id_cols, Y.obs_id_cols)
+end
+
+
+
+
+
+"""
+	adjacency_distances(adj, X, Y=X)
+
+For each structural non-zero in `adj`, compute the Euclidean distance between the point in
+the DataMatrix `Y` and the point in the DataMatrix `X`.
+
+Can be useful when `adj` is created using e.g. a lower dimensional representation and we
+want to know the distances in the original, high dimensional space.
+
+At the moment all points in `Y` are required to have the same number of neighbors in `X`,
+for computation reasons.
+"""
+function adjacency_distances(adj::DataMatrix, X::DataMatrix, Y::DataMatrix=X)
+	table_cols_equal(adj.var, X.obs; cols=X.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.")
+	table_cols_equal(adj.obs, Y.obs; cols=Y.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.")
+	D = _adjacency_distances(adj.matrix, X, Y)
+	DataMatrix(D, copy(adj.var), copy(adj.obs); adj.var_id_cols, adj.obs_id_cols)
+end
+
+
+function _adjacency_distances(adj, X::DataMatrix, Y::DataMatrix=X)
+	N1,N2 = size(adj)
+	@assert size(X,2)==N1
+	@assert size(Y,2)==N2
+
+	nnbors = vec(sum(!iszero, adj; dims=1))
+	@assert all(isequal(nnbors[1]), nnbors) "All points in Y must have the same number of neighbors in X"
+	k = nnbors[1]
+	@assert k>0
+
+	DX2 = compute(DiagGram(X.matrix))
+	DY2 = compute(DiagGram(Y.matrix))
+
+	I,J,_ = findnz(adj)
+	dists = zeros(length(I)) # output
+
+	for i in 1:k
+		Is = I[i:k:end]
+		# Js = J[i:k:end] # guaranteed to be 1:N2
+
+		# Xs = X[:,Is] # Doesn't work since DataMatrix doesn't allow duplicate IDs
+		# Temporary workaround - TODO: fix proper interface?
+		Xs = DataMatrix(_subsetmatrix(X.matrix,:,Is), X.var, DataFrame(id=1:length(Is)); var_id_cols=X.var_id_cols)
+
+
+		# Ys = Y[:,Js] # guaranteed to be equal to Y
+
+		DXYs = compute(Diag(matrixproduct(Xs.matrix',Y.matrix)))
+
+		DX2s = DX2[Is]
+
+		dists[i:k:end] .= sqrt.(max.(0.0, DX2s .+ DY2 .- 2DXYs))
+	end
+
+	sparse(I,J,dists,N1,N2)
+end
diff --git a/src/force_layout.jl b/src/force_layout.jl
@@ -1,31 +1,3 @@
-function knn_adjacency_matrix(X; k, make_symmetric=true)
-    N = size(X,2)
-    k = min(k,N-1)
-    tree = KDTree(X)
-    indices,_ = knn(tree, X, k+1)
-
-    I = zeros(Int, k*N)
-    J = zeros(Int, k*N)
-    destInd = 1
-    for (j,ind) in enumerate(indices)
-        skip = 0
-        for a in 1:k
-            ind[a]==j && (skip=1)
-            I[destInd] = ind[a+skip]
-            J[destInd] = j
-            destInd += 1
-        end
-    end
-    adj = sparse(I,J,true,N,N)
-    if make_symmetric
-        adj = adj.|adj'
-    end
-    adj
-end
-
-
-
-
 _randinit(::Val{ndim}, rng, N::Int, scale) where ndim = scale.*randn(rng, SVector{ndim,Float64}, N)
 
 

diff --git a/src/local_outlier_factor.jl b/src/local_outlier_factor.jl
@@ -0,0 +1,174 @@
+function local_reachability_density(dists, kdists)
+	N1,N2 = size(dists)
+	@assert length(kdists) == N1
+	lrd = zeros(N2)
+
+	R = rowvals(dists)
+	V = nonzeros(dists)
+	for j=1:N2
+		rd_sum = 0.0
+		for k in nzrange(dists,j)
+			i = R[k]
+			d = V[k]
+			rd = max(kdists[i], d) # reachability distance from point j to point i
+			rd_sum += rd
+		end
+		lrd[j] = length(nzrange(dists,j))/rd_sum
+	end
+
+	lrd
+end
+
+
+# adj can be either the distance matrix or the adjacency matrix - since only the sparsity pattern is considered
+function _local_outlier_factor(adj::AbstractSparseMatrix, lrdX::AbstractVector, lrdY::AbstractVector=lrdX)
+	N1,N2 = size(adj)
+	@assert length(lrdX) == N1
+	@assert length(lrdY) == N2
+
+	lof = zeros(N2)
+
+	R = rowvals(adj)
+	for j=1:N2
+		lrdX_sum = 0.0
+		for k in nzrange(adj,j)
+			i = R[k]
+			lrdX_sum += lrdX[i]
+		end
+		lof[j] = lrdX_sum/(length(nzrange(adj,j)) * lrdY[j])
+	end
+
+	lof
+end
+
+
+
+
+
+"""
+	local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF")
+
+Compute the Local Outlier Factor (LOF) for each observation in `data`, and add as column to
+`data.obs` with the name `col`.
+
+When working with projected `DataMatrices`, use [`local_outlier_factor_projection!`](@ref)
+instead.
+
+NB: This function might be very slow for high values of `k`.
+
+First, the `k` nearest neighbors are found for each observation in `data`.
+Then, the Local Outlier Factor is computed by considering the distance between the
+neighbors, but this time in the `full` DataMatrix. Thus `full` must have the same
+observations as are present in `data`.
+
+A LOF value smaller than or close to one is means that the observation is an inlier, but a
+LOF value much larger than one means that the observation is an inlier.
+
+By specifiying `full=data`, this is coincides with the standard definition for Local Outlier
+Factor.
+However, it is perhaps more useful to find neighbors in a dimension reduced space (after
+e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space
+(typically after normalization).
+This way, an observation is concidered an outlier if the reduction to a lower dimensional
+space didn't properly represent the neighborhood of the observation.
+
+!!! note
+	The interface is not yet fully decided and subject to change.
+
+# Examples
+
+Compute the Local Outlier Factor, with nearest neighbors based only on `reduced`, but later
+using distances in `full` for the actual LOF computation.
+
+```julia
+julia> reduced = svd(normalized; nsv=10)
+
+julia> local_outlier_factor!(reduced, normalized; k=10)
+```
+
+See also: [`local_outlier_factor_projection!`](@ref)
+"""
+function local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF")
+	adj = knn_adjacency_matrix(data; k, make_symmetric=false)
+	dists = adjacency_distances(adj, full)
+	kdists = vec(maximum(dists.matrix; dims=1))
+	lrd = local_reachability_density(dists.matrix, kdists);
+	data.obs[:,col] = _local_outlier_factor(dists.matrix, lrd)
+	data
+end
+
+"""
+	local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix, base::DataMatrix, base_full::DataMatrix; k=10, col="LOF_projection")
+
+Compute the Local Outlier Factor (LOF) for each observation in `data`, and add as column to
+`data.obs` with the name `col`.
+
+Use `local_outlier_factor_projection!` if you are working with projected data, and
+[`local_outlier_factor!`](@ref) otherwise.
+
+Parameters:
+
+* `data` - A `DataMatrix` for which we compute LOF for each observation. Expected to be a `DataMatrix` projected onto `base`, so that the `data` and `base` use the same coordinate system.
+* `full` - A `DataMatrix` with the same observations as `data`, used to compute distances in the LOF computation. Expected to be a `DataMatrix` projected onto `base_full`, so that the `full` and `base_full` use the same coordinate system.
+* `base` - The base `DataMatrix`.
+* `base_full` - The base `DataMatrix`.
+* `k` - The number of nearest neighbors to use. NB: This function might be very slow for high values of `k`.
+
+First, for each observation in `data`, the `k` nearest neighbors in `base` are found.
+Then, the distance to each neighbor is computed using `full` and `base_full`.
+Thus `full` must have the same observations as are present in `data`, and `base_full` must
+have the same observations as `base`.
+
+A LOF value smaller than or close to one is means that the observation is an inlier, but a
+LOF value much larger than one means that the observation is an inlier.
+
+By specifiying `full=data` and `base_full=base`, this is coincides with the standard
+definition for Local Outlier Factor.
+However, it is perhaps more useful to find neighbors in a dimension reduced space (after
+e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space
+(typically after normalization).
+This way, an observation is concidered an outlier if the reduction to a lower dimensional
+space didn't properly represent the neighborhood of the observation.
+
+!!! note
+	The interface is not yet fully decided and subject to change.
+
+
+# Examples
+
+Compute the Local Outlier Factor (LOF) for each observation in a data set `reduced`, which
+has been projected onto `base_reduced`.
+
+The nearest neighbors are computed between observations in `reduced` and `base_reduced`, but
+the distances in the actual LOF computation are between the same observations in `normalized` and `base_normalized`.
+
+```julia
+julia> base_reduced = svd(base_normalized; nsv=10)
+
+julia> normalized = project(counts, base_normalized);
+
+julia> reduced = project(normalized, base_reduced);
+
+julia> local_outlier_factor!(reduced, normalized, base_reduced, base_normalized; k=10)
+```
+
+See also: [`local_outlier_factor!`](@ref)
+"""
+function local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix,
+                                          base::DataMatrix, base_full::DataMatrix;
+                                          k=10,
+                                          col="LOF_projection")
+	adj_X = knn_adjacency_matrix(base; k, make_symmetric=false)
+	adj_XY = knn_adjacency_matrix(base, data; k)
+
+	dists_X = adjacency_distances(adj_X, base_full)
+	dists_XY = adjacency_distances(adj_XY, base_full, full)
+
+	kdists = vec(maximum(dists_X.matrix; dims=1))
+
+	lrd_X = local_reachability_density(dists_X.matrix, kdists);
+	lrd_XY = local_reachability_density(dists_XY.matrix, kdists);
+
+	data.obs[:,col] = _local_outlier_factor(dists_XY.matrix, lrd_X, lrd_XY)
+	data
+end