Skip to content

Commit

Permalink
Local Outlier Factor (#15)
Browse files Browse the repository at this point in the history
Local Outlier Factor

Implementation of Local Outlier Factor (LOF).
Nearest neighbors can be defined using e.g. dimension reduced data, while the distances actually used for the LOF computations can be in the high dimensional space, i.e. before dimension reduction.
Also supports projected data sets, where only neighbors in the base data set are considered.

Unit tests are missing.
  • Loading branch information
rasmushenningsson authored Dec 15, 2023
1 parent 9de4ecd commit 5e23d92
Show file tree
Hide file tree
Showing 7 changed files with 313 additions and 35 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.3.6] - 2023-12-15

### Added

* `local_outlier_factor!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Supports finding neighbors in a low dimensional space (e.g. after PCA or UMAP), but computing distances in a high dimensional space (e.g. after normalization).
* `local_outlier_factor_projection!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Only points in the `base` data set are considered as neighbors.

### Fixed

* `knn_adjacency_matrix` - kwarg `make_symmetric` must now be specified by the caller.


## [0.3.5] - 2023-12-12

### Added
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "SingleCellProjections"
uuid = "03d38035-ed2f-4a36-82eb-797f1727ab2e"
authors = ["Rasmus Henningsson <[email protected]>"]
version = "0.3.5"
version = "0.3.6"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
Expand Down
16 changes: 11 additions & 5 deletions src/SingleCellProjections.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ export
var_to_obs_table,
var_counts_fraction!,
pseudobulk,
local_outlier_factor!,
local_outlier_factor_projection!,
ftest!,
ftest,
ftest_table,
Expand Down Expand Up @@ -89,6 +91,13 @@ include("sctransformsparse.jl")

include("implicitsvd.jl")

include("lowrank.jl")
include("projectionmodels.jl")
include("datamatrix.jl")
include("subset_expression.jl")

include("adjacency_matrices.jl")

include("barnes_hut.jl")
include("force_layout.jl")
include("embed.jl")
Expand All @@ -97,11 +106,6 @@ include("h5ad.jl")

include("mannwhitney.jl")

include("lowrank.jl")
include("projectionmodels.jl")
include("datamatrix.jl")
include("subset_expression.jl")

include("filter.jl")
include("load.jl")
include("transform.jl")
Expand All @@ -113,6 +117,8 @@ include("statistical_tests.jl")
include("counts_fraction.jl")
include("pseudobulk.jl")

include("local_outlier_factor.jl")

include("precompile.jl")

@static if !isdefined(Base, :get_extension)
Expand Down
114 changes: 114 additions & 0 deletions src/adjacency_matrices.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
function knn_adjacency_matrix(X; k, make_symmetric)
N = size(X,2)
k = min(k,N-1)
tree = KDTree(X)
indices,_ = knn(tree, X, k+1)

I = zeros(Int, k*N)
J = zeros(Int, k*N)
destInd = 1
for (j,ind) in enumerate(indices)
skip = 0
for a in 1:k
ind[a]==j && (skip=1)
I[destInd] = ind[a+skip]
J[destInd] = j
destInd += 1
end
end
adj = sparse(I,J,true,N,N)
if make_symmetric
adj = adj.|adj'
end
adj
end

function knn_adjacency_matrix(X, Y; k)
# TODO: Avoid some code duplication by merging this with other functions? (knn_adjacency_matrix and embed_points)
N1 = size(X,2)
N2 = size(Y,2)
@assert size(X,1) == size(Y,1)
k = min(N1,k) # cannot have more neighbors than points!

tree = KDTree(X) # Choose BallTree if dimension is higher?
indices,dists = knn(tree, Y, k) # TODO: use threads?

# N1xN2 sparse adjacency matrix between two matrices
I = reduce(vcat,indices)
J = repeat(1:N2; inner=k)

sparse(I,J,true,N1,N2)
end


function knn_adjacency_matrix(data::DataMatrix; kwargs...)
adj = knn_adjacency_matrix(obs_coordinates(data.matrix); kwargs...)
obs = copy(data.obs)
DataMatrix(adj, obs, obs; var_id_cols=data.obs_id_cols, data.obs_id_cols)
end

function knn_adjacency_matrix(X::DataMatrix, Y::DataMatrix; kwargs...)
adj = knn_adjacency_matrix(obs_coordinates(X.matrix), obs_coordinates(Y.matrix); kwargs...)
DataMatrix(adj, copy(X.obs), copy(Y.obs); var_id_cols=X.obs_id_cols, Y.obs_id_cols)
end





"""
adjacency_distances(adj, X, Y=X)
For each structural non-zero in `adj`, compute the Euclidean distance between the point in
the DataMatrix `Y` and the point in the DataMatrix `X`.
Can be useful when `adj` is created using e.g. a lower dimensional representation and we
want to know the distances in the original, high dimensional space.
At the moment all points in `Y` are required to have the same number of neighbors in `X`,
for computation reasons.
"""
function adjacency_distances(adj::DataMatrix, X::DataMatrix, Y::DataMatrix=X)
table_cols_equal(adj.var, X.obs; cols=X.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.")
table_cols_equal(adj.obs, Y.obs; cols=Y.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.")
D = _adjacency_distances(adj.matrix, X, Y)
DataMatrix(D, copy(adj.var), copy(adj.obs); adj.var_id_cols, adj.obs_id_cols)
end


function _adjacency_distances(adj, X::DataMatrix, Y::DataMatrix=X)
N1,N2 = size(adj)
@assert size(X,2)==N1
@assert size(Y,2)==N2

nnbors = vec(sum(!iszero, adj; dims=1))
@assert all(isequal(nnbors[1]), nnbors) "All points in Y must have the same number of neighbors in X"
k = nnbors[1]
@assert k>0

DX2 = compute(DiagGram(X.matrix))
DY2 = compute(DiagGram(Y.matrix))

I,J,_ = findnz(adj)
dists = zeros(length(I)) # output

for i in 1:k
Is = I[i:k:end]
# Js = J[i:k:end] # guaranteed to be 1:N2

# Xs = X[:,Is] # Doesn't work since DataMatrix doesn't allow duplicate IDs
# Temporary workaround - TODO: fix proper interface?
Xs = DataMatrix(_subsetmatrix(X.matrix,:,Is), X.var, DataFrame(id=1:length(Is)); var_id_cols=X.var_id_cols)


# Ys = Y[:,Js] # guaranteed to be equal to Y

DXYs = compute(Diag(matrixproduct(Xs.matrix',Y.matrix)))

DX2s = DX2[Is]

dists[i:k:end] .= sqrt.(max.(0.0, DX2s .+ DY2 .- 2DXYs))
end

sparse(I,J,dists,N1,N2)
end
28 changes: 0 additions & 28 deletions src/force_layout.jl
Original file line number Diff line number Diff line change
@@ -1,31 +1,3 @@
function knn_adjacency_matrix(X; k, make_symmetric=true)
N = size(X,2)
k = min(k,N-1)
tree = KDTree(X)
indices,_ = knn(tree, X, k+1)

I = zeros(Int, k*N)
J = zeros(Int, k*N)
destInd = 1
for (j,ind) in enumerate(indices)
skip = 0
for a in 1:k
ind[a]==j && (skip=1)
I[destInd] = ind[a+skip]
J[destInd] = j
destInd += 1
end
end
adj = sparse(I,J,true,N,N)
if make_symmetric
adj = adj.|adj'
end
adj
end




_randinit(::Val{ndim}, rng, N::Int, scale) where ndim = scale.*randn(rng, SVector{ndim,Float64}, N)


Expand Down
174 changes: 174 additions & 0 deletions src/local_outlier_factor.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
function local_reachability_density(dists, kdists)
N1,N2 = size(dists)
@assert length(kdists) == N1
lrd = zeros(N2)

R = rowvals(dists)
V = nonzeros(dists)
for j=1:N2
rd_sum = 0.0
for k in nzrange(dists,j)
i = R[k]
d = V[k]
rd = max(kdists[i], d) # reachability distance from point j to point i
rd_sum += rd
end
lrd[j] = length(nzrange(dists,j))/rd_sum
end

lrd
end


# adj can be either the distance matrix or the adjacency matrix - since only the sparsity pattern is considered
function _local_outlier_factor(adj::AbstractSparseMatrix, lrdX::AbstractVector, lrdY::AbstractVector=lrdX)
N1,N2 = size(adj)
@assert length(lrdX) == N1
@assert length(lrdY) == N2

lof = zeros(N2)

R = rowvals(adj)
for j=1:N2
lrdX_sum = 0.0
for k in nzrange(adj,j)
i = R[k]
lrdX_sum += lrdX[i]
end
lof[j] = lrdX_sum/(length(nzrange(adj,j)) * lrdY[j])
end

lof
end





"""
local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF")
Compute the Local Outlier Factor (LOF) for each observation in `data`, and add as column to
`data.obs` with the name `col`.
When working with projected `DataMatrices`, use [`local_outlier_factor_projection!`](@ref)
instead.
NB: This function might be very slow for high values of `k`.
First, the `k` nearest neighbors are found for each observation in `data`.
Then, the Local Outlier Factor is computed by considering the distance between the
neighbors, but this time in the `full` DataMatrix. Thus `full` must have the same
observations as are present in `data`.
A LOF value smaller than or close to one is means that the observation is an inlier, but a
LOF value much larger than one means that the observation is an inlier.
By specifiying `full=data`, this is coincides with the standard definition for Local Outlier
Factor.
However, it is perhaps more useful to find neighbors in a dimension reduced space (after
e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space
(typically after normalization).
This way, an observation is concidered an outlier if the reduction to a lower dimensional
space didn't properly represent the neighborhood of the observation.
!!! note
The interface is not yet fully decided and subject to change.
# Examples
Compute the Local Outlier Factor, with nearest neighbors based only on `reduced`, but later
using distances in `full` for the actual LOF computation.
```julia
julia> reduced = svd(normalized; nsv=10)
julia> local_outlier_factor!(reduced, normalized; k=10)
```
See also: [`local_outlier_factor_projection!`](@ref)
"""
function local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF")
adj = knn_adjacency_matrix(data; k, make_symmetric=false)
dists = adjacency_distances(adj, full)
kdists = vec(maximum(dists.matrix; dims=1))
lrd = local_reachability_density(dists.matrix, kdists);
data.obs[:,col] = _local_outlier_factor(dists.matrix, lrd)
data
end

"""
local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix, base::DataMatrix, base_full::DataMatrix; k=10, col="LOF_projection")
Compute the Local Outlier Factor (LOF) for each observation in `data`, and add as column to
`data.obs` with the name `col`.
Use `local_outlier_factor_projection!` if you are working with projected data, and
[`local_outlier_factor!`](@ref) otherwise.
Parameters:
* `data` - A `DataMatrix` for which we compute LOF for each observation. Expected to be a `DataMatrix` projected onto `base`, so that the `data` and `base` use the same coordinate system.
* `full` - A `DataMatrix` with the same observations as `data`, used to compute distances in the LOF computation. Expected to be a `DataMatrix` projected onto `base_full`, so that the `full` and `base_full` use the same coordinate system.
* `base` - The base `DataMatrix`.
* `base_full` - The base `DataMatrix`.
* `k` - The number of nearest neighbors to use. NB: This function might be very slow for high values of `k`.
First, for each observation in `data`, the `k` nearest neighbors in `base` are found.
Then, the distance to each neighbor is computed using `full` and `base_full`.
Thus `full` must have the same observations as are present in `data`, and `base_full` must
have the same observations as `base`.
A LOF value smaller than or close to one is means that the observation is an inlier, but a
LOF value much larger than one means that the observation is an inlier.
By specifiying `full=data` and `base_full=base`, this is coincides with the standard
definition for Local Outlier Factor.
However, it is perhaps more useful to find neighbors in a dimension reduced space (after
e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space
(typically after normalization).
This way, an observation is concidered an outlier if the reduction to a lower dimensional
space didn't properly represent the neighborhood of the observation.
!!! note
The interface is not yet fully decided and subject to change.
# Examples
Compute the Local Outlier Factor (LOF) for each observation in a data set `reduced`, which
has been projected onto `base_reduced`.
The nearest neighbors are computed between observations in `reduced` and `base_reduced`, but
the distances in the actual LOF computation are between the same observations in `normalized` and `base_normalized`.
```julia
julia> base_reduced = svd(base_normalized; nsv=10)
julia> normalized = project(counts, base_normalized);
julia> reduced = project(normalized, base_reduced);
julia> local_outlier_factor!(reduced, normalized, base_reduced, base_normalized; k=10)
```
See also: [`local_outlier_factor!`](@ref)
"""
function local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix,
base::DataMatrix, base_full::DataMatrix;
k=10,
col="LOF_projection")
adj_X = knn_adjacency_matrix(base; k, make_symmetric=false)
adj_XY = knn_adjacency_matrix(base, data; k)

dists_X = adjacency_distances(adj_X, base_full)
dists_XY = adjacency_distances(adj_XY, base_full, full)

kdists = vec(maximum(dists_X.matrix; dims=1))

lrd_X = local_reachability_density(dists_X.matrix, kdists);
lrd_XY = local_reachability_density(dists_XY.matrix, kdists);

data.obs[:,col] = _local_outlier_factor(dists_XY.matrix, lrd_X, lrd_XY)
data
end
Loading

2 comments on commit 5e23d92

@rasmushenningsson
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:

Added

  • local_outlier_factor! - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Supports finding neighbors in a low dimensional space (e.g. after PCA or UMAP), but computing distances in a high dimensional space (e.g. after normalization).
  • local_outlier_factor_projection! - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Only points in the base data set are considered as neighbors.

Fixed

  • knn_adjacency_matrix - kwarg make_symmetric must now be specified by the caller.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/97193

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.6 -m "<description of version>" 5e23d9269ad1ccd65de418e9fe8dd9b24207dc2d
git push origin v0.3.6

Please sign in to comment.