Commit 5e23d92
Local Outlier Factor

Implementation of Local Outlier Factor (LOF). Nearest neighbors can be defined using e.g. dimension-reduced data, while the distances actually used for the LOF computations can be in the high-dimensional space, i.e. before dimension reduction. Also supports projected data sets, where only neighbors in the base data set are considered. Unit tests are missing.
1 parent 9de4ecd · commit 5e23d92
Showing 7 changed files with 313 additions and 35 deletions.
Project.toml
@@ -1,7 +1,7 @@
 name = "SingleCellProjections"
 uuid = "03d38035-ed2f-4a36-82eb-797f1727ab2e"
 authors = ["Rasmus Henningsson <[email protected]>"]
-version = "0.3.5"
+version = "0.3.6"

 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -0,0 +1,114 @@
function knn_adjacency_matrix(X; k, make_symmetric)
    N = size(X,2)
    k = min(k,N-1)
    tree = KDTree(X)
    indices,_ = knn(tree, X, k+1) # k+1 since each point is returned as its own neighbor

    I = zeros(Int, k*N)
    J = zeros(Int, k*N)
    destInd = 1
    for (j,ind) in enumerate(indices)
        skip = 0
        for a in 1:k
            ind[a]==j && (skip=1) # skip the point itself among the returned neighbors
            I[destInd] = ind[a+skip]
            J[destInd] = j
            destInd += 1
        end
    end
    adj = sparse(I,J,true,N,N)
    if make_symmetric
        adj = adj .| adj'
    end
    adj
end
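A minimal usage sketch of the helper above (not part of the commit). It assumes `KDTree`/`knn` come from NearestNeighbors.jl and `sparse` from SparseArrays, as used elsewhere in the package, and that `knn_adjacency_matrix` is in scope:

```julia
using NearestNeighbors, SparseArrays

X = randn(10, 100)  # 10 features, 100 observations (made-up data; columns are observations)
adj = knn_adjacency_matrix(X; k=5, make_symmetric=false)
size(adj)           # (100, 100) sparse Bool matrix
# adj[i,j] is true when observation i is among the 5 nearest neighbors of observation j.
```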

function knn_adjacency_matrix(X, Y; k)
    # TODO: Avoid some code duplication by merging this with other functions? (knn_adjacency_matrix and embed_points)
    N1 = size(X,2)
    N2 = size(Y,2)
    @assert size(X,1) == size(Y,1)
    k = min(N1,k) # cannot have more neighbors than points!

    tree = KDTree(X) # Choose BallTree if dimension is higher?
    indices,dists = knn(tree, Y, k) # TODO: use threads?

    # N1xN2 sparse adjacency matrix between two matrices
    I = reduce(vcat,indices)
    J = repeat(1:N2; inner=k)

    sparse(I,J,true,N1,N2)
end
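The two-matrix method can be sketched the same way, again purely illustrative:

```julia
using NearestNeighbors, SparseArrays

X = randn(10, 200)  # base data set: 200 observations (made-up data)
Y = randn(10, 50)   # query data set: 50 observations (made-up data)
adj = knn_adjacency_matrix(X, Y; k=5)
size(adj)           # (200, 50)
# Column j marks the 5 observations in X that are nearest to observation j in Y.
```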

function knn_adjacency_matrix(data::DataMatrix; kwargs...)
    adj = knn_adjacency_matrix(obs_coordinates(data.matrix); kwargs...)
    obs = copy(data.obs)
    DataMatrix(adj, obs, obs; var_id_cols=data.obs_id_cols, data.obs_id_cols)
end

function knn_adjacency_matrix(X::DataMatrix, Y::DataMatrix; kwargs...)
    adj = knn_adjacency_matrix(obs_coordinates(X.matrix), obs_coordinates(Y.matrix); kwargs...)
    DataMatrix(adj, copy(X.obs), copy(Y.obs); var_id_cols=X.obs_id_cols, Y.obs_id_cols)
end
""" | ||
adjacency_distances(adj, X, Y=X) | ||
For each structural non-zero in `adj`, compute the Euclidean distance between the point in | ||
the DataMatrix `Y` and the point in the DataMatrix `X`. | ||
Can be useful when `adj` is created using e.g. a lower dimensional representation and we | ||
want to know the distances in the original, high dimensional space. | ||
At the moment all points in `Y` are required to have the same number of neighbors in `X`, | ||
for computation reasons. | ||
""" | ||
function adjacency_distances(adj::DataMatrix, X::DataMatrix, Y::DataMatrix=X) | ||
table_cols_equal(adj.var, X.obs; cols=X.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.") | ||
table_cols_equal(adj.obs, Y.obs; cols=Y.obs_id_cols) || error("Adjacency matrix and DataMatrix have different obs.") | ||
D = _adjacency_distances(adj.matrix, X, Y) | ||
DataMatrix(D, copy(adj.var), copy(adj.obs); adj.var_id_cols, adj.obs_id_cols) | ||
end | ||

function _adjacency_distances(adj, X::DataMatrix, Y::DataMatrix=X)
    N1,N2 = size(adj)
    @assert size(X,2)==N1
    @assert size(Y,2)==N2

    nnbors = vec(sum(!iszero, adj; dims=1))
    @assert all(isequal(nnbors[1]), nnbors) "All points in Y must have the same number of neighbors in X"
    k = nnbors[1]
    @assert k>0

    # Squared norms of all points; together with the inner products below this gives
    # squared distances via ‖x-y‖² = ‖x‖² + ‖y‖² - 2⟨x,y⟩.
    DX2 = compute(DiagGram(X.matrix))
    DY2 = compute(DiagGram(Y.matrix))

    I,J,_ = findnz(adj)
    dists = zeros(length(I)) # output

    for i in 1:k
        Is = I[i:k:end]
        # Js = J[i:k:end] # guaranteed to be 1:N2

        # Xs = X[:,Is] # Doesn't work since DataMatrix doesn't allow duplicate IDs
        # Temporary workaround - TODO: fix proper interface?
        Xs = DataMatrix(_subsetmatrix(X.matrix,:,Is), X.var, DataFrame(id=1:length(Is)); var_id_cols=X.var_id_cols)

        # Ys = Y[:,Js] # guaranteed to be equal to Y

        DXYs = compute(Diag(matrixproduct(Xs.matrix',Y.matrix)))

        DX2s = DX2[Is]

        dists[i:k:end] .= sqrt.(max.(0.0, DX2s .+ DY2 .- 2DXYs))
    end

    sparse(I,J,dists,N1,N2)
end
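The distance computation above relies on the identity ‖x − y‖² = ‖x‖² + ‖y‖² − 2⟨x, y⟩, with the squared norms coming from `DiagGram` and the inner products from the `Diag(matrixproduct(...))` call. A quick stand-alone check of the identity, independent of the package types:

```julia
using LinearAlgebra

x = randn(5); y = randn(5)
d2_direct = sum(abs2, x - y)                          # ‖x - y‖²
d2_gram   = sum(abs2, x) + sum(abs2, y) - 2dot(x, y)  # ‖x‖² + ‖y‖² - 2⟨x,y⟩
d2_direct ≈ d2_gram                                   # true (up to rounding)
```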
@@ -0,0 +1,174 @@
function local_reachability_density(dists, kdists)
    # dists: sparse N1xN2 matrix with the distance from each of the N2 points to its
    # neighbors among the N1 points; kdists: the k-distance of each of the N1 points.
    N1,N2 = size(dists)
    @assert length(kdists) == N1
    lrd = zeros(N2)

    R = rowvals(dists)
    V = nonzeros(dists)
    for j=1:N2
        rd_sum = 0.0
        for k in nzrange(dists,j)
            i = R[k]
            d = V[k]
            rd = max(kdists[i], d) # reachability distance from point j to point i
            rd_sum += rd
        end
        lrd[j] = length(nzrange(dists,j))/rd_sum
    end

    lrd
end
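To make the conventions concrete, here is a tiny hand-built example of `local_reachability_density` (shapes follow the asserts above; all numbers are made up):

```julia
using SparseArrays

# 3 base points, 2 query points; column j holds the distances from query point j to its neighbors.
dists  = sparse([1, 2, 1, 3], [1, 1, 2, 2], [0.5, 1.0, 0.2, 0.8], 3, 2)
kdists = [0.6, 0.9, 0.7]  # k-distance of each base point

lrd = local_reachability_density(dists, kdists)
# lrd[1] == 2 / (max(0.6, 0.5) + max(0.9, 1.0)) == 2 / 1.6 == 1.25
```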

# adj can be either the distance matrix or the adjacency matrix - since only the sparsity pattern is considered
function _local_outlier_factor(adj::AbstractSparseMatrix, lrdX::AbstractVector, lrdY::AbstractVector=lrdX)
    N1,N2 = size(adj)
    @assert length(lrdX) == N1
    @assert length(lrdY) == N2

    lof = zeros(N2)

    R = rowvals(adj)
    for j=1:N2
        lrdX_sum = 0.0
        for k in nzrange(adj,j)
            i = R[k]
            lrdX_sum += lrdX[i]
        end
        lof[j] = lrdX_sum/(length(nzrange(adj,j)) * lrdY[j])
    end

    lof
end
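Continuing with toy numbers, `_local_outlier_factor` computes LOF(j) as the mean local reachability density of j's neighbors divided by the density of j itself, so values well above one indicate that j sits in a sparser region than its neighbors:

```julia
using SparseArrays

adj  = sparse([1, 2, 1, 3], [1, 1, 2, 2], true, 3, 2)  # only the sparsity pattern is used
lrdX = [1.25, 0.9, 1.1]  # lrd of the 3 base points (made-up)
lrdY = [1.0, 1.2]        # lrd of the 2 query points (made-up)

lof = _local_outlier_factor(adj, lrdX, lrdY)
# lof[1] == (1.25 + 0.9) / (2 * 1.0) == 1.075
```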

"""
    local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF")

Compute the Local Outlier Factor (LOF) for each observation in `data`, and add it as a
column with the name `col` to `data.obs`.

When working with projected `DataMatrices`, use [`local_outlier_factor_projection!`](@ref)
instead.

NB: This function might be very slow for high values of `k`.

First, the `k` nearest neighbors are found for each observation in `data`.
Then, the Local Outlier Factor is computed by considering the distances between the
neighbors, but this time in the `full` DataMatrix. Thus `full` must have the same
observations as are present in `data`.

A LOF value smaller than or close to one means that the observation is an inlier, but a
LOF value much larger than one means that the observation is an outlier.

By specifying `full=data`, this coincides with the standard definition of the Local Outlier
Factor.
However, it is perhaps more useful to find neighbors in a dimension reduced space (after
e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space
(typically after normalization).
This way, an observation is considered an outlier if the reduction to a lower dimensional
space didn't properly represent the neighborhood of the observation.

!!! note
    The interface is not yet fully decided and subject to change.

# Examples

Compute the Local Outlier Factor, with nearest neighbors based only on `reduced`, but later
using distances in `full` for the actual LOF computation.

```julia
julia> reduced = svd(normalized; nsv=10)

julia> local_outlier_factor!(reduced, normalized; k=10)
```

See also: [`local_outlier_factor_projection!`](@ref)
"""
function local_outlier_factor!(data::DataMatrix, full::DataMatrix; k=10, col="LOF")
    adj = knn_adjacency_matrix(data; k, make_symmetric=false)
    dists = adjacency_distances(adj, full)
    kdists = vec(maximum(dists.matrix; dims=1))
    lrd = local_reachability_density(dists.matrix, kdists)
    data.obs[:,col] = _local_outlier_factor(dists.matrix, lrd)
    data
end
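A possible follow-up once the `LOF` column has been added (assuming the docstring example above has been run; the 1.5 cutoff is an arbitrary choice for this sketch, not a package recommendation):

```julia
reduced.obs.is_outlier = reduced.obs.LOF .> 1.5  # Bool column flagging suspected outliers
count(reduced.obs.is_outlier)                    # number of flagged observations
```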

"""
    local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix, base::DataMatrix, base_full::DataMatrix; k=10, col="LOF_projection")

Compute the Local Outlier Factor (LOF) for each observation in `data`, and add it as a
column with the name `col` to `data.obs`.

Use `local_outlier_factor_projection!` if you are working with projected data, and
[`local_outlier_factor!`](@ref) otherwise.

Parameters:

* `data` - A `DataMatrix` for which we compute LOF for each observation. Expected to be a `DataMatrix` projected onto `base`, so that `data` and `base` use the same coordinate system.
* `full` - A `DataMatrix` with the same observations as `data`, used to compute distances in the LOF computation. Expected to be a `DataMatrix` projected onto `base_full`, so that `full` and `base_full` use the same coordinate system.
* `base` - The base `DataMatrix` in which neighbors are found.
* `base_full` - The base `DataMatrix` used to compute distances.
* `k` - The number of nearest neighbors to use. NB: This function might be very slow for high values of `k`.

First, for each observation in `data`, the `k` nearest neighbors in `base` are found.
Then, the distance to each neighbor is computed using `full` and `base_full`.
Thus `full` must have the same observations as are present in `data`, and `base_full` must
have the same observations as `base`.

A LOF value smaller than or close to one means that the observation is an inlier, but a
LOF value much larger than one means that the observation is an outlier.

By specifying `full=data` and `base_full=base`, this coincides with the standard
definition of the Local Outlier Factor.
However, it is perhaps more useful to find neighbors in a dimension reduced space (after
e.g. `svd` (PCA) or `umap`), but then compute distances in the high dimensional space
(typically after normalization).
This way, an observation is considered an outlier if the reduction to a lower dimensional
space didn't properly represent the neighborhood of the observation.

!!! note
    The interface is not yet fully decided and subject to change.

# Examples

Compute the Local Outlier Factor (LOF) for each observation in a data set `reduced`, which
has been projected onto `base_reduced`.
The nearest neighbors are computed between observations in `reduced` and `base_reduced`, but
the distances in the actual LOF computation are between the same observations in
`normalized` and `base_normalized`.

```julia
julia> base_reduced = svd(base_normalized; nsv=10)

julia> normalized = project(counts, base_normalized);

julia> reduced = project(normalized, base_reduced);

julia> local_outlier_factor_projection!(reduced, normalized, base_reduced, base_normalized; k=10)
```

See also: [`local_outlier_factor!`](@ref)
"""
function local_outlier_factor_projection!(data::DataMatrix, full::DataMatrix,
                                          base::DataMatrix, base_full::DataMatrix;
                                          k=10,
                                          col="LOF_projection")
    adj_X = knn_adjacency_matrix(base; k, make_symmetric=false)
    adj_XY = knn_adjacency_matrix(base, data; k)

    dists_X = adjacency_distances(adj_X, base_full)
    dists_XY = adjacency_distances(adj_XY, base_full, full)

    kdists = vec(maximum(dists_X.matrix; dims=1))

    lrd_X = local_reachability_density(dists_X.matrix, kdists)
    lrd_XY = local_reachability_density(dists_XY.matrix, kdists)

    data.obs[:,col] = _local_outlier_factor(dists_XY.matrix, lrd_X, lrd_XY)
    data
end
Comment on 5e23d92:
@JuliaRegistrator register
Release notes:

Added

- `local_outlier_factor!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Supports finding neighbors in a low dimensional space (e.g. after PCA or UMAP), but computing distances in a high dimensional space (e.g. after normalization).
- `local_outlier_factor_projection!` - Compute the Local Outlier Factor (LOF) for each observation in a DataMatrix. Only points in the `base` data set are considered as neighbors.

Fixed

- `knn_adjacency_matrix` - kwarg `make_symmetric` must now be specified by the caller.

Comment on 5e23d92:
Registration pull request created: JuliaRegistries/General/97193
Tagging
After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.
This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the GitHub interface, or via: