Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize multi-tree comparisons #59

Draft
wants to merge 83 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
83 commits
Select commit Hold shift + click to select a range
e5c21ea
pre-increment
Jul 19, 2021
c56b4ab
Header guards
Jul 19, 2021
daeb1d7
alpha
Jul 19, 2021
530f43a
start work on cpp_all_pairs_mci()
Jul 19, 2021
1a73ebd
Constructing tournament tree
Jul 19, 2021
2bbd1b5
Play first games
Jul 19, 2021
cc459a9
Test for equality at end
Jul 20, 2021
79ddbf6
File reorganization
Jul 20, 2021
99f7ec8
.h → .hpp
Jul 20, 2021
210db89
Initialize SplitRoster()
Jul 20, 2021
4f77795
int32s
Jul 20, 2021
fc86ba2
Log splits
Jul 20, 2021
2e02e24
Tournament implementation
Jul 20, 2021
b1d0276
Save winners & losers
Jul 20, 2021
06bae67
Simplify
Jul 20, 2021
f1228cd
.hpp
Jul 20, 2021
e9061b8
Declarations
Jul 20, 2021
2c1d60d
vector of arrays
Jul 20, 2021
36a0ccf
in_split is int16
Jul 20, 2021
33ded8f
types
Jul 20, 2021
8724fff
index format
Jul 20, 2021
7659349
hpp
Jul 20, 2021
97d4c40
std::vector<SplitList> splits
Jul 20, 2021
7de40cd
hpp
Jul 20, 2021
18cd5d3
Correct assertions
Jul 20, 2021
fa24471
emplace_back takes params only
Jul 20, 2021
0eff372
mvoe up
Jul 20, 2021
193f19e
begin cpp_all_pairs_mci
Jul 20, 2021
35248fd
Framework for self-comparison
Jul 21, 2021
caa1761
Draft of score_pairs()
Jul 21, 2021
7bda05c
Draft ready to debug?
Jul 21, 2021
0b23a33
Update tree_distances.cpp
Jul 21, 2021
41019ec
Merge branch 'master' into optim
Jul 21, 2021
6cf0c20
return
Jul 21, 2021
0071672
parent/child offset
Jul 21, 2021
cb030ae
Debug
Jul 21, 2021
6badf49
new_split
Jul 21, 2021
7b04587
SplitList(RawMatrix x, int16 n_tip);
Jul 21, 2021
3a375b2
//
Jul 21, 2021
31942d4
#define
Jul 21, 2021
4303722
leaves
Jul 21, 2021
8db011d
, n_tips
Jul 21, 2021
4284859
na/nA switch
Jul 21, 2021
6e0979d
li
Jul 21, 2021
94d2e46
sp_i/j
Jul 21, 2021
9b578b6
max-/
Jul 21, 2021
6a77b28
max lookip
Jul 21, 2021
ab9287a
n_trees - 1
Jul 21, 2021
5f79c9a
Swap in_split
Jul 21, 2021
f435292
nA, na
Jul 21, 2021
d6106eb
n_tips
Jul 21, 2021
9f4e429
Working for 9 leaves
Jul 21, 2021
38af2d7
//
Jul 21, 2021
4a324f9
* Use MACROS for SplitList(x)
Jul 22, 2021
dd41e88
i = winner + tournament_games
Jul 22, 2021
b4bbe22
Update SplitRoster.cpp
Jul 22, 2021
20b4e21
pow → <<
Jul 22, 2021
3a89e51
Full bin
Jul 22, 2021
8ba32e1
Update SplitList.cpp
Jul 22, 2021
91536f9
index for 0!
Jul 22, 2021
d3482ce
Debugging comments
Jul 22, 2021
0ff8fc8
INBIN not INSUBBIN
Jul 22, 2021
49125ae
assert(fuzzy_match < lap_dim);
Jul 22, 2021
8f7e92c
Merge branch 'master' into optim
Jul 22, 2021
dfdf4eb
Merge branch 'master' into optim
Jul 22, 2021
4d1e955
Update SplitList.cpp
Jul 22, 2021
af3a9c9
Improve comment
Jul 22, 2021
5d5c04f
ws
Jul 22, 2021
5adb89a
ai:→ roster_len
Jul 22, 2021
ec0d493
Drop TR1 reference
Jul 22, 2021
f38abdc
.hpp→.h
Jul 22, 2021
9649ec2
.hpp→.h
Jul 22, 2021
a44e205
.hpp→.h
Jul 22, 2021
9b5f61f
.hpp→.h
Jul 22, 2021
a6edeb4
Don't calculate self-similarity
Jul 22, 2021
0ba89d7
spell 'dist'
Jul 22, 2021
c1ee964
// entry++
Jul 22, 2021
2437e8d
Populate spare j-splits
Jul 22, 2021
aed65a4
spacing
Jul 22, 2021
201dc8e
don't invert
Jul 22, 2021
c23662d
consts → #defines
Jul 22, 2021
823bf06
use memory once
Jul 22, 2021
67c90ad
ws
Jul 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ cpp_msi_distance <- function(x, y, nTip) {
.Call(`_TreeDist_cpp_msi_distance`, x, y, nTip)
}

cpp_all_pairs_mci <- function(x, nTip) {
.Call(`_TreeDist_cpp_all_pairs_mci`, x, nTip)
}

cpp_mutual_clustering <- function(x, y, nTip) {
.Call(`_TreeDist_cpp_mutual_clustering`, x, y, nTip)
}
Expand Down
11 changes: 9 additions & 2 deletions R/tree_distance_info.R
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,15 @@ ExpectedVariation <- function (tree1, tree2, samples = 1e+4) {
#' @export
MutualClusteringInfo <- function (tree1, tree2 = NULL, normalize = FALSE,
reportMatching = FALSE, diag = TRUE) {
unnormalized <- CalculateTreeDistance(MutualClusteringInfoSplits, tree1, tree2,
reportMatching)
unnormalized <- if (is.null(tree2) && is.list(tree1) &&
!inherits(tree1, 'phylo')) {
structure(cpp_all_pairs_mci(as.Splits(tree1, tree1[[1]]), NTip(tree1[[1]])),
class = "dist",
Size = length(tree1), diag = FALSE, upper = FALSE)
} else {
CalculateTreeDistance(MutualClusteringInfoSplits, tree1, tree2,
reportMatching)
}
if (diag && is.null(tree2)) {
unnormalized <- as.matrix(unnormalized)
diag(unnormalized) <- ClusteringEntropy(tree1)
Expand Down
13 changes: 13 additions & 0 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,18 @@ BEGIN_RCPP
return rcpp_result_gen;
END_RCPP
}
// cpp_all_pairs_mci
NumericVector cpp_all_pairs_mci(const List x, const IntegerVector nTip);
RcppExport SEXP _TreeDist_cpp_all_pairs_mci(SEXP xSEXP, SEXP nTipSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< const List >::type x(xSEXP);
Rcpp::traits::input_parameter< const IntegerVector >::type nTip(nTipSEXP);
rcpp_result_gen = Rcpp::wrap(cpp_all_pairs_mci(x, nTip));
return rcpp_result_gen;
END_RCPP
}
// cpp_mutual_clustering
List cpp_mutual_clustering(const RawMatrix x, const RawMatrix y, const IntegerVector nTip);
RcppExport SEXP _TreeDist_cpp_mutual_clustering(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) {
Expand Down Expand Up @@ -219,6 +231,7 @@ static const R_CallMethodDef CallEntries[] = {
{"_TreeDist_cpp_matching_split_distance", (DL_FUNC) &_TreeDist_cpp_matching_split_distance, 3},
{"_TreeDist_cpp_jaccard_similarity", (DL_FUNC) &_TreeDist_cpp_jaccard_similarity, 5},
{"_TreeDist_cpp_msi_distance", (DL_FUNC) &_TreeDist_cpp_msi_distance, 3},
{"_TreeDist_cpp_all_pairs_mci", (DL_FUNC) &_TreeDist_cpp_all_pairs_mci, 2},
{"_TreeDist_cpp_mutual_clustering", (DL_FUNC) &_TreeDist_cpp_mutual_clustering, 3},
{"_TreeDist_cpp_shared_phylo", (DL_FUNC) &_TreeDist_cpp_shared_phylo, 3},
{NULL, NULL, 0}
Expand Down
130 changes: 129 additions & 1 deletion src/SplitList.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <Rcpp.h>
using namespace Rcpp;
#include "tree_distances.h" // Before SplitList.h, for int16
#include <stdint.h>
#include "tree_distances.h" // Before SplitList.h, for int16
#include "SplitList.h"

#define INLASTBIN(n, size) ((size) - ((size) - ((n) % (size))) % (size))
Expand Down Expand Up @@ -52,3 +52,131 @@ SplitList::SplitList(RawMatrix x) {
}
}
}

SplitList::SplitList(RawMatrix x, int16 n_tips) {
n_splits = x.rows();
const int16
n_input_bins = x.cols(),
input_bins_per_bin = BIN_SIZE / R_BIN_SIZE
;

n_bins = (n_input_bins + R_BIN_SIZE - 1) / input_bins_per_bin;

if (n_bins < 1) throw std::invalid_argument("No leaves present.");
if (n_splits < 0) throw std::invalid_argument("Negative number of splits!?");
if (n_bins > MAX_BINS) {
throw std::length_error("This many leaves cannot be supported. " // # nocov
"Please contact the TreeDist maintainer if you " // # nocov
"need to use more!"); // # nocov
}
// Rcout << "\n=== SplitList::SplitList(x, n_tips) ===\n";

for (int16 split = 0; split != n_splits; split++) {
int16 last_bin = n_bins - 1;
const int16 raggedy_bins = INLASTBIN(n_input_bins, R_BIN_SIZE);

// Rcout << n_input_bins << " bins in; " << raggedy_bins << " raggedy bins\n";
assert(raggedy_bins <= BIN_SIZE / R_BIN_SIZE);
assert(raggedy_bins > 0);
state[split][last_bin] = INSUBBIN(last_bin, 0);
// Rcout << " State[" << split << "][" << last_bin << "] = " << state[split][last_bin] << ".\n";
for (int16 input_bin = 1; input_bin != raggedy_bins; input_bin++) {
// Rcout << " state [" << split << "][" << last_bin
// << "] increased from " << state[split][last_bin];
state[split][last_bin] += INBIN(input_bin, last_bin);
// Rcout << " by " << INBIN(input_bin, last_bin) << "\n";
}

const bool invert = state[split][last_bin] & splitbit(1);
if (invert) {
const int16 last_bin_tips = INLASTBIN(n_tips, BIN_SIZE);
// Rcout << "Last bin tips: " << last_bin_tips << ".\n";
const splitbit last_mask = last_bin_tips == sizeof(splitbit) * 8 ?
splitbit(0) - 1 :
(splitbit(1) << INLASTBIN(n_tips, BIN_SIZE)) - 1;
// Rcout << "Last mask: " << last_mask << ".\n";

state[split][last_bin] = ~state[split][last_bin] & last_mask;
}

in_split[split] = count_bits(state[split][last_bin]);

for (int16 bin = 0; bin != n_bins - 1; bin++) {

// Rcout << "Split " << split << ", bin << " << bin << ".\n";
state[split][bin] = INSUBBIN(bin, 0);

for (int16 input_bin = 1; input_bin != input_bins_per_bin; input_bin++) {
// Rcout << " Adding " << INBIN(input_bin, bin) << " = "
// << (splitbit (x(split, (bin * input_bins_per_bin) + input_bin))) << " << "
// << (R_BIN_SIZE * input_bin) << " to state [" << split << "]["
// << bin << "], was " << state[split][bin] << "\n";

state[split][bin] += INBIN(input_bin, bin);
}

if (invert) {
state[split][bin] = ~state[split][bin];
}

in_split[split] += count_bits(state[split][bin]);
}
}
}

void SplitList::swap(int16 a, int16 b) {
for (int16 bin = n_bins; bin--; ) {
const splitbit tmp = state[a][bin];
state[a][bin] = state[b][bin];
state[b][bin] = tmp;
}
const int16 tmp = in_split[a];
in_split[a] = in_split[b];
in_split[b] = tmp;
}

bool SplitList::less_than(int16 a, int16 b) {
for (int16 bin = 0; bin != n_bins; ++bin) {
if (state[a][bin] < state[b][bin]) {
return true;
}
}
return false;
}

bool SplitList::greater_than(int16 a, int16 b) {
for (int16 bin = 0; bin != n_bins; ++bin) {
if (state[a][bin] > state[b][bin]) {
return true;
}
}
return false;
}

int16 SplitList::partition(int16 lo, int16 hi) {
const int16 pivot = (hi + lo) / 2;
for (int16 i = lo - 1, j = hi + 1; ; ) {
do {
++i;
} while (less_than(i, pivot));
do {
--j;
} while (greater_than(j, pivot));
if (i >= j) {
return j;
}
swap(i, j);
}
}

void SplitList::quicksort(int16 lo, int16 hi) {
if (lo < hi) {
const int16 p = partition(lo, hi);
quicksort(lo, p);
quicksort(p + 1, hi);
}
}

void SplitList::quicksort() {
quicksort(0, n_splits - 1);
}
12 changes: 11 additions & 1 deletion src/SplitList.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,26 @@

#include <stdint.h>
#include <Rcpp.h>
#include "ints.h"
#include "tree_distances.h"

using namespace Rcpp;

const int16 R_BIN_SIZE = 8;

class SplitList {
int16 partition(int16 lo, int16 hi);
void swap(int16 a, int16 b);
bool less_than(int16 a, int16 b);
bool greater_than(int16 a, int16 b);
public:
int16 n_splits, n_bins;
int16 in_split[MAX_SPLITS];
splitbit state[MAX_SPLITS][MAX_BINS];
SplitList(RawMatrix);
SplitList(RawMatrix x);
SplitList(RawMatrix x, int16 n_tips);
void quicksort();
void quicksort(int16 lo, int16 hi);
};

#endif
Loading