Skip to content

Commit

Permalink
Clustur patch fixes windows and names (#55)
Browse files Browse the repository at this point in the history
* Fix for clustering breaking on Windows with sparse matrices

* Fix for bin_column_names_to not affecting the `shared_dataframe` and fix to the vignette.

* Updating the DESCRIPTION file for the next release
  • Loading branch information
GregJohnsonJr authored Nov 26, 2024
1 parent cb46070 commit 3b88578
Show file tree
Hide file tree
Showing 9 changed files with 19 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: clustur
Type: Package
Title: Clustering
Version: 0.1.1
Version: 0.1.2
Date: 2024-11-25
Authors@R: c(
person("Gregory", "Johnson", , "[email protected]", role = c("aut"),
Expand Down
2 changes: 1 addition & 1 deletion src/MothurDependencies/SharedFile.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class SharedFile {
explicit SharedFile(const std::vector<SharedAbundance>& otherTidySharedList)
: tidySharedList(otherTidySharedList) {
}
Rcpp::DataFrame PrintData() const;
Rcpp::DataFrame PrintData(const std::string &binName) const;
private:
std::vector<SharedAbundance> tidySharedList;
};
Expand Down
2 changes: 1 addition & 1 deletion src/MothurDependencies/SharedFileBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
class SharedFileBuilder {
public:
SharedFile *BuildSharedFile(const ListVector &listVector,
const CountTableAdapter& countTable);
const CountTableAdapter& countTable, const std::string &binName);
SharedFileBuilder() = default;
private:
struct SampleInformation {
Expand Down
4 changes: 2 additions & 2 deletions src/SharedFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include "MothurDependencies/ClusterExport.h"


Rcpp::DataFrame SharedFile::PrintData() const {
Rcpp::DataFrame SharedFile::PrintData(const std::string &binName) const {
const size_t size = tidySharedList.size();
std::vector<std::string> groups(size);
std::vector<std::string> otus(size);
Expand All @@ -18,6 +18,6 @@ Rcpp::DataFrame SharedFile::PrintData() const {
abundanceList[count++] = abundances.groupAbundance;
}
return Rcpp::DataFrame::create(Rcpp::Named("samples") = groups,
Rcpp::Named("otu") = otus,
Rcpp::Named(binName) = otus,
Rcpp::Named("abundance") = abundanceList);
}
4 changes: 2 additions & 2 deletions src/SharedFileBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// TODO Comment this code
// TODO We may need to build a traditional file builder...So we can output a dataframe of how the clusters are (list)
SharedFile* SharedFileBuilder::BuildSharedFile(const ListVector &listVector,
const CountTableAdapter &countTable) {
const CountTableAdapter &countTable, const std::string &binName) {
Utils utils;
std::string largestCutoffLabel = listVector.getLabel();
std::vector<SharedAbundance> abundancesList;
Expand All @@ -20,7 +20,7 @@ SharedFile* SharedFileBuilder::BuildSharedFile(const ListVector &listVector,
if(samples.empty())
continue;
std::vector<std::string> splitSamples;
std::string otuName = "otu" + std::to_string(count++);
std::string otuName = binName + std::to_string(count++);
utils.splitAtComma(samples, splitSamples);
std::unordered_map<std::string, double> totalAbundanceInEachGroup;
for(const auto& sample : splitSamples) {
Expand Down
2 changes: 1 addition & 1 deletion src/SharedFileBuilderTestFixture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
bool SharedFileBuilderTestFixture::TestBuildSharedFile(const ListVector& listVector,
const CountTableAdapter &countTable, const bool expectedResult) {
Setup();
const SharedFile* file = builder->BuildSharedFile(listVector, countTable);
const SharedFile* file = builder->BuildSharedFile(listVector, countTable, "otu");
TearDown();
return expectedResult == (file != nullptr);

Expand Down
2 changes: 1 addition & 1 deletion src/SharedFileTestFixture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ bool SharedFileTestFixture::TestSharedFilePrintData(const std::vector<SharedAbun
const Rcpp::DataFrame &expectedResult) {
Setup();
sharedFile = new SharedFile(data);
Rcpp::DataFrame df = sharedFile->PrintData();
Rcpp::DataFrame df = sharedFile->PrintData("otu");
const std::vector<std::string> columnNames = df.names();
const std::vector<std::string> expectedNames = expectedResult.names();
TearDown();
Expand Down
17 changes: 10 additions & 7 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "Adapters/OptimatrixAdapter.h"
#include "Adapters/MatrixAdapter.h"
#include "MothurDependencies/ClusterCommand.h"
#include "MothurDependencies/ListVector.h"
#include "MothurDependencies/OptiMatrix.h"
#include "Adapters/CountTableAdapter.h"
#include "MothurDependencies/ColumnDistanceMatrixReader.h"
Expand All @@ -13,13 +14,14 @@
#include <cctype>


Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const ClusterExport* result) {
Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const ClusterExport* result,
const std::string& binName) {
SharedFileBuilder builder;
std::unordered_map<std::string, RAbundVector> map;
std::unordered_map<std::string, ListVector> listMap;
const ListVectorPair listVectors = result->GetListVector();
const SharedFile* sharedFile = builder.BuildSharedFile(*listVectors.listVector, countTable);
Rcpp::DataFrame tidySharedDataFrame = sharedFile->PrintData();
const SharedFile* sharedFile = builder.BuildSharedFile(*listVectors.listVector, countTable, binName);
Rcpp::DataFrame tidySharedDataFrame = sharedFile->PrintData(binName);
delete(sharedFile);
return tidySharedDataFrame;
}
Expand Down Expand Up @@ -69,8 +71,9 @@ SEXP ProcessSparseMatrix(const std::vector<int> &xPosition,
CountTableAdapter countTableAdapter;
countTableAdapter.CreateDataFrameMap(countTable);
MatrixAdapter adapter(xPosition, yPosition, data, cutoff, isSim, countTableAdapter);
auto* read = new DistanceFileReader(new SparseDistanceMatrix(adapter.CreateSparseMatrix()),
new ListVector(adapter.CreateListVector()), cutoff, isSim);
auto* sparseDistanceMatrix = new SparseDistanceMatrix(adapter.CreateSparseMatrix());
auto* listVec = new ListVector(adapter.CreateListVector());
auto* read = new DistanceFileReader(sparseDistanceMatrix,listVec,cutoff, isSim);
read->CreateCountTableAdapter(countTable);
return Rcpp::XPtr<DistanceFileReader>(read);
}
Expand Down Expand Up @@ -103,7 +106,7 @@ Rcpp::List Cluster(const SEXP& DistanceData,const std::string& method, const std
const auto label = result->GetListVector().label;
const Rcpp::DataFrame clusterDataFrame = result->GetListVector().listVector->CreateDataFrameFromList(
featureColumnName, binColumnName);
const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result);
const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result, binColumnName);
delete(result);
delete(listVector);
delete(sparseMatrix);
Expand All @@ -129,7 +132,7 @@ Rcpp::List OptiCluster(const SEXP& DistanceData, const std::string& featureColum
const auto label = result->GetListVector().label;
const Rcpp::DataFrame clusterDataFrame = result->GetListVector().listVector->CreateDataFrameFromList(
featureColumnName, binColumnName);
const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result);
const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result, binColumnName);
delete(result);
return Rcpp::List::create(Rcpp::Named("label") = std::stod(label),
Rcpp::Named("abundance") = tidySharedDataFrame,
Expand Down
1 change: 0 additions & 1 deletion vignettes/clustur.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ cluster_data <- cluster(column_distance, cutoff, method = "weighted")

## Output data from clustering

#### edit this paragraph further...
All methods produce a list object with an indicator of the cutoff that was used
(`label`), as well as cluster composition (`cluster`) and shared (`abundance`) data frames.
The `cluster` data frame shows which OTU (Operational Taxonomic Unit) each sequence was assigned to. The `abundance` data frame
Expand Down

0 comments on commit 3b88578

Please sign in to comment.