From 3b88578f7a63e9b84bb3c5fd783ad3ea54173f75 Mon Sep 17 00:00:00 2001 From: Gregory Johnson <31873199+GregJohnsonJr@users.noreply.github.com> Date: Tue, 26 Nov 2024 14:34:07 -0500 Subject: [PATCH] Clustur patch fixes windows and names (#55) * Fix for clustering breaking on windows with sparse matrices * Fix for bin_column_names_to not affecting the `shared_dataframe` and fix to the vignette. * Updating the description for the next future release --- DESCRIPTION | 2 +- src/MothurDependencies/SharedFile.h | 2 +- src/MothurDependencies/SharedFileBuilder.h | 2 +- src/SharedFile.cpp | 4 ++-- src/SharedFileBuilder.cpp | 4 ++-- src/SharedFileBuilderTestFixture.cpp | 2 +- src/SharedFileTestFixture.cpp | 2 +- src/main.cpp | 17 ++++++++++------- vignettes/clustur.Rmd | 1 - 9 files changed, 19 insertions(+), 17 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7f878f1..5cbb4b0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: clustur Type: Package Title: Clustering -Version: 0.1.1 +Version: 0.1.2 Date: 2024-11-25 Authors@R: c( person("Gregory", "Johnson", , "grejoh@umich.edu", role = c("aut"), diff --git a/src/MothurDependencies/SharedFile.h b/src/MothurDependencies/SharedFile.h index c420c19..9e15d15 100644 --- a/src/MothurDependencies/SharedFile.h +++ b/src/MothurDependencies/SharedFile.h @@ -16,7 +16,7 @@ class SharedFile { explicit SharedFile(const std::vector& otherTidySharedList) : tidySharedList(otherTidySharedList) { } - Rcpp::DataFrame PrintData() const; + Rcpp::DataFrame PrintData(const std::string &binName) const; private: std::vector tidySharedList; }; diff --git a/src/MothurDependencies/SharedFileBuilder.h b/src/MothurDependencies/SharedFileBuilder.h index 22880cd..64d157a 100644 --- a/src/MothurDependencies/SharedFileBuilder.h +++ b/src/MothurDependencies/SharedFileBuilder.h @@ -23,7 +23,7 @@ class SharedFileBuilder { public: SharedFile *BuildSharedFile(const ListVector &listVector, - const CountTableAdapter& countTable); + const CountTableAdapter& countTable, const std::string &binName); SharedFileBuilder() = default; private: struct SampleInformation { diff --git a/src/SharedFile.cpp b/src/SharedFile.cpp index eefe63a..c951b09 100644 --- a/src/SharedFile.cpp +++ b/src/SharedFile.cpp @@ -6,7 +6,7 @@ #include "MothurDependencies/ClusterExport.h" -Rcpp::DataFrame SharedFile::PrintData() const { +Rcpp::DataFrame SharedFile::PrintData(const std::string &binName) const { const size_t size = tidySharedList.size(); std::vector groups(size); std::vector otus(size); @@ -18,6 +18,6 @@ Rcpp::DataFrame SharedFile::PrintData() const { abundanceList[count++] = abundances.groupAbundance; } return Rcpp::DataFrame::create(Rcpp::Named("samples") = groups, - Rcpp::Named("otu") = otus, + Rcpp::Named(binName) = otus, Rcpp::Named("abundance") = abundanceList); } diff --git a/src/SharedFileBuilder.cpp b/src/SharedFileBuilder.cpp index cdc4c1b..8db2275 100644 --- a/src/SharedFileBuilder.cpp +++ b/src/SharedFileBuilder.cpp @@ -8,7 +8,7 @@ // TODO Comment this code // TODO We may need to build a traditional file builder...So we can output a dataframe of how the clusters are (list) SharedFile* SharedFileBuilder::BuildSharedFile(const ListVector &listVector, - const CountTableAdapter &countTable) { + const CountTableAdapter &countTable, const std::string &binName) { Utils utils; std::string largestCutoffLabel = listVector.getLabel(); std::vector abundancesList; @@ -20,7 +20,7 @@ SharedFile* SharedFileBuilder::BuildSharedFile(const ListVector &listVector, if(samples.empty()) continue; std::vector splitSamples; - std::string otuName = "otu" + std::to_string(count++); + std::string otuName = binName + std::to_string(count++); utils.splitAtComma(samples, splitSamples); std::unordered_map totalAbundanceInEachGroup; for(const auto& sample : splitSamples) { diff --git a/src/SharedFileBuilderTestFixture.cpp b/src/SharedFileBuilderTestFixture.cpp index 29f323f..4d4aef8 100644 --- a/src/SharedFileBuilderTestFixture.cpp +++ b/src/SharedFileBuilderTestFixture.cpp @@ -7,7 +7,7 @@ bool SharedFileBuilderTestFixture::TestBuildSharedFile(const ListVector& listVector, const CountTableAdapter &countTable, const bool expectedResult) { Setup(); - const SharedFile* file = builder->BuildSharedFile(listVector, countTable); + const SharedFile* file = builder->BuildSharedFile(listVector, countTable, "otu"); TearDown(); return expectedResult == (file != nullptr); diff --git a/src/SharedFileTestFixture.cpp b/src/SharedFileTestFixture.cpp index 2421276..2e7774b 100644 --- a/src/SharedFileTestFixture.cpp +++ b/src/SharedFileTestFixture.cpp @@ -8,7 +8,7 @@ bool SharedFileTestFixture::TestSharedFilePrintData(const std::vectorPrintData(); + Rcpp::DataFrame df = sharedFile->PrintData("otu"); const std::vector columnNames = df.names(); const std::vector expectedNames = expectedResult.names(); TearDown(); diff --git a/src/main.cpp b/src/main.cpp index 57a7dc7..7a835ce 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,7 @@ #include "Adapters/OptimatrixAdapter.h" #include "Adapters/MatrixAdapter.h" #include "MothurDependencies/ClusterCommand.h" +#include "MothurDependencies/ListVector.h" #include "MothurDependencies/OptiMatrix.h" #include "Adapters/CountTableAdapter.h" #include "MothurDependencies/ColumnDistanceMatrixReader.h" @@ -13,13 +14,14 @@ #include -Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const ClusterExport* result) { +Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const ClusterExport* result, + const std::string& binName) { SharedFileBuilder builder; std::unordered_map map; std::unordered_map listMap; const ListVectorPair listVectors = result->GetListVector(); - const SharedFile* sharedFile = builder.BuildSharedFile(*listVectors.listVector, countTable); - Rcpp::DataFrame tidySharedDataFrame = sharedFile->PrintData(); + const SharedFile* sharedFile = builder.BuildSharedFile(*listVectors.listVector, countTable, binName); + Rcpp::DataFrame tidySharedDataFrame = sharedFile->PrintData(binName); delete(sharedFile); return tidySharedDataFrame; } @@ -69,8 +71,9 @@ SEXP ProcessSparseMatrix(const std::vector &xPosition, CountTableAdapter countTableAdapter; countTableAdapter.CreateDataFrameMap(countTable); MatrixAdapter adapter(xPosition, yPosition, data, cutoff, isSim, countTableAdapter); - auto* read = new DistanceFileReader(new SparseDistanceMatrix(adapter.CreateSparseMatrix()), - new ListVector(adapter.CreateListVector()), cutoff, isSim); + auto* sparseDistanceMatrix = new SparseDistanceMatrix(adapter.CreateSparseMatrix()); + auto* listVec = new ListVector(adapter.CreateListVector()); + auto* read = new DistanceFileReader(sparseDistanceMatrix,listVec,cutoff, isSim); read->CreateCountTableAdapter(countTable); return Rcpp::XPtr(read); } @@ -103,7 +106,7 @@ Rcpp::List Cluster(const SEXP& DistanceData,const std::string& method, const std const auto label = result->GetListVector().label; const Rcpp::DataFrame clusterDataFrame = result->GetListVector().listVector->CreateDataFrameFromList( featureColumnName, binColumnName); - const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result); + const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result, binColumnName); delete(result); delete(listVector); delete(sparseMatrix); @@ -129,7 +132,7 @@ Rcpp::List OptiCluster(const SEXP& DistanceData, const std::string& featureColum const auto label = result->GetListVector().label; const Rcpp::DataFrame clusterDataFrame = result->GetListVector().listVector->CreateDataFrameFromList( featureColumnName, binColumnName); - const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result); + const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result, binColumnName); delete(result); return Rcpp::List::create(Rcpp::Named("label") = std::stod(label), Rcpp::Named("abundance") = tidySharedDataFrame, diff --git a/vignettes/clustur.Rmd b/vignettes/clustur.Rmd index a712dd1..ecbe339 100644 --- a/vignettes/clustur.Rmd +++ b/vignettes/clustur.Rmd @@ -113,7 +113,6 @@ cluster_data <- cluster(column_distance, cutoff, method = "weighted") ## Output data from clustering -#### edit this paragraph further... All methods produce a list object with an indicator of the cutoff that was used (`label`), as well as cluster composition (`cluster`) and shared (`abundance`) data frames. The `clusters` data frame shows which OTU (Operation Taxonomic Unit) each sequence was assigned to. The `abundance` data frame