Clustur patch fixes windows and names (#55)

* Fix for clustering breaking on Windows with sparse matrices
* Fix for bin_column_names_to not affecting the `shared_dataframe`, and fix to the vignette.
* Updating the description for the next release
SchlossLab · Nov 26, 2024 · 3b88578 · 3b88578
1 parent cb46070
commit 3b88578
Show file tree

Hide file tree

Showing 9 changed files with 19 additions and 17 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: clustur
 Type: Package
 Title: Clustering
-Version: 0.1.1
+Version: 0.1.2
 Date: 2024-11-25
 Authors@R: c(
     person("Gregory", "Johnson", , "[email protected]", role = c("aut"),

diff --git a/src/MothurDependencies/SharedFile.h b/src/MothurDependencies/SharedFile.h
@@ -16,7 +16,7 @@ class SharedFile {
     explicit SharedFile(const std::vector<SharedAbundance>& otherTidySharedList)
         : tidySharedList(otherTidySharedList) {
     }
-    Rcpp::DataFrame PrintData() const;
+    Rcpp::DataFrame PrintData(const std::string &binName) const;
 private:
     std::vector<SharedAbundance> tidySharedList;
 };

diff --git a/src/MothurDependencies/SharedFileBuilder.h b/src/MothurDependencies/SharedFileBuilder.h
@@ -23,7 +23,7 @@
 class SharedFileBuilder {
 public:
     SharedFile *BuildSharedFile(const ListVector &listVector,
-        const CountTableAdapter& countTable);
+                                const CountTableAdapter& countTable, const std::string &binName);
     SharedFileBuilder() = default;
 private:
     struct SampleInformation {

diff --git a/src/SharedFile.cpp b/src/SharedFile.cpp
@@ -6,7 +6,7 @@
 #include "MothurDependencies/ClusterExport.h"
 
 
-Rcpp::DataFrame SharedFile::PrintData() const {
+Rcpp::DataFrame SharedFile::PrintData(const std::string &binName) const {
     const size_t size = tidySharedList.size();
     std::vector<std::string> groups(size);
     std::vector<std::string> otus(size);
@@ -18,6 +18,6 @@ Rcpp::DataFrame SharedFile::PrintData() const {
         abundanceList[count++] = abundances.groupAbundance;
     }
     return Rcpp::DataFrame::create(Rcpp::Named("samples") = groups,
-                                                Rcpp::Named("otu") = otus,
+                                                Rcpp::Named(binName) = otus,
                                                 Rcpp::Named("abundance") = abundanceList);
 }
diff --git a/src/SharedFileBuilder.cpp b/src/SharedFileBuilder.cpp
@@ -8,7 +8,7 @@
 // TODO Comment this code
 // TODO We may need to build a traditional file builder...So we can output a dataframe of how the clusters are (list)
 SharedFile* SharedFileBuilder::BuildSharedFile(const ListVector &listVector,
-    const CountTableAdapter &countTable) {
+                                               const CountTableAdapter &countTable, const std::string &binName) {
     Utils utils;
     std::string largestCutoffLabel = listVector.getLabel();
     std::vector<SharedAbundance> abundancesList;
@@ -20,7 +20,7 @@ SharedFile* SharedFileBuilder::BuildSharedFile(const ListVector &listVector,
         if(samples.empty())
             continue;
         std::vector<std::string> splitSamples;
-        std::string otuName = "otu" + std::to_string(count++);
+        std::string otuName = binName + std::to_string(count++);
         utils.splitAtComma(samples, splitSamples);
         std::unordered_map<std::string, double> totalAbundanceInEachGroup;
         for(const auto& sample : splitSamples) {

diff --git a/src/SharedFileBuilderTestFixture.cpp b/src/SharedFileBuilderTestFixture.cpp
@@ -7,7 +7,7 @@
 bool SharedFileBuilderTestFixture::TestBuildSharedFile(const ListVector& listVector,
                                                        const CountTableAdapter &countTable, const bool expectedResult) {
     Setup();
-    const SharedFile* file = builder->BuildSharedFile(listVector, countTable);
+    const SharedFile* file = builder->BuildSharedFile(listVector, countTable, "otu");
     TearDown();
     return expectedResult == (file != nullptr);
 

diff --git a/src/SharedFileTestFixture.cpp b/src/SharedFileTestFixture.cpp
@@ -8,7 +8,7 @@ bool SharedFileTestFixture::TestSharedFilePrintData(const std::vector<SharedAbun
                                                     const Rcpp::DataFrame &expectedResult) {
     Setup();
     sharedFile = new SharedFile(data);
-    Rcpp::DataFrame df = sharedFile->PrintData();
+    Rcpp::DataFrame df = sharedFile->PrintData("otu");
     const std::vector<std::string> columnNames = df.names();
     const std::vector<std::string> expectedNames = expectedResult.names();
     TearDown();

diff --git a/src/main.cpp b/src/main.cpp
@@ -4,6 +4,7 @@
 #include "Adapters/OptimatrixAdapter.h"
 #include "Adapters/MatrixAdapter.h"
 #include "MothurDependencies/ClusterCommand.h"
+#include "MothurDependencies/ListVector.h"
 #include "MothurDependencies/OptiMatrix.h"
 #include "Adapters/CountTableAdapter.h"
 #include "MothurDependencies/ColumnDistanceMatrixReader.h"
@@ -13,13 +14,14 @@
 #include <cctype>
 
 
-Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const ClusterExport* result) {
+Rcpp::DataFrame CreateSharedDataFrame(const CountTableAdapter& countTable, const ClusterExport* result,
+    const std::string& binName) {
     SharedFileBuilder builder;
     std::unordered_map<std::string, RAbundVector> map;
     std::unordered_map<std::string, ListVector> listMap;
     const ListVectorPair listVectors = result->GetListVector();
-    const SharedFile* sharedFile = builder.BuildSharedFile(*listVectors.listVector, countTable);
-    Rcpp::DataFrame tidySharedDataFrame = sharedFile->PrintData();
+    const SharedFile* sharedFile = builder.BuildSharedFile(*listVectors.listVector, countTable, binName);
+    Rcpp::DataFrame tidySharedDataFrame = sharedFile->PrintData(binName);
     delete(sharedFile);
     return tidySharedDataFrame;
 }
@@ -69,8 +71,9 @@ SEXP ProcessSparseMatrix(const std::vector<int> &xPosition,
     CountTableAdapter countTableAdapter;
     countTableAdapter.CreateDataFrameMap(countTable);
     MatrixAdapter adapter(xPosition, yPosition, data, cutoff, isSim, countTableAdapter);
-    auto* read = new DistanceFileReader(new SparseDistanceMatrix(adapter.CreateSparseMatrix()),
-        new ListVector(adapter.CreateListVector()), cutoff, isSim);
+    auto* sparseDistanceMatrix = new SparseDistanceMatrix(adapter.CreateSparseMatrix());
+    auto* listVec =  new ListVector(adapter.CreateListVector());
+    auto* read = new DistanceFileReader(sparseDistanceMatrix,listVec,cutoff, isSim);
     read->CreateCountTableAdapter(countTable);
     return Rcpp::XPtr<DistanceFileReader>(read);
 }
@@ -103,7 +106,7 @@ Rcpp::List Cluster(const SEXP& DistanceData,const std::string& method, const std
     const auto label = result->GetListVector().label;
     const Rcpp::DataFrame clusterDataFrame = result->GetListVector().listVector->CreateDataFrameFromList(
         featureColumnName, binColumnName);
-    const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result);
+    const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result, binColumnName);
     delete(result);
     delete(listVector);
     delete(sparseMatrix);
@@ -129,7 +132,7 @@ Rcpp::List OptiCluster(const SEXP& DistanceData, const std::string& featureColum
     const auto label = result->GetListVector().label;
     const Rcpp::DataFrame clusterDataFrame = result->GetListVector().listVector->CreateDataFrameFromList(
         featureColumnName, binColumnName);
-    const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result);
+    const Rcpp::DataFrame tidySharedDataFrame = CreateSharedDataFrame(countTableAdapter, result, binColumnName);
     delete(result);
     return Rcpp::List::create(Rcpp::Named("label") = std::stod(label),
       Rcpp::Named("abundance") = tidySharedDataFrame,

diff --git a/vignettes/clustur.Rmd b/vignettes/clustur.Rmd
@@ -113,7 +113,6 @@ cluster_data <- cluster(column_distance, cutoff, method = "weighted")
 
 ## Output data from clustering
 
-#### edit this paragraph further...
 All methods produce a list object with an indicator of the cutoff that was used
 (`label`), as well as cluster composition (`cluster`) and shared (`abundance`) data frames.
 The `clusters` data frame shows which OTU (Operational Taxonomic Unit) each sequence was assigned to. The `abundance` data frame