From be712e612c77ae63aab5a9fb46a40e64fd054fae Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 23 Dec 2024 15:58:15 -0500
Subject: [PATCH 01/25] Reduce seqtxt memory usage

---
 Makefile                |  11 ++-
 include/seqtxt_module.h |  13 +--
 include/utils.h         |   2 +
 src/cli.py              |  14 +--
 src/plot_utils.py       |  73 +++++++-------
 src/seqtxt_module.cpp   | 212 ++++++++++++++++++++--------------------
 src/utils.cpp           |  17 +++-
 7 files changed, 176 insertions(+), 166 deletions(-)

diff --git a/Makefile b/Makefile
index 7b1e50e..855290e 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,11 @@ SRC_DIR := $(CURDIR)/src
 LIB_DIR := $(CURDIR)/lib
 
 # Set the library paths for the compiler
-LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib
-INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include
+#LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib
+#INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include
+CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
+LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
+INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
 
 # All targets
 all: swig_build compile
@@ -15,5 +18,7 @@ swig_build:
 
 # Compile the C++ shared libraries into lib/
 compile:
-	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \
+	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \
 	CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 setup.py build_ext --build-lib $(LIB_DIR)
+
+# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib
diff --git a/include/seqtxt_module.h b/include/seqtxt_module.h
index 1035598..d2f0ce5 100644
--- a/include/seqtxt_module.h
+++ b/include/seqtxt_module.h
@@ -37,7 +37,7 @@ class SeqTxt_Thread_data {
       Output_SeqTxt t_output_SeqTxt_;
       std::string current_line;  // Current line being read from the file
 
-      size_t read_ss_record(std::ifstream* file_stream, std::map<std::string, int> header_columns);
+      size_t read_ss_record(std::ifstream& file_stream, std::map<std::string, int> header_columns);
       std::map<std::string, int> getHeaderColumns();
 
       SeqTxt_Thread_data(Input_Para& ref_input_op, std::map<std::string, int> header_columns, int p_thread_id, int p_batch_size);
@@ -60,20 +60,13 @@ class SeqTxt_Module{
     static std::mutex myMutex_readSeqTxt;
     static std::mutex myMutex_output;
     static size_t batch_size_of_record;
-
     Input_Para _input_parameters;
-
-    std::ifstream *input_file_stream;  // Stream for the input text file
+    std::ifstream input_file_stream;  // Stream for the input text file
     std::vector<std::thread> m_threads;
-
-
     int has_error;
 
     // Methods
-    // Assign threads
-    static void SeqTxt_do_thread(std::ifstream* file_stream, Input_Para& ref_input_op, int thread_id, SeqTxt_Thread_data& ref_thread_data, Output_SeqTxt& ref_output);
-
-    // Generate statistics
+    static void SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref_input_op, int thread_id, Output_SeqTxt& ref_output, std::map<std::string, int> header_columns, size_t batch_size_of_record);
     int generateStatistics( Output_SeqTxt& t_output_SeqTxt_info);
 
     SeqTxt_Module(Input_Para& _m_input);
diff --git a/include/utils.h b/include/utils.h
index 9637f1e..1828dbb 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -12,4 +12,6 @@ void printMessage(std::string message);
 // Print an error message to stderr in a thread-safe manner
 void printError(std::string message);
 
+void printMemoryUsage(const std::string &functionName);
+
 #endif // UTILS_H
diff --git a/src/cli.py b/src/cli.py
index d4c8739..29549f2 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -154,7 +154,7 @@ def fq_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fq_output, param_dict, 'FASTQ')
             fq_html_gen = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "base_quality",
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality",
                   "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False)
             fq_html_gen.generate_html()
 
@@ -192,7 +192,7 @@ def fa_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fa_output, param_dict, 'FASTA')
             fa_html_gen = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "base_counts"], "FASTA QC",
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts"], "FASTA QC",
                  param_dict], plot_filepaths, static=True)
             fa_html_gen.generate_html()
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
@@ -245,7 +245,7 @@ def bam_module(margs):
             plot_filepaths = plot(bam_output, param_dict, 'BAM')
 
             # Set the list of QC information to display
-            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality"]
+            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"]
 
             # If base modifications were found, add the base modification plots
             # after the first table
@@ -310,7 +310,7 @@ def rrms_module(margs):
 
                 # Generate the HTML report
                 bam_html_gen = generate_html.ST_HTML_Generator(
-                    [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "base_counts", "basic_info",
+                    [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info",
                     "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False)
                 bam_html_gen.generate_html()
                 logging.info("Done. Output files are in %s", param_dict["output_folder"])
@@ -383,7 +383,7 @@ def fast5_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fast5_output, param_dict, 'FAST5')
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality",
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality",
                   "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html()
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
@@ -429,7 +429,7 @@ def fast5_signal_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fast5_output, param_dict, 'FAST5s')
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality",
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality",
                   "read_avg_base_quality", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html(signal_plots=True)
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
@@ -517,7 +517,7 @@ def pod5_module(margs):
             # plot_filepaths = plot(read_signal_dict, param_dict, 'POD5')
             webpage_title = "POD5 QC"
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality",
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality",
                   "read_avg_base_quality", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html(signal_plots=True)
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 669b602..ac03046 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -34,6 +34,8 @@ def getDefaultPlotFilenames():
         "basic_info": {'title': "Basic Statistics",
                        'description': "Basic Statistics", 'summary': ""},
         "read_length_hist": {'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""},
+        
+        "gc_content_hist": {'title': "GC Content Histogram", 'description': "GC Content Histogram", 'summary': ""},
 
         "base_quality": {'title': "Base Quality Histogram", 'description': "Base Quality Histogram"},
 
@@ -251,9 +253,6 @@ def read_lengths_histogram(data, font_size):
     hist, _ = np.histogram(read_lengths, bins=edges)
 
     # Create a figure with two subplots
-    # fig = make_subplots(
-    #     rows=2, cols=1,
-    #     subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.5)
     fig = make_subplots(
         rows=1, cols=2,
         subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.0)
@@ -276,13 +275,11 @@ def read_lengths_histogram(data, font_size):
     # Log histogram
     # Get the log10 histogram of read lengths
     read_lengths_log = np.log10(read_lengths, out=np.zeros_like(read_lengths), where=(read_lengths != 0))
-    # log_hist, log_edges = np.histogram(read_lengths_log, bins=bin_count)
     log_edges = np.linspace(0, np.max(read_lengths_log), num=log_bin_count + 1)
     log_hist, _ = np.histogram(read_lengths_log, bins=log_edges)
 
     xd = log_edges
     log_bindata = np.dstack((np.power(10, log_edges)[:-1], np.power(10, log_edges)[1:], log_hist))[0, :, :]
-    # log_bin_centers = np.round((log_bindata[:, 0] + log_bindata[:, 1]) / 2, 0)
     yd = log_hist
     fig.add_trace(go.Bar(x=xd, y=yd, customdata=log_bindata,
                          hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp<br>Counts:%{customdata[2]:.0f}<extra></extra>',
@@ -297,17 +294,7 @@ def read_lengths_histogram(data, font_size):
     fig.update_annotations(font=dict(color="white"))
 
     # Set tick value range for the log scale
-    # Use the bin edge centers as the tick values
-    # tick_vals = (log_edges[:-1] + log_edges[1:]) / 2
-    # tick_labels = ['{:,}'.format(int(10 ** x)) for x in tick_vals]
     tick_vals = log_edges
-    # tick_labels = ['{:,}'.format(int(10 ** x)) for x in tick_vals]
-
-    # Format the tick labels to be in kilobases (kb) if the value is greater than
-    # 1000, and in bases (b) otherwise
-    # tick_labels = ['{:,}kb'.format(int(x / 1000)) for x in tick_vals]
-    # tick_labels = ['{:,}kb'.format(int(x) for x in log_bin_centers) if x >
-    # 1000 else '{:,}b'.format(int(x)) for x in log_bin_centers]
     tick_labels = []
     for i in range(len(log_bindata)):
         # Format the tick labels to be in kilobases (kb) if the value is greater
@@ -322,21 +309,7 @@ def read_lengths_histogram(data, font_size):
         tick_labels.append('{}-{}'.format(left_val_str, right_val_str))
 
     fig.update_xaxes(ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=1, col=log_col, tickvals=tick_vals, ticktext=tick_labels, tickangle=45)
-    # fig.update_xaxes(range=[0, np.max(log_edges)], ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1)
-    # fig.update_xaxes(range=[0, np.max(log_edges)], ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1, tickvals=tick_vals)
-    # tick_vals = list(range(0, 5))
-    # fig.update_xaxes(
-    #     range=[0, np.max(log_edges)],
-    #     tickmode='array',
-    #     tickvals=tick_vals,
-    #     ticktext=['{:,}'.format(10 ** x) for x in tick_vals],
-    #     ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1)
-
-    # Set the tick value range for the linear scale
-    # tick_vals = (edges[:-1] + edges[1:]) / 2
-    # tick_labels = ['{:,}'.format(int(x)) for x in tick_vals]
     tick_vals = edges
-    # tick_labels = ['{:,}'.format(int(x)) for x in tick_vals]
     
     # Format the tick labels to be the range of the bin centers
     tick_labels = []
@@ -352,26 +325,39 @@ def read_lengths_histogram(data, font_size):
 
         tick_labels.append('{}-{}'.format(left_val_str, right_val_str))
         
-    # tick_labels = ['{:,}kb'.format(int(x / 1000)) for x in tick_vals]
-    # tick_labels = ['{:,}kb'.format(int(x)) if x > 1000 else
-    # '{:,}b'.format(int(x)) for x in linear_bin_centers]
     linear_col=1
     fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=linear_col, tickvals=tick_vals, ticktext=tick_labels, tickangle=45)
-    # fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=1, range=[0, np.max(edges)], tickvals=tick_vals)
     fig.update_yaxes(ticks="outside", title_text='Counts', title_standoff=0)
 
     # Update the layout
     fig.update_layout(showlegend=False, autosize=True, font=dict(size=PLOT_FONT_SIZE))
-    # Set font sizes
-    # fig.update_layout(showlegend=False, autosize=False)
-    # fig.update_layout(font=dict(size=font_size), autosize=True)
 
     fig.update_annotations(font_size=annotation_size)
-    # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
     html_obj = fig.to_html(full_html=False, default_height=500, default_width=1200)
                            
     return html_obj
 
+def read_gc_content_histogram(data, font_size):
+    """Plot the per-read GC content histogram."""
+
+    # NOTE(review): field name indicates read_gc_content_count is already a
+    # pre-binned per-percent histogram (index i = reads with i% GC, 0-100)
+    # -- confirm against the C++ output struct definition.
+    gc_content = np.array(data.read_gc_content_count)
+
+    # Plot the pre-binned counts directly: re-histogramming them would bin the
+    # count *values*, not GC percentages (and gave 100 y vs 101 x values).
+    gc_content_bins = np.arange(101)
+    fig = go.Figure()
+    fig.add_trace(go.Bar(x=gc_content_bins, y=gc_content, marker_color='#36a5c7'))
+
+    # Update the layout
+    fig.update_xaxes(ticks="outside", dtick=10, title_text='GC Content (%)', title_standoff=0)
+    fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0)
+    fig.update_layout(font=dict(size=PLOT_FONT_SIZE))  # Set font size
+
+    return fig.to_html(full_html=False, default_height=500, default_width=700)
+
 # Save the 'Base quality' plot image.
 def base_quality(data, font_size):
     xd = np.arange(MAX_BASE_QUALITY)
@@ -479,10 +465,17 @@ def plot(output_data, para_dict, file_type):
 
         plot_filepaths['read_length_bar']['dynamic'] = plot_read_length_stats(output_data, file_type)
 
+    # GC content histogram
+    # NOTE(review): removed unreachable `elif file_type == 'SeqTxt'` branch --
+    # SeqTxt is excluded by the outer condition (summary files carry no sequence).
+    if file_type != 'FAST5s' and file_type != 'SeqTxt':
+        if file_type == 'BAM':
+            plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.mapped_long_read_info, font_size)
+        else:
+            plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.long_read_info, font_size)
+
+    # Quality plots
     if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt':
-        # if file_type == 'SeqTxt':
-        #     seq_quality_info = output_data.all_long_read_info.seq_quality_info
-        # else:
         seq_quality_info = output_data.seq_quality_info
 
         # Base quality histogram
diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp
index 9cd7e7f..2eb868a 100644
--- a/src/seqtxt_module.cpp
+++ b/src/seqtxt_module.cpp
@@ -4,12 +4,16 @@ Class for calling FAST5 statistics modules.
 
 */
 
+#include "seqtxt_module.h"
+
+/// @cond
 #include <iostream>
 #include <sstream>
 #include <string>
 #include <algorithm>
+/// @endcond
 
-#include "seqtxt_module.h"
+#include "utils.h"
 
 
 size_t SeqTxt_Module::batch_size_of_record=3000;
@@ -36,10 +40,11 @@ std::map<std::string, int> SeqTxt_Thread_data::getHeaderColumns()
     return _header_columns;
 }
 
-size_t SeqTxt_Thread_data::read_ss_record(std::ifstream* file_stream, std::map<std::string, int> header_columns){
+size_t SeqTxt_Thread_data::read_ss_record(std::ifstream& file_stream, std::map<std::string, int> header_columns){
     //std::cout << "Type 1." << std::endl;
     thread_index = 0;  // Index where this thread's data will be stored
-    while( std::getline( *file_stream, current_line )) {
+    while( std::getline( file_stream, current_line ) )
+    {
         std::istringstream column_stream( current_line );
 
         // Read each column value from the record line
@@ -93,7 +98,7 @@ SeqTxt_Module::SeqTxt_Module(Input_Para& input_parameters){
     has_error = 0;
     file_index = 0;
 
-    input_file_stream = NULL;
+    // input_file_stream = NULL;
     if (file_index >= _input_parameters.num_input_files){
         std::cerr << "Input file list error." << std::endl;
         has_error |= 1;
@@ -102,8 +107,9 @@ SeqTxt_Module::SeqTxt_Module(Input_Para& input_parameters){
 
     // Open the first file in the list
     const char * first_filepath = _input_parameters.input_files[file_index].c_str();
-    input_file_stream = new std::ifstream(first_filepath);
-    if (!(input_file_stream->is_open())){
+    // input_file_stream = new std::ifstream(first_filepath);
+    input_file_stream.open(first_filepath);
+    if (!(input_file_stream.is_open())){
         std::cerr << "Cannot open sequencing_summary.txt file="<< first_filepath <<std::endl;
         has_error |= 2;
     }else{
@@ -112,7 +118,7 @@ SeqTxt_Module::SeqTxt_Module(Input_Para& input_parameters){
 
         // Ensure that we have the columns we need for statistics
         std::string column_line;
-        std::getline( *input_file_stream, column_line );
+        std::getline( input_file_stream, column_line );
         if (requiredHeadersFound(column_line))
         {
 //            // Print the column names
@@ -172,129 +178,127 @@ bool SeqTxt_Module::requiredHeadersFound(std::string header_string) {
 }
 
 SeqTxt_Module::~SeqTxt_Module(){
-   if (input_file_stream!=NULL){
-       delete input_file_stream;
-   }
-   input_file_stream = NULL;
+    if (input_file_stream.is_open()){
+        input_file_stream.close();
+    }
 }
 
 int SeqTxt_Module::generateStatistics( Output_SeqTxt& t_output_SeqTxt_info){  
    auto relapse_start_time = std::chrono::high_resolution_clock::now();
 
-   t_output_SeqTxt_info.all_long_read_info.long_read_info.resize();
-   t_output_SeqTxt_info.passed_long_read_info.long_read_info.resize();
-   t_output_SeqTxt_info.failed_long_read_info.long_read_info.resize();
-
-   if (has_error==0){
-       m_threads.reserve(_input_parameters.threads+3);
-
-      int _i_t=0;
-      SeqTxt_Thread_data** thread_data_vector = new SeqTxt_Thread_data*[_input_parameters.threads];
-      try{
-         for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){
-            //  std::cout<<"INFO: generate threads "<<_i_t<<std::endl<<std::flush;
-             thread_data_vector[_i_t] = new SeqTxt_Thread_data(_input_parameters, _header_columns, _i_t, SeqTxt_Module::batch_size_of_record);
-            //  std::cout<<"INFO: Thread = "<< _i_t+1  <<std::endl<<std::flush;
-             m_threads.push_back(std::thread((SeqTxt_Module::SeqTxt_do_thread), input_file_stream, std::ref(_input_parameters), _i_t, std::ref(*(thread_data_vector[_i_t])), std::ref(t_output_SeqTxt_info) ));
-         }
-
-        //  std::cout<<"INFO: join threads"<<std::endl<<std::flush;
-        std::cout << "Joining " << _input_parameters.threads << " threads..." << std::endl;
-         for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){
-            //  std::cout<<"INFO: join threads "<<_i_t<<std::endl<<std::flush;
-             m_threads[_i_t].join();
-         }
-         std::cout << "All threads joined." << std::endl;
-
-      }catch(const std::runtime_error& re){
-         std::cerr << "Runtime error: " << re.what() << std::endl;
-      }catch(const std::exception& ex){
-         std::cerr << "Error occurred: " << ex.what() << std::endl;
-      }catch(...){
-         std::cerr << "Unknown failure occurred. Possible memory corruption" << std::endl;
-      }
-     
-      for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){
-         delete thread_data_vector[_i_t];
-      }
-      delete [] thread_data_vector;
-   }
+    t_output_SeqTxt_info.all_long_read_info.long_read_info.resize();
+    t_output_SeqTxt_info.passed_long_read_info.long_read_info.resize();
+    t_output_SeqTxt_info.failed_long_read_info.long_read_info.resize();
+    printMemoryUsage("Before generating statistics");
 
-   t_output_SeqTxt_info.global_sum();
+    if (has_error==0) {
+        m_threads.reserve(_input_parameters.threads+3);
+
+        int _i_t=0;
+        printMessage("Generating statistics...");
+        try {
+            for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){
+                m_threads.push_back(std::thread((SeqTxt_Module::SeqTxt_do_thread), std::ref(input_file_stream), std::ref(_input_parameters), _i_t, std::ref(t_output_SeqTxt_info), _header_columns, SeqTxt_Module::batch_size_of_record ));
+            }
+            printMessage("Joining " + std::to_string(_input_parameters.threads) + " threads...");
+            for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){
+                m_threads[_i_t].join();
+            }
+
+        }catch(const std::runtime_error& re){
+            printError("Runtime error: " + std::string(re.what()));
+        }catch(const std::exception& ex){
+            std::cerr << "Error occurred: " << ex.what() << std::endl;
+            printError("Error: " + std::string(ex.what()));
+        }catch(...){
+            printError("Unknown error occurred in thread " + std::to_string(_i_t));
+        }
+    }
+    t_output_SeqTxt_info.global_sum();
+    printMemoryUsage("After generating statistics");
  
-   auto relapse_end_time = std::chrono::high_resolution_clock::now();
-   std::cout<<"Elapsed time (seconds): "<< std::chrono::duration_cast<std::chrono::seconds>(relapse_end_time - relapse_start_time).count() << std::endl;
+    auto relapse_end_time = std::chrono::high_resolution_clock::now();
+    std::cout<<"Elapsed time (seconds): "<< std::chrono::duration_cast<std::chrono::seconds>(relapse_end_time - relapse_start_time).count() << std::endl;
 
-   std::cout<<"sequencing_summary.txt QC "<< (has_error==0?"generated":"failed") << std::endl;
+    std::cout<<"sequencing_summary.txt QC "<< (has_error==0?"generated":"failed") << std::endl;
  
-   return has_error;
+    return has_error;
 }
 
-void SeqTxt_Module::SeqTxt_do_thread(std::ifstream* file_stream, Input_Para& ref_input_op, int thread_id, SeqTxt_Thread_data& ref_thread_data, Output_SeqTxt& ref_output ){
+void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref_input_op, int thread_id, Output_SeqTxt& ref_output, std::map<std::string, int> header_columns, size_t batch_size_of_record){
     size_t read_ss_size, read_ss_i;
-    while (true){
-        myMutex_readSeqTxt.lock();
-        std::map<std::string, int> header_column_data = ref_thread_data.getHeaderColumns();
-        read_ss_size = ref_thread_data.read_ss_record(file_stream, header_column_data);
-
-        if (read_ss_size == 0 && !(file_index < ref_input_op.num_input_files) ){
-            myMutex_readSeqTxt.unlock();
-            break;
-        }
-        if ( read_ss_size < batch_size_of_record ){
-            if ( file_index < ref_input_op.num_input_files ){ 
-               std::cout<< "INFO: Open sequencing_summary.txt file="<< ref_input_op.input_files[file_index] <<std::endl;
-               file_stream->close();
-               file_stream->clear();
-
-               file_stream->open( ref_input_op.input_files[file_index].c_str() );
-               std::string firstline;
-               std::getline( *file_stream, firstline );
-               file_index++;
+    int total_read_count = 0;
+    while (true) {
+        SeqTxt_Thread_data ref_thread_data(ref_input_op, header_columns, thread_id, batch_size_of_record);
+        {
+            std::lock_guard<std::mutex> lock(myMutex_readSeqTxt);
+            std::map<std::string, int> header_column_data = ref_thread_data.getHeaderColumns();
+            read_ss_size = ref_thread_data.read_ss_record(file_stream, header_column_data);
+
+            if (read_ss_size == 0 && !(file_index < ref_input_op.num_input_files) ){
+                break;
             }
+
+            if ( read_ss_size < batch_size_of_record ){
+                if ( file_index < ref_input_op.num_input_files ){
+                    // std::cout<< "INFO: Open sequencing_summary.txt file="<< ref_input_op.input_files[file_index] <<std::endl;
+                    file_stream.close();
+                    file_stream.clear();
+
+                    file_stream.open( ref_input_op.input_files[file_index].c_str() );
+                    std::string firstline;
+                    std::getline( file_stream, firstline );
+                    file_index++;
+                }
+            }
+        }
+        if (read_ss_size == 0 ) {
+            printMessage("No records read.");
+            continue;
+        } else {
+            total_read_count += read_ss_size;
+            printMessage("Thread " + std::to_string(thread_id) + " read " + std::to_string(read_ss_size) + " records (total " + std::to_string(total_read_count) + ")");
         }
-        myMutex_readSeqTxt.unlock();
-        if (read_ss_size == 0 ) { continue; }
 
         // Columns used for statistics: passes_filtering, sequence_length_template, mean_qscore_template
-        //ref_thread_data.t_output_SeqTxt_.reset();
         ref_thread_data.t_output_SeqTxt_.all_long_read_info.long_read_info.resize();
         ref_thread_data.t_output_SeqTxt_.passed_long_read_info.long_read_info.resize();
         ref_thread_data.t_output_SeqTxt_.failed_long_read_info.long_read_info.resize();
         for(read_ss_i=0; read_ss_i<read_ss_size; read_ss_i++){
-           Basic_SeqTxt_Statistics* seqtxt_statistics = NULL;
-           bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering;
-           if ( passes_filtering_value == true) {
-                seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.passed_long_read_info);
-           } else {
-                seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.failed_long_read_info);
-           }
-           seqtxt_statistics->long_read_info.total_num_reads++;
-           size_t sequence_base_count = ref_thread_data.stored_records[read_ss_i].sequence_length_template;
-           seqtxt_statistics->long_read_info.total_num_bases += sequence_base_count;
+            // Basic_SeqTxt_Statistics* seqtxt_statistics = NULL;
+            // bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering;
+            // if ( passes_filtering_value == true) {
+            //     seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.passed_long_read_info);
+            // } else {
+            //     seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.failed_long_read_info);
+            // }
+            bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering;
+            Basic_SeqTxt_Statistics& seqtxt_statistics = (passes_filtering_value == true) ? ref_thread_data.t_output_SeqTxt_.passed_long_read_info : ref_thread_data.t_output_SeqTxt_.failed_long_read_info;
+            
+            seqtxt_statistics.long_read_info.total_num_reads++;
+            size_t sequence_base_count = ref_thread_data.stored_records[read_ss_i].sequence_length_template;
+            seqtxt_statistics.long_read_info.total_num_bases += sequence_base_count;
 
             // Store the read length
-            seqtxt_statistics->long_read_info.read_lengths.push_back(sequence_base_count);
+            seqtxt_statistics.long_read_info.read_lengths.push_back(sequence_base_count);
 
             // Update the longest read length
             int64_t current_read_length = (int64_t) ref_thread_data.stored_records[read_ss_i].sequence_length_template;
-           if ( seqtxt_statistics->long_read_info.longest_read_length < current_read_length){
-               seqtxt_statistics->long_read_info.longest_read_length = current_read_length;
-           }
-           seqtxt_statistics->long_read_info.read_length_count[ ref_thread_data.stored_records[read_ss_i].sequence_length_template<MAX_READ_LENGTH?ref_thread_data.stored_records[read_ss_i].sequence_length_template:(MAX_READ_LENGTH-1) ] += 1;
-
-           seqtxt_statistics->seq_quality_info.read_quality_distribution[ int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ] += 1;
-           if ( seqtxt_statistics->seq_quality_info.min_read_quality == MoneDefault ||
-               seqtxt_statistics->seq_quality_info.min_read_quality>int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ){
-              seqtxt_statistics->seq_quality_info.min_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template );
-           }
-           if ( seqtxt_statistics->seq_quality_info.max_read_quality < int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template) ){
-              seqtxt_statistics->seq_quality_info.max_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template);
-           }
-        }
+            if ( seqtxt_statistics.long_read_info.longest_read_length < current_read_length){
+                seqtxt_statistics.long_read_info.longest_read_length = current_read_length;
+            }
+            seqtxt_statistics.long_read_info.read_length_count[ ref_thread_data.stored_records[read_ss_i].sequence_length_template<MAX_READ_LENGTH?ref_thread_data.stored_records[read_ss_i].sequence_length_template:(MAX_READ_LENGTH-1) ] += 1;
 
-        myMutex_output.lock();
+            seqtxt_statistics.seq_quality_info.read_quality_distribution[ int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ] += 1;
+            if ( seqtxt_statistics.seq_quality_info.min_read_quality == MoneDefault ||
+                seqtxt_statistics.seq_quality_info.min_read_quality>int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ){
+                seqtxt_statistics.seq_quality_info.min_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template );
+            }
+            if ( seqtxt_statistics.seq_quality_info.max_read_quality < int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template) ){
+                seqtxt_statistics.seq_quality_info.max_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template);
+            }
+        }
+        std::lock_guard<std::mutex> lock(myMutex_output);
         ref_output.add( ref_thread_data.t_output_SeqTxt_ );
-        myMutex_output.unlock();
     }
 }
diff --git a/src/utils.cpp b/src/utils.cpp
index 4d27130..c2b1dc2 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -1,10 +1,12 @@
 #include "utils.h"
 
 /// @cond
-#include <stdio.h>
-#include <string>
 #include <iostream>
+#include <iomanip>
+#include <string>
 #include <mutex>
+#include <stdio.h>
+#include <sys/resource.h>  // getrusage
 /// @endcond
 
 
@@ -24,3 +26,14 @@ void printError(std::string message)
     std::lock_guard<std::mutex> lock(print_mtx);
     std::cerr << message << std::endl;
 }
+
+void printMemoryUsage(const std::string& functionName) {
+    struct rusage usage;
+    getrusage(RUSAGE_SELF, &usage);
+
+    // Convert from KB to GB
+    double mem_usage_gb = (double)usage.ru_maxrss / 1024.0 / 1024.0;
+    std::lock_guard<std::mutex> lock(print_mtx);
+    std::cout << functionName << " memory usage: "
+              << std::fixed << std::setprecision(2) << mem_usage_gb << " GB" << std::endl;
+}

From 3a24515a4eaa05ea5894c3c5acd9985c52c798b1 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 23 Dec 2024 16:11:27 -0500
Subject: [PATCH 02/25] remove comments

---
 src/seqtxt_module.cpp | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp
index 2eb868a..62d9929 100644
--- a/src/seqtxt_module.cpp
+++ b/src/seqtxt_module.cpp
@@ -241,7 +241,6 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref
 
             if ( read_ss_size < batch_size_of_record ){
                 if ( file_index < ref_input_op.num_input_files ){
-                    // std::cout<< "INFO: Open sequencing_summary.txt file="<< ref_input_op.input_files[file_index] <<std::endl;
                     file_stream.close();
                     file_stream.clear();
 
@@ -253,11 +252,10 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref
             }
         }
         if (read_ss_size == 0 ) {
-            printMessage("No records read.");
             continue;
         } else {
             total_read_count += read_ss_size;
-            printMessage("Thread " + std::to_string(thread_id) + " read " + std::to_string(read_ss_size) + " records (total " + std::to_string(total_read_count) + ")");
+            printMessage("Thread " + std::to_string(thread_id+1) + " read " + std::to_string(read_ss_size) + " records (total " + std::to_string(total_read_count) + ")");
         }
 
         // Columns used for statistics: passes_filtering, sequence_length_template, mean_qscore_template
@@ -265,13 +263,6 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref
         ref_thread_data.t_output_SeqTxt_.passed_long_read_info.long_read_info.resize();
         ref_thread_data.t_output_SeqTxt_.failed_long_read_info.long_read_info.resize();
         for(read_ss_i=0; read_ss_i<read_ss_size; read_ss_i++){
-            // Basic_SeqTxt_Statistics* seqtxt_statistics = NULL;
-            // bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering;
-            // if ( passes_filtering_value == true) {
-            //     seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.passed_long_read_info);
-            // } else {
-            //     seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.failed_long_read_info);
-            // }
             bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering;
             Basic_SeqTxt_Statistics& seqtxt_statistics = (passes_filtering_value == true) ? ref_thread_data.t_output_SeqTxt_.passed_long_read_info : ref_thread_data.t_output_SeqTxt_.failed_long_read_info;
             

From bb4682ddb5f029f2e2f5dc0d5522c68036ed16bc Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 30 Dec 2024 16:09:53 -0500
Subject: [PATCH 03/25] Add per-read gc content distribution plot

---
 Makefile             |  6 ++---
 include/hts_reader.h |  4 ++--
 lib/__init__.py      |  0
 src/bam_module.cpp   |  3 +--
 src/fasta_module.cpp | 16 +++++++++-----
 src/fastq_module.cpp | 52 ++++++++++++++++++++++++++++++--------------
 src/hts_reader.cpp   | 31 +++++++++++++-------------
 src/output_data.cpp  | 44 +++++++++++++++++++++++++------------
 src/plot_utils.py    | 28 +++++++++++++++++++-----
 9 files changed, 120 insertions(+), 64 deletions(-)
 delete mode 100644 lib/__init__.py

diff --git a/Makefile b/Makefile
index 855290e..5b6392f 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,6 @@ SRC_DIR := $(CURDIR)/src
 LIB_DIR := $(CURDIR)/lib
 
 # Set the library paths for the compiler
-#LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib
-#INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include
 CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
 LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
 INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
@@ -21,4 +19,6 @@ compile:
 	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \
 	CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 setup.py build_ext --build-lib $(LIB_DIR)
 
-# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \
+# Clean the build directory
+clean:
+	$(RM) -r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/
diff --git a/include/hts_reader.h b/include/hts_reader.h
index 8790e88..79332c6 100644
--- a/include/hts_reader.h
+++ b/include/hts_reader.h
@@ -38,7 +38,7 @@ class HTSReader {
         bool reading_complete = false;
 
         // Update read and base counts
-        int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution);
+        int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution);
 
         // Read the next batch of records from the BAM file
         int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold);
@@ -49,7 +49,7 @@ class HTSReader {
         // Return the number of records in the BAM file using the BAM index
         int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold);
 
-        std::map<int, int> getQueryToRefMap(bam1_t *record);
+        std::map<int, int> getQueryToRefMap(bam1_t* record);
 
         // Add a modification to the base modification map
         void addModificationToQueryMap(std::map<int32_t, std::tuple<char, char, double, int>> &base_modifications, int32_t pos, char mod_type, char canonical_base, double likelihood, int strand);
diff --git a/lib/__init__.py b/lib/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/bam_module.cpp b/src/bam_module.cpp
index f3a0573..2d47cb4 100644
--- a/src/bam_module.cpp
+++ b/src/bam_module.cpp
@@ -243,9 +243,8 @@ void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unorder
     reader.readNextRecords(batch_size, record_output, bam_mutex, read_ids, base_mod_threshold);
 
     // Update the final output
-    output_mutex.lock();
+    std::lock_guard<std::mutex> lock(output_mutex);
     final_output.add(record_output);
-    output_mutex.unlock();
 }
 
 std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_file, bool accepted_reads)
diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp
index 666d369..93f3e90 100644
--- a/src/fasta_module.cpp
+++ b/src/fasta_module.cpp
@@ -6,6 +6,7 @@ FASTA_module.cpp:
 #include <stdlib.h>
 // #include <zlib.h>
 #include <ctype.h>
+#include <cmath>  // std::round
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -92,8 +93,11 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_
 
                     long_read_info.total_num_bases += base_count;
                     long_read_info.total_n_cnt += n_count;
-                    read_gc_cnt = 100.0 * gc_count / (double)base_count;
-                    long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1;
+
+                    // Update the per-read GC content distribution
+                    double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count);
+                    int gc_content_int = static_cast<int>(std::round(gc_content_pct));
+                    long_read_info.read_gc_content_count[gc_content_int] += 1;
 
                     // Remove the newline character from the sequence data
                     size_t pos = sequence_data_str.find_first_of("\r\n");
@@ -168,10 +172,10 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_
                 long_read_info.read_length_count[(int)base_count] += 1;
             }
 
-            long_read_info.total_num_bases += base_count;
-            long_read_info.total_n_cnt += n_count;
-            read_gc_cnt = 100.0 * gc_count / (double)base_count;
-            long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1;
+            // Update the per-read GC content distribution
+            double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count);
+            int gc_content_int = static_cast<int>(std::round(gc_content_pct));
+            long_read_info.read_gc_content_count[gc_content_int] += 1;
 
             // Remove the newline character from the sequence data
             size_t pos = sequence_data_str.find_first_of("\r\n");
diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp
index c45a79d..e16dadd 100644
--- a/src/fastq_module.cpp
+++ b/src/fastq_module.cpp
@@ -1,15 +1,19 @@
+#include "fastq_module.h"
+
+#include <ctype.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <ctype.h>
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <iostream>
+#include <algorithm>  // std::sort
+#include <cmath>  // std::round
+
 #include <fstream>
-#include <algorithm>
+#include <iostream>
 
-#include "fastq_module.h"
+#include <sys/stat.h>
+#include <sys/types.h>
 
+#include "utils.h"
 
 int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &output_data, FILE *read_details_fp)
 {
@@ -83,14 +87,33 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                         long_read_info.total_tu_cnt += 1;
                     }
                     base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset;
-                    seq_quality_info.base_quality_distribution[base_quality_value] += 1;
+                    try {
+                        seq_quality_info.base_quality_distribution[base_quality_value] += 1;
+                    } catch (const std::out_of_range& oor) {
+                        printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value");
+                    }
                     read_mean_base_qual += (double) base_quality_value;
                 }
-                read_gc_cnt = 100.0 * read_gc_cnt / (double)read_len;
-                long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1;
-                read_mean_base_qual /= (double) read_len;
-                seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1;
-                fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, read_gc_cnt, read_mean_base_qual);
+
+                // Update the per-read GC content distribution
+                double gc_content_pct = (100.0 * read_gc_cnt) / static_cast<double>(read_len);
+                int gc_content_int = static_cast<int>(std::round(gc_content_pct));
+                try {
+                    long_read_info.read_gc_content_count[gc_content_int] += 1;
+                } catch (const std::out_of_range& oor) {
+                    printError("Warning: Invalid GC content value " + std::to_string(gc_content_int));
+                }
+                
+                // Update the per-read base quality distribution
+                double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len);
+                unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct));
+                try {
+                    seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
+                } catch (const std::out_of_range& oor) {
+                    printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
+                }
+
+                fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual);  // Write to file
             }
         }
         input_file_stream.close();
@@ -140,10 +163,7 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data)
     output_data.long_read_info.NXX_read_length.resize(101, 0);
     // NXX_read_length[50] means N50 read length; NXX_read_length[95] means N95 read length;
 
-    //output_data.seq_quality_info.base_quality_distribution.resize(256, 0);
-    // base_quality_distribution[x] means number of bases that quality = x.
-
-    output_data.seq_quality_info.read_average_base_quality_distribution.resize(256, 0);
+    output_data.seq_quality_info.read_average_base_quality_distribution.resize(MAX_BASE_QUALITY, 0);
 
     if (_input_data.user_defined_fastq_base_qual_offset > 0) {
         fastq_base_qual_offset = _input_data.user_defined_fastq_base_qual_offset;
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 31e9ed9..6b93974 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -35,16 +35,16 @@ HTSReader::~HTSReader(){
 }
 
 // Update read and base counts
-int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics *basic_qc, uint64_t *base_quality_distribution){
+int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution) {
     int exit_code = 0;
 
     // Update the total number of reads
-    basic_qc->total_num_reads++;
+    basic_qc.total_num_reads++;
 
     // Update read length statistics
     int read_length = (int) record->core.l_qseq;
-    basic_qc->total_num_bases += (uint64_t) read_length;  // Update the total number of bases
-    basic_qc->read_lengths.push_back(read_length);
+    basic_qc.total_num_bases += (uint64_t) read_length;  // Update the total number of bases
+    basic_qc.read_lengths.push_back(read_length);
 
     // Loop and count the number of each base
     uint8_t *seq = bam_get_seq(record);
@@ -57,19 +57,19 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics *bas
         char base = seq_nt16_str[bam_seqi(seq, i)];
         switch (base) {
             case 'A':
-                basic_qc->total_a_cnt++;
+                basic_qc.total_a_cnt++;
                 break;
             case 'C':
-                basic_qc->total_c_cnt++;
+                basic_qc.total_c_cnt++;
                 break;
             case 'G':
-                basic_qc->total_g_cnt++;
+                basic_qc.total_g_cnt++;
                 break;
             case 'T':
-                basic_qc->total_tu_cnt++;
+                basic_qc.total_tu_cnt++;
                 break;
             case 'N':
-                basic_qc->total_n_cnt++;
+                basic_qc.total_n_cnt++;
                 std::cerr << "Warning: N base found in read " << bam_get_qname(record) << std::endl;
                 break;
             default:
@@ -195,14 +195,17 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
 
         // Determine if this is an unmapped read
         if (record->core.flag & BAM_FUNMAP) {
-            Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info;
+            Basic_Seq_Statistics& basic_qc = output_data.unmapped_long_read_info;
+            // Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info;
 
             // Update read and base QC
             this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution);
 
         } else {
             // Set up the basic QC object
-            Basic_Seq_Statistics *basic_qc = &output_data.mapped_long_read_info;
+            // Basic_Seq_Statistics *basic_qc =
+            // &output_data.mapped_long_read_info;
+            Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info;
 
             // Calculate base alignment statistics on non-secondary alignments
             if (!(record->core.flag & BAM_FSECONDARY)) {
@@ -323,10 +326,8 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
                 this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution);
 
                 // Calculate the percent GC content
-                int percent_gc = round((basic_qc->total_g_cnt + basic_qc->total_c_cnt) / (double) (basic_qc->total_a_cnt + basic_qc->total_c_cnt + basic_qc->total_g_cnt + basic_qc->total_tu_cnt) * 100);
-
-                // Update the GC content histogram
-                basic_qc->read_gc_content_count.push_back(percent_gc);
+                int percent_gc = round((basic_qc.total_g_cnt + basic_qc.total_c_cnt) / (double) (basic_qc.total_a_cnt + basic_qc.total_c_cnt + basic_qc.total_g_cnt + basic_qc.total_tu_cnt) * 100);
+                basic_qc.read_gc_content_count[percent_gc]++;  // Update the GC content distribution
 
             } else {
                 std::cerr << "Error: Unknown alignment type" << std::endl;
diff --git a/src/output_data.cpp b/src/output_data.cpp
index cb89804..c2e58d0 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -3,6 +3,7 @@
 #include <math.h>  // sqrt
 #include <iostream>
 #include <sstream>
+#include <cmath>  // std::round
 
 #include "output_data.h"
 #include "utils.h"
@@ -84,9 +85,9 @@ void Basic_Seq_Statistics::add(Basic_Seq_Statistics& basic_qc){
         this->read_lengths.insert(this->read_lengths.end(), basic_qc.read_lengths.begin(), basic_qc.read_lengths.end());
     }
 
-    // Add GC content if not empty
-    if (!basic_qc.read_gc_content_count.empty()) {
-        this->read_gc_content_count.insert(this->read_gc_content_count.end(), basic_qc.read_gc_content_count.begin(), basic_qc.read_gc_content_count.end());
+    // Update the per-read GC content distribution
+    for (int i = 0; i < 101; i++) {
+        this->read_gc_content_count[i] += basic_qc.read_gc_content_count[i];
     }
 }
 
@@ -190,7 +191,6 @@ Basic_Seq_Quality_Statistics::Basic_Seq_Quality_Statistics(){
     pos_quality_distribution.resize(MAX_READ_LENGTH, ZeroDefault);
     pos_quality_distribution_dev.resize(MAX_READ_LENGTH, ZeroDefault);
     pos_quality_distribution_count.resize(MAX_READ_LENGTH, ZeroDefault);
-
     read_average_base_quality_distribution.resize(MAX_READ_QUALITY, ZeroDefault);
     read_quality_distribution.resize(MAX_READ_QUALITY, ZeroDefault);
 }
@@ -544,7 +544,6 @@ void Output_FAST5::addReadBaseSignals(Base_Signals values){
 void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_fp)
 {
     const char * read_name;
-    double gc_content_pct;
 
     // Access the read name
     std::string header_str = fq[0];
@@ -601,21 +600,38 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
         }
         // Get the base quality
         base_quality_value = (uint64_t)base_quality_values[i];
-        seq_quality_info.base_quality_distribution[base_quality_value] += 1;
+        try {
+            seq_quality_info.base_quality_distribution[base_quality_value] += 1;
+        } catch (const std::out_of_range& oor) {
+            printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value");
+        }
         read_mean_base_qual += (double)base_quality_value;
     }
 
     // Calculate percent guanine & cytosine
-    gc_content_pct = 100.0 *( (double)gc_count / (double)base_count );
+    // gc_content_pct = 100.0 *( (double)gc_count / (double)base_count );
+
+    // Update the per-read GC content distribution
+    double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count);
+    int gc_content_int = static_cast<int>(std::round(gc_content_pct));
+    try {
+        long_read_info.read_gc_content_count[gc_content_int] += 1;
+    } catch (const std::out_of_range& oor) {
+        printError("Warning: Invalid GC content value " + std::to_string(gc_content_int));
+    }
+
+    // Update the per-read base quality distribution
+    double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count);
+    unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct));
+    try {
+        seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
+    } catch (const std::out_of_range& oor) {
+        printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
+    }
 
-    // Look into this section
-    long_read_info.read_gc_content_count[(int)(gc_content_pct + 0.5)] += 1;
-    read_mean_base_qual /= (double) base_count;
-    seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1;
-    fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual);
+    fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual);  // Write to file
 
-    // Update the total number of reads
-    long_read_info.total_num_reads += 1;
+    long_read_info.total_num_reads += 1;  // Update read count
 }
 
 // Get the read count
diff --git a/src/plot_utils.py b/src/plot_utils.py
index ac03046..dd911ae 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -339,17 +339,33 @@ def read_lengths_histogram(data, font_size):
 
 def read_gc_content_histogram(data, font_size):
     """Plot the per-read GC content histogram."""
+    bin_size = 1
 
-    # Get the GC content data
+    # Bin the GC content if the bin size is greater than 1
     gc_content = np.array(data.read_gc_content_count)
-    
-    # Create a histogram of the GC content (0-100% with 1% bins)
-    gc_content_bins = np.linspace(0, 100, 101)
-    gc_hist, _ = np.histogram(gc_content, bins=gc_content_bins)
+    if bin_size > 1:
+        gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)])
+
+    gc_content_bins = [i for i in range(0, 101, bin_size)]
+
+    # Generate hover text for each bin
+    hover_text = []
+    if bin_size > 1:
+        for i in range(len(gc_content_bins)):
+            hover_text.append('GC content: {}-{}%<br>Counts: {}'.format(gc_content_bins[i], gc_content_bins[i] + bin_size, gc_content[i]))
+    else:
+        for i in range(len(gc_content_bins)):
+            hover_text.append('GC content: {}%<br>Counts: {}'.format(gc_content_bins[i], gc_content[i]))
+
+    # Set the X values to be the center of the bins
+    if bin_size > 1:
+        x_values = [gc_content_bins[i] + bin_size / 2 for i in range(len(gc_content_bins))]
+    else:
+        x_values = gc_content_bins
 
     # Create the figure
     fig = go.Figure()
-    fig.add_trace(go.Bar(x=gc_content_bins, y=gc_hist, marker_color='#36a5c7'))
+    fig.add_trace(go.Bar(x=x_values, y=gc_content, marker_color='#36a5c7', hovertext=hover_text, hoverinfo='text'))
 
     # Update the layout
     fig.update_xaxes(ticks="outside", dtick=10, title_text='GC Content (%)', title_standoff=0)

From 9dce02531b952a2127264ca3e77f3e7b7d1b11f2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 2 Jan 2025 16:58:42 -0500
Subject: [PATCH 04/25] Fix per-read gc content error

---
 include/hts_reader.h |  2 +-
 src/hts_reader.cpp   | 61 ++++++++++++++++++++++----------------------
 src/plot_utils.py    |  6 +++++
 3 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/include/hts_reader.h b/include/hts_reader.h
index 79332c6..00900b2 100644
--- a/include/hts_reader.h
+++ b/include/hts_reader.h
@@ -38,7 +38,7 @@ class HTSReader {
         bool reading_complete = false;
 
         // Update read and base counts
-        int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution);
+        int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution, bool is_primary);
 
         // Read the next batch of records from the BAM file
         int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold);
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 6b93974..0cd1a10 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -35,18 +35,17 @@ HTSReader::~HTSReader(){
 }
 
 // Update read and base counts
-int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution) {
-    int exit_code = 0;
-
-    // Update the total number of reads
-    basic_qc.total_num_reads++;
+int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution, bool is_primary) {
 
-    // Update read length statistics
+    // Update read QC
+    basic_qc.total_num_reads++;  // Update the total number of reads
     int read_length = (int) record->core.l_qseq;
     basic_qc.total_num_bases += (uint64_t) read_length;  // Update the total number of bases
     basic_qc.read_lengths.push_back(read_length);
 
-    // Loop and count the number of each base
+    // Get base counts, quality, and GC content
+    double read_gc_count = 0.0;  // For GC content calculation
+    double read_base_total = 0.0;  // For GC content calculation
     uint8_t *seq = bam_get_seq(record);
     for (int i = 0; i < read_length; i++) {
         // Get the base quality and update the base quality histogram
@@ -58,28 +57,42 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& bas
         switch (base) {
             case 'A':
                 basic_qc.total_a_cnt++;
+                read_base_total++;
                 break;
             case 'C':
                 basic_qc.total_c_cnt++;
+                read_gc_count++;
+                read_base_total++;
                 break;
             case 'G':
                 basic_qc.total_g_cnt++;
+                read_gc_count++;
+                read_base_total++;
                 break;
             case 'T':
                 basic_qc.total_tu_cnt++;
+                read_base_total++;
                 break;
             case 'N':
                 basic_qc.total_n_cnt++;
                 std::cerr << "Warning: N base found in read " << bam_get_qname(record) << std::endl;
                 break;
             default:
-                std::cerr << "Error reading nucleotide: " << base << std::endl;
-                exit_code = 1;
+                printError("Invalid base: " + std::to_string(base));
                 break;
         }
     }
 
-    return exit_code;
+    // Calculate the read GC content percentage if a primary alignment
+    if (is_primary) {
+        double gc_content = read_gc_count / read_base_total;
+        int gc_content_percent = (int) round(gc_content * 100);
+        std::string query_name = bam_get_qname(record);
+        // printMessage("Read name: " + query_name + ", GC content: " + std::to_string(gc_content) + ", GC count: " + std::to_string(read_gc_count) + ", Total count: " + std::to_string(read_base_total));
+        basic_qc.read_gc_content_count[gc_content_percent]++;
+    }
+
+    return 0;
 }
 
 // Read the next batch of records from the BAM file and store QC in the output_data object
@@ -193,21 +206,14 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
             output_data.addReadMoveTable(query_name, seq_str, signal_index_vector, ts, ns);
         }
 
-        // Determine if this is an unmapped read
+        // Unmapped reads
         if (record->core.flag & BAM_FUNMAP) {
             Basic_Seq_Statistics& basic_qc = output_data.unmapped_long_read_info;
-            // Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info;
-
-            // Update read and base QC
-            this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution);
+            this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, false);
 
         } else {
-            // Set up the basic QC object
-            // Basic_Seq_Statistics *basic_qc =
-            // &output_data.mapped_long_read_info;
-            Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info;
-
             // Calculate base alignment statistics on non-secondary alignments
+            Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info;
             if (!(record->core.flag & BAM_FSECONDARY)) {
 
                 // Determine if this is a forward or reverse read
@@ -261,7 +267,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
                 output_data.num_mismatched_bases += num_mismatches;
             }
 
-            // Determine if this is a secondary alignment (not included in QC, only read count)
+            // Secondary alignment (not included in QC, only read count)
             if (record->core.flag & BAM_FSECONDARY) {
                 output_data.num_secondary_alignment++;
 
@@ -271,7 +277,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
                 // Update the read's secondary alignments (count once per read)
                 output_data.reads_with_secondary[query_name] = true;
 
-            // Determine if this is a supplementary alignment (not included in QC, only read count)
+            // Supplementary alignment (not included in QC, only read count)
             } else if (record->core.flag & BAM_FSUPPLEMENTARY) {
                 output_data.num_supplementary_alignment++;
 
@@ -281,7 +287,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
                 // Update the read's supplementary alignments (count once per read)
                 output_data.reads_with_supplementary[query_name] = true;
 
-            // Determine if this is a primary alignment
+            // Primary alignment
             } else if (!(record->core.flag & BAM_FSECONDARY || record->core.flag & BAM_FSUPPLEMENTARY)) {
                 output_data.num_primary_alignment++;  // Update the number of primary alignments
 
@@ -323,15 +329,10 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
                 }
 
                 // Update read and base QC
-                this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution);
-
-                // Calculate the percent GC content
-                int percent_gc = round((basic_qc.total_g_cnt + basic_qc.total_c_cnt) / (double) (basic_qc.total_a_cnt + basic_qc.total_c_cnt + basic_qc.total_g_cnt + basic_qc.total_tu_cnt) * 100);
-                basic_qc.read_gc_content_count[percent_gc]++;  // Update the GC content distribution
+                this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, true);
 
             } else {
-                std::cerr << "Error: Unknown alignment type" << std::endl;
-                std::cerr << "Flag: " << record->core.flag << std::endl;
+                printError("Error: Unknown alignment type with flag " + std::to_string(record->core.flag));
             }
         }
 
diff --git a/src/plot_utils.py b/src/plot_utils.py
index dd911ae..8adb1ee 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -346,6 +346,12 @@ def read_gc_content_histogram(data, font_size):
     if bin_size > 1:
         gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)])
 
+    # # Print the GC content if count > 0
+    # logging.info("[HIST] GC content values:")
+    # for i in range(len(gc_content)):
+    #     if gc_content[i] > 0:
+    #         logging.info("{}-{}%: {}".format(i * bin_size, i * bin_size + bin_size, gc_content[i]))
+
     gc_content_bins = [i for i in range(0, 101, bin_size)]
 
     # Generate hover text for each bin

From 19136048e063e684733b0c5c084cea3828ceb2d6 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 4 Jan 2025 13:57:30 -0500
Subject: [PATCH 05/25] Work on read length vs base mod rate

---
 include/output_data.h | 193 +++++++++++++++++++++++-------------------
 src/fasta_module.cpp  |   2 +
 src/hts_reader.cpp    |  61 ++++++++-----
 src/output_data.cpp   |  24 ++++++
 4 files changed, 170 insertions(+), 110 deletions(-)

diff --git a/include/output_data.h b/include/output_data.h
index 53d30bf..0651341 100644
--- a/include/output_data.h
+++ b/include/output_data.h
@@ -14,6 +14,7 @@ Define the output structures for each module.
 
 #include "input_parameters.h"
 #include "tin_stats.h"
+#include "utils.h"
 
 #define MAX_READ_LENGTH 10485760
 #define MAX_BASE_QUALITY 100
@@ -114,7 +115,7 @@ class Output_FQ : public Output_FA
 
 // Define the base modification data structure (modification type, canonical
 // base, likelihood, strand: 0 for forward, 1 for reverse, and CpG flag: T/F)
-using Base_Modification = std::tuple<char, char, double, int, bool>;
+// using Base_Modification = std::tuple<char, char, double, int, bool>;
 
 // Define the signal-level data structure for POD5 (ts, ns, move table vector)
 using POD5_Signal_Data = std::tuple<int32_t, int32_t, std::vector<int32_t>>;
@@ -159,92 +160,110 @@ class Base_Move_Table
 // BAM output
 class Output_BAM : public Output_FQ
 {
-public:
-   uint64_t num_primary_alignment = ZeroDefault;                                 // the number of primary alignment/
-   uint64_t num_secondary_alignment = ZeroDefault;                               // the number of secondary alignment
-   uint64_t num_reads_with_secondary_alignment = ZeroDefault;                    // the number of long reads with the secondary alignment: one read might have multiple seconard alignment
-   uint64_t num_supplementary_alignment = ZeroDefault;                           // the number of supplementary alignment
-   uint64_t num_reads_with_supplementary_alignment = ZeroDefault;                // the number of long reads with secondary alignment;
-   uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment.
-   uint64_t forward_alignment = ZeroDefault;  // Total number of forward alignments
-   uint64_t reverse_alignment = ZeroDefault;  // Total number of reverse alignments
-   std::map<std::string, bool> reads_with_supplementary;  // Map of reads with supplementary alignments
-   std::map<std::string, bool> reads_with_secondary;  // Map of reads with secondary alignments
-
-   // Similar to Output_FA: below are for mapped.
-   uint64_t num_matched_bases = ZeroDefault;    // the number of matched bases with =
-   uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X
-   uint64_t num_ins_bases = ZeroDefault;        // the number of inserted bases;
-   uint64_t num_del_bases = ZeroDefault;        // the number of deleted bases;
-   uint64_t num_clip_bases = ZeroDefault;       // the number of soft-clipped bases;
-
-   // The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators
-   int num_columns = ZeroDefault; // the number of columns
-   double percent_identity = ZeroDefault;  // Percent identity = (num columns - NM) / num columns
-   std::vector<int> accuracy_per_read;
-
-   // Preprint revisions: Remove all counts with unique positions in the
-   // reference genome, and only report raw counts
-   uint64_t modified_prediction_count = ZeroDefault;  // Total number of modified base predictions
-   uint64_t sample_modified_base_count = ZeroDefault;  // Total number of modified bases passing the threshold
-   uint64_t sample_modified_base_count_forward = ZeroDefault;  // Total number of modified bases passing the threshold on the forward strand
-   uint64_t sample_modified_base_count_reverse = ZeroDefault;  // Total number of modified bases passing the threshold on the reverse strand
-   uint64_t sample_cpg_forward_count = ZeroDefault;  // Total number of modified bases passing the threshold that are in CpG sites and in the forward strand (non-unique)
-   uint64_t sample_cpg_reverse_count = ZeroDefault;  // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique)
-   std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions;  // chr -> vector of (position, strand) for modified bases passing the threshold
-
-    // Signal data section
-   int read_count = ZeroDefault;
-   int base_count = ZeroDefault;
-   std::unordered_map<std::string, Base_Move_Table> read_move_table;
-
-   // POD5 signal-level information is stored in a map of read names to a map of
-   // reference positions to a tuple of (ts, ns, move table vector)
-   std::unordered_map<std::string, POD5_Signal_Data> pod5_signal_data;
-
-   // Dictionary of bam filepath to TIN data
-   std::unordered_map<std::string, TINStats> tin_data;
-
-   Basic_Seq_Statistics mapped_long_read_info;
-   Basic_Seq_Statistics unmapped_long_read_info;
-
-   Basic_Seq_Quality_Statistics mapped_seq_quality_info;
-   Basic_Seq_Quality_Statistics unmapped_seq_quality_info;
-
-   // POD5 signal data functions
-   int getReadCount();
-   void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end);
-   std::vector<int> getReadMoveTable(std::string read_id);
-   std::string getReadSequence(std::string read_id);
-   int getReadSequenceStart(std::string read_id);
-   int getReadSequenceEnd(std::string read_id);
-
-   // Add a batch of records to the output
-   void add(Output_BAM &t_output_bam);
-
-   // Add TIN data for a single BAM file
-   void addTINData(std::string &bam_file, TINStats &tin_data);
-
-   // Get the TIN mean for a single BAM file
-   double getTINMean(std::string bam_file);
-
-   // Get the TIN median for a single BAM file
-   double getTINMedian(std::string bam_file);
-
-   // Get the TIN standard deviation for a single BAM file
-   double getTINStdDev(std::string bam_file);
-
-   // Get the TIN count for a single BAM file
-   int getTINCount(std::string bam_file);
-
-   // Calculate QC across all records
-   void global_sum();
-
-   // Save the output to a summary text file
-   void save_summary(std::string &output_file, Input_Para &params, Output_BAM &output_data);
-
-   Output_BAM();
-   ~Output_BAM();
+   public:
+      uint64_t num_primary_alignment = ZeroDefault;                                 // the number of primary alignments
+      uint64_t num_secondary_alignment = ZeroDefault;                               // the number of secondary alignment
+      uint64_t num_reads_with_secondary_alignment = ZeroDefault;                    // the number of long reads with the secondary alignment: one read might have multiple secondary alignments
+      uint64_t num_supplementary_alignment = ZeroDefault;                           // the number of supplementary alignment
+      uint64_t num_reads_with_supplementary_alignment = ZeroDefault;                // the number of long reads with supplementary alignments
+      uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment.
+      uint64_t forward_alignment = ZeroDefault;  // Total number of forward alignments
+      uint64_t reverse_alignment = ZeroDefault;  // Total number of reverse alignments
+      std::map<std::string, bool> reads_with_supplementary;  // Map of reads with supplementary alignments
+      std::map<std::string, bool> reads_with_secondary;  // Map of reads with secondary alignments
+
+      // Similar to Output_FA: below are for mapped.
+      uint64_t num_matched_bases = ZeroDefault;    // the number of matched bases with =
+      uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X
+      uint64_t num_ins_bases = ZeroDefault;        // the number of inserted bases;
+      uint64_t num_del_bases = ZeroDefault;        // the number of deleted bases;
+      uint64_t num_clip_bases = ZeroDefault;       // the number of soft-clipped bases;
+
+      // The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators
+      int num_columns = ZeroDefault; // the number of columns
+      double percent_identity = ZeroDefault;  // Percent identity = (num columns - NM) / num columns
+      std::vector<int> accuracy_per_read;
+
+      // Preprint revisions: Remove all counts with unique positions in the
+      // reference genome, and only report raw counts
+      uint64_t modified_prediction_count = ZeroDefault;  // Total number of modified base predictions
+      uint64_t sample_modified_base_count = ZeroDefault;  // Total number of modified bases passing the threshold
+      uint64_t sample_modified_base_count_forward = ZeroDefault;  // Total number of modified bases passing the threshold on the forward strand
+      uint64_t sample_modified_base_count_reverse = ZeroDefault;  // Total number of modified bases passing the threshold on the reverse strand
+      uint64_t sample_cpg_forward_count = ZeroDefault;  // Total number of modified bases passing the threshold that are in CpG sites and in the forward strand (non-unique)
+      uint64_t sample_cpg_reverse_count = ZeroDefault;  // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique)
+      std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions;  // chr -> vector of (position, strand) for modified bases passing the threshold
+
+      // Further revisions
+      // Structures for storing read length vs. base modification rate data
+      struct ReadModData
+      {
+         int read_length;
+         double mod_rate;
+         std::unordered_map<char, double> base_mod_rates;  // Type-specific base modification rates
+      };
+      std::vector<ReadModData> read_mod_data;  // Read length vs. base modification rate
+
+      // std::pair<std::vector<int>, std::vector<double>> read_length_mod_rate;  // Read length vs. base modification rate
+      // std::unordered_map<char, std::pair<std::vector<int>, std::vector<double>>> read_length_mod_rate;  // Read length vs. base modification rate for each base modification type
+      std::unordered_map<char, uint64_t> base_mod_counts;  // Counts for each base modification type exceeding the threshold
+      std::unordered_map<char, uint64_t> base_mod_counts_forward;  // Counts for each base modification type exceeding the threshold on the forward strand
+      std::unordered_map<char, uint64_t> base_mod_counts_reverse;  // Counts for each base modification type exceeding the threshold on the reverse strand
+
+      // Signal data section
+      int read_count = ZeroDefault;
+      int base_count = ZeroDefault;
+      std::unordered_map<std::string, Base_Move_Table> read_move_table;
+
+      // POD5 signal-level information is stored in a map of read names to a map of
+      // reference positions to a tuple of (ts, ns, move table vector)
+      std::unordered_map<std::string, POD5_Signal_Data> pod5_signal_data;
+
+      std::unordered_map<std::string, TINStats> tin_data;  // TIN data for each BAM file
+
+      Basic_Seq_Statistics mapped_long_read_info;
+      Basic_Seq_Statistics unmapped_long_read_info;
+
+      Basic_Seq_Quality_Statistics mapped_seq_quality_info;
+      Basic_Seq_Quality_Statistics unmapped_seq_quality_info;
+
+      // POD5 signal data functions
+      int getReadCount();
+      void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end);
+      std::vector<int> getReadMoveTable(std::string read_id);
+      std::string getReadSequence(std::string read_id);
+      int getReadSequenceStart(std::string read_id);
+      int getReadSequenceEnd(std::string read_id);
+
+      void updateBaseModCounts(char mod_type, int strand);  // Update base modification counts for predictions exceeding the threshold
+      void updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates);  // Update read length vs. base modification rate data
+
+      // Add TIN data for a single BAM file
+      void addTINData(std::string &bam_file, TINStats &tin_data);
+
+      // TIN mean for a single BAM file
+      double getTINMean(std::string bam_file);  // Get the TIN mean for a single BAM file
+
+      // TIN median for a single BAM file
+      double getTINMedian(std::string bam_file);
+
+      // TIN standard deviation for a single BAM file
+      double getTINStdDev(std::string bam_file);
+
+      // TIN count for a single BAM file
+      int getTINCount(std::string bam_file);
+
+      // Add a batch of records to the output
+      void add(Output_BAM &t_output_bam);
+
+      // Calculate QC across all records
+      void global_sum();
+
+      // Save the output to a summary text file
+      void save_summary(std::string &output_file, Input_Para &params, Output_BAM &output_data);
+
+      Output_BAM();
+      ~Output_BAM();
 };
 
 
diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp
index 93f3e90..8b48371 100644
--- a/src/fasta_module.cpp
+++ b/src/fasta_module.cpp
@@ -172,6 +172,8 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_
                 long_read_info.read_length_count[(int)base_count] += 1;
             }
 
+            long_read_info.total_num_bases += base_count;  // Update the total number of bases
+
             // Update the per-read GC content distribution
             double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count);
             int gc_content_int = static_cast<int>(std::round(gc_content_pct));
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 0cd1a10..8056c19 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -355,8 +355,12 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
     samFile* bam_file = sam_open(bam_filename.c_str(), "r");
     bam_hdr_t* bam_header = sam_hdr_read(bam_file);
     bam1_t* bam_record = bam_init1();
-
     int64_t num_reads = 0;
+
+    // Data structure for storing read length vs. base modification rate
+    std::vector<int> read_lengths;  // Read lengths
+    std::vector<double> read_mod_rates;  // Total base modification rate for each read length
+    std::vector<std::unordered_map<char, double>> read_base_mod_rates;  // Type-specific base modification rates for each read length
     while (sam_read1(bam_file, bam_header, bam_record) >= 0) {
         num_reads++;
 
@@ -366,14 +370,13 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
             // Follow here to get base modification tags:
             // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c
             // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274
+            int read_length = bam_record->core.l_qseq;
             hts_base_mod_state *state = hts_base_mod_state_alloc();
-
-            // Preprint revisions: New data structure that does not require unique
-            // positions for each base modification
-            // chr -> vector of (position, strand) for C modified bases passing the threshold
-            std::vector<std::pair<int32_t, int>> c_modified_positions;
+            std::vector<std::pair<int32_t, int>> c_modified_positions;  // C-modified positions for CpG analysis (chr->(position, strand))
+            std::unordered_map<char, int> base_mod_counts;  // Type-specific base modification counts for the read
 
             // Parse the base modification tags if a primary alignment
+            int read_mod_count = 0;
             int ret = bam_parse_basemod(bam_record, state);
             if (ret >= 0 && !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP)) {
 
@@ -402,8 +405,12 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                 std::vector<int> query_pos;
                 while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) {
                     for (int i = 0; i < n; i++) {
-                        // Update the prediction count
-                        final_output.modified_prediction_count++;
+                        // Update the modified prediction counts
+                        read_mod_count++;  // Read-specific count
+                        final_output.modified_prediction_count++;  // Cumulative count
+                        char mod_type = mods[i].modified_base;
+                        base_mod_counts[mod_type]++;  // Update the type-specific count
+
 
                         // Note: The modified base value can be a positive char (e.g. 'm',
                         // 'h') (DNA Mods DB) or negative integer (ChEBI ID):
@@ -419,22 +426,13 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                         if (mods[i].qual != -1) {
                             probability = mods[i].qual / 256.0;
 
-                            // If the probability is greater than the threshold,
-                            // update the count
+                            // Update counts for predictions exceeding the threshold
                             if (probability >= base_mod_threshold) {
-                                final_output.sample_modified_base_count++;
-
-                                // Update the modified base count for the strand
-                                if (strand == 0) {
-                                    final_output.sample_modified_base_count_forward++;
-                                } else {
-                                    final_output.sample_modified_base_count_reverse++;
-                                }
+                                final_output.updateBaseModCounts(mod_type, strand);  // Update the base modification counts
 
-                                // Preprint revisions: Store the modified positions
+                                // Store the modified positions for later CpG analysis
                                 char canonical_base_char = std::toupper(mods[i].canonical_base);
-                                char mod_type = mods[i].modified_base;
-                                if (canonical_base_char == 'C' && mod_type == 'm') {
+                                if (canonical_base_char == 'C' && mod_type != 'C') {
 
                                     // Convert the query position to reference position if available
                                     if (alignments_present) {
@@ -465,9 +463,26 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                     }
                 }
             }
+            hts_base_mod_state_free(state);  // Deallocate the base modification state object
+
+            // Calculate the base modification rate for the read
+            double read_mod_rate = 0.0;
+            if (read_length > 0) {
+                read_mod_rate = (double) read_mod_count / read_length;
+            }
 
-            // Deallocate the state object
-            hts_base_mod_state_free(state);
+            // Calculate the type-specific base modification rates for the read
+            std::unordered_map<char, double> base_mod_rates;
+            for (auto const &it : base_mod_counts) {
+                char mod_type = it.first;
+                int mod_count = it.second;
+                double mod_rate = 0.0;
+                if (read_length > 0) {
+                    mod_rate = (double) mod_count / read_length;
+                }
+                base_mod_rates[mod_type] = mod_rate;
+            }
+            final_output.updateReadModRate(read_length, read_mod_rate, base_mod_rates);  // Update the output data
         }
     }
 
diff --git a/src/output_data.cpp b/src/output_data.cpp
index c2e58d0..7405a7c 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -262,6 +262,30 @@ Output_BAM::Output_BAM(){
 Output_BAM::~Output_BAM(){
 }
 
+void Output_BAM::updateBaseModCounts(char mod_type, int strand)
+{
+    // Update the sample modified base count for predictions exceeding the threshold
+    this->sample_modified_base_count++;
+    this->base_mod_counts[mod_type]++;  // Update the type-specific modified base count
+
+    // Update the modified base count for the strand
+    if (strand == 0) {
+        this->sample_modified_base_count_forward++;
+        this->base_mod_counts_forward[mod_type]++;  // Update the type-specific modified base count
+    } else {
+        this->sample_modified_base_count_reverse++;
+        this->base_mod_counts_reverse[mod_type]++;  // Update the type-specific modified base count
+    }
+}
+
+void Output_BAM::updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates) {
+    ReadModData read_mod_data;
+    read_mod_data.read_length = read_length;
+    read_mod_data.mod_rate = read_mod_rate;
+    read_mod_data.base_mod_rates = base_mod_rates;
+    this->read_mod_data.push_back(read_mod_data);
+}
+
 int Output_BAM::getReadCount()
 {
     return this->read_move_table.size();

From 345528210a18309d690caeca9300e7a0f536a3ce Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 4 Jan 2025 17:06:52 -0500
Subject: [PATCH 06/25] Add base mod plots

---
 README.md             |   2 +-
 include/output_data.h |  27 ++++++----
 src/cli.py            |   6 ++-
 src/lrst.i            |  40 +--------------
 src/output_data.cpp   |  55 ++++++++++++++++++++
 src/plot_utils.py     | 115 +++++++++++++++++++++++++++++++-----------
 6 files changed, 165 insertions(+), 80 deletions(-)

diff --git a/README.md b/README.md
index e5b4bed..1ce0df6 100644
--- a/README.md
+++ b/README.md
@@ -148,7 +148,7 @@ MinION R9.4.1 from https://labs.epi2me.io/gm24385-5mc/)
 
 ## General usage
 ```
-longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --ref $REF_GENOME --modprob 0.8
+longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --mod --modprob 0.8 --ref $REF_GENOME
 ```
 
 # RRMS BAM
diff --git a/include/output_data.h b/include/output_data.h
index 0651341..06608d6 100644
--- a/include/output_data.h
+++ b/include/output_data.h
@@ -157,6 +157,14 @@ class Base_Move_Table
 };
 
 
+// Structures for storing read length vs. base modification rate data
+struct ReadModData
+{
+   int read_length;
+   double mod_rate;
+   std::unordered_map<char, double> base_mod_rates;  // Type-specific base modification rates
+};
+
 // BAM output
 class Output_BAM : public Output_FQ
 {
@@ -194,16 +202,6 @@ class Output_BAM : public Output_FQ
       uint64_t sample_cpg_reverse_count = ZeroDefault;  // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique)
       std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions;  // chr -> vector of (position, strand) for modified bases passing the threshold
 
-      // Further revisions
-      // Structures for storing read length vs. base modification rate data
-      struct ReadModData
-      {
-         int read_length;
-         double mod_rate;
-         std::unordered_map<char, double> base_mod_rates;  // Type-specific base modification rates
-      };
-      std::vector<ReadModData> read_mod_data;  // Read length vs. base modification rate
-
       // std::pair<std::vector<int>, std::vector<double>> read_length_mod_rate;  // Read length vs. base modification rate
       // std::unordered_map<char, std::pair<std::vector<int>, std::vector<double>>> read_length_mod_rate;  // Read length vs. base modification rate for each base modification type
       std::unordered_map<char, uint64_t> base_mod_counts;  // Counts for each base modification type exceeding the threshold
@@ -227,6 +225,15 @@ class Output_BAM : public Output_FQ
       Basic_Seq_Quality_Statistics mapped_seq_quality_info;
       Basic_Seq_Quality_Statistics unmapped_seq_quality_info;
 
+      std::vector<ReadModData> read_mod_data;  // Read length vs. base modification rate
+      std::vector<char> getBaseModTypes();  // Get the types of base modifications found
+      int getReadModDataSize();  // Get the number of read length vs. base modification rate data points
+      int getNthReadModLength(int read_index);  // Get the read length for the nth read
+      double getNthReadModRate(int read_index);  // Get the base modification rate for the nth read
+      double getNthReadModRate(int read_index, char mod_type);  // Get the base modification rate for the nth read for a specific base modification type
+      uint64_t getModTypeCount(char mod_type);  // Get the count of a specific base modification type
+      uint64_t getModTypeCount(char mod_type, int strand);  // Get the count of a specific base modification type for a specific strand
+
       // POD5 signal data functions
       int getReadCount();
       void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end);
diff --git a/src/cli.py b/src/cli.py
index 29549f2..59b8a9c 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -97,10 +97,11 @@ def get_common_param(margs):
         # Set up logging to stdout
         logging.basicConfig(stream=sys.stdout,
                             level=get_log_level(margs.log_level),
-                            format="%(asctime)s [%(levelname)s] %(message)s")
+                            format="%(asctime)s %(message)s")
+                            # format="%(asctime)s [%(levelname)s] %(message)s")
     else:
         logging.basicConfig(level=get_log_level(margs.log_level),
-                            format="%(asctime)s [%(levelname)s] %(message)s",
+                            format="%(asctime)s %(message)s",
                             handlers=[
                                 logging.FileHandler(margs.log),
                                 logging.StreamHandler(sys.stdout)
@@ -250,6 +251,7 @@ def bam_module(margs):
             # If base modifications were found, add the base modification plots
             # after the first table
             if bam_output.sample_modified_base_count > 0:
+                qc_info_list.insert(1, "read_length_mod_rates")  # Read length modification rates
                 qc_info_list.insert(1, "base_mods")
 
             # If gene BED file was provided, add the TIN plots
diff --git a/src/lrst.i b/src/lrst.i
index 9def020..fc93d0e 100644
--- a/src/lrst.i
+++ b/src/lrst.i
@@ -36,41 +36,6 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules
     $result = list;
 }
 
-// Map std::map<int32_t, std::map<char, std::tuple<char, double>>> to Python
-// dictionary
-// %typemap(out) std::map<int32_t, std::map<char, std::tuple<char, double>>> {
-//     PyObject *dict = PyDict_New();
-//     for (auto const &it : $1) {
-//         PyObject *inner_dict = PyDict_New();
-//         for (auto const &inner_it : it.second) {
-//             PyObject *tuple = PyTuple_Pack(2, 
-//                                            PyUnicode_FromStringAndSize(&std::get<0>(inner_it.second), 1), 
-//                                            PyFloat_FromDouble(std::get<1>(inner_it.second)));
-//             PyDict_SetItem(inner_dict, 
-//                            PyUnicode_FromStringAndSize(&inner_it.first, 1), 
-//                            tuple);
-//         }
-//         PyDict_SetItem(dict, PyLong_FromLong(it.first), inner_dict);
-//     }
-//     $result = dict;
-// }
-
-// Map std::map<int32_t, std::tuple<char, char, double, int, bool>> to Python
-// dictionary
-// %typemap(out) std::map<int32_t, std::tuple<char, char, double, int, bool>> {
-//     PyObject *dict = PyDict_New();
-//     for (auto const &it : $1) {
-//         PyObject *tuple = PyTuple_Pack(5, 
-//                                        PyUnicode_FromStringAndSize(&std::get<0>(it.second), 1), 
-//                                        PyUnicode_FromStringAndSize(&std::get<1>(it.second), 1), 
-//                                        PyFloat_FromDouble(std::get<2>(it.second)),
-//                                        PyLong_FromLong(std::get<3>(it.second)),
-//                                        PyBool_FromLong(std::get<4>(it.second)));
-//         PyDict_SetItem(dict, PyLong_FromLong(it.first), tuple);
-//     }
-//     $result = dict;
-// }
-
 // Map std::map<std::string, std::map<int32_t, std::tuple<char, char, double,
 // int, bool>>> to Python dictionary
 %typemap(out) std::map<std::string, std::map<int32_t, std::tuple<char, char, double, int, bool>>> {
@@ -104,12 +69,11 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules
 %include <stdint.i>
 %include <std_vector.i>
 
-// Define the conversion for uint64_t arrays
-//%array_class(uint64_t, uint64Array);
-
 %template(IntVector) std::vector<int>;
 %template(DoubleVector) std::vector<double>;
 %template(Int2DVector) std::vector<std::vector<int>>;
+%template(StringVector) std::vector<std::string>;
+%template(CharVector) std::vector<char>;
 
 // These are the header functions wrapped by our lrst module (Like an 'import')
 %include "input_parameters.h"  // Contains InputPara for passing parameters to C++
diff --git a/src/output_data.cpp b/src/output_data.cpp
index 7405a7c..c92a60d 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -286,6 +286,61 @@ void Output_BAM::updateReadModRate(int read_length, double read_mod_rate, std::u
     this->read_mod_data.push_back(read_mod_data);
 }
 
+std::vector<char> Output_BAM::getBaseModTypes()
+{
+    std::vector<char> base_mod_types;
+    for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) {
+        base_mod_types.push_back(it->first);
+    }
+    return base_mod_types;
+}
+
+int Output_BAM::getReadModDataSize()
+{
+    return this->read_mod_data.size();
+}
+
+int Output_BAM::getNthReadModLength(int read_index)
+{
+    return this->read_mod_data[read_index].read_length;
+}
+
+double Output_BAM::getNthReadModRate(int read_index)
+{
+    return this->read_mod_data[read_index].mod_rate;
+}
+
+double Output_BAM::getNthReadModRate(int read_index, char mod_type)
+{
+    double mod_rate = 0.0;
+    try {
+        this->read_mod_data.at(read_index);
+    } catch (const std::out_of_range& oor) {
+        std::cerr << "Error: Read index " << read_index << " is out of range." << std::endl;
+    }
+    try {
+        mod_rate = this->read_mod_data[read_index].base_mod_rates.at(mod_type);
+    } catch (const std::out_of_range& oor) {
+        // No modification rate found for the specified type in the read
+        mod_rate = 0.0;
+    }
+    return mod_rate;
+}
+
+uint64_t Output_BAM::getModTypeCount(char mod_type)
+{
+    return this->base_mod_counts[mod_type];
+}
+
+uint64_t Output_BAM::getModTypeCount(char mod_type, int strand)
+{
+    if (strand == 0) {
+        return this->base_mod_counts_forward[mod_type];
+    } else {
+        return this->base_mod_counts_reverse[mod_type];
+    }
+}
+
 int Output_BAM::getReadCount()
 {
     return this->read_move_table.size();
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 8adb1ee..d82bffb 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -37,6 +37,8 @@ def getDefaultPlotFilenames():
         
         "gc_content_hist": {'title': "GC Content Histogram", 'description': "GC Content Histogram", 'summary': ""},
 
+        "read_length_mod_rates": {'title': "Read Length vs. Modification Rates", 'description': "Read Length vs. Modification Rates", 'summary': ""},
+
         "base_quality": {'title': "Base Quality Histogram", 'description': "Base Quality Histogram"},
 
         "read_avg_base_quality": {'title': "Read Base Quality Histogram", 'description': "Read Base Quality Histogram"},
@@ -346,12 +348,6 @@ def read_gc_content_histogram(data, font_size):
     if bin_size > 1:
         gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)])
 
-    # # Print the GC content if count > 0
-    # logging.info("[HIST] GC content values:")
-    # for i in range(len(gc_content)):
-    #     if gc_content[i] > 0:
-    #         logging.info("{}-{}%: {}".format(i * bin_size, i * bin_size + bin_size, gc_content[i]))
-
     gc_content_bins = [i for i in range(0, 101, bin_size)]
 
     # Generate hover text for each bin
@@ -450,17 +446,22 @@ def plot(output_data, para_dict, file_type):
     # Create the summary table
     create_summary_table(output_data, plot_filepaths, file_type)
 
-    # Create the modified base table if available
+    # Modified base table and plots
     if file_type == 'BAM' and para_dict["mod"] > 0:
+        # Modified base table
         base_modification_threshold = para_dict["modprob"]
         create_modified_base_table(output_data, plot_filepaths, base_modification_threshold)
-
-        # Check if the modified base table is available
-        if 'base_mods' in plot_filepaths:
-            logging.info("SUCCESS: Modified base table created")
-        else:
+        if 'base_mods' not in plot_filepaths:
             logging.warning("WARNING: Modified base table not created")
 
+        # # Print the types of modifications
+        # base_mod_types = output_data.getBaseModTypes()
+        # logging.info("Modification types: ")
+        # for mod_type in base_mod_types:
+        #     logging.info(mod_type)
+
+        
+
     # Create the TIN table if available
     if file_type == 'BAM' and para_dict["genebed"] != "":
         input_files = para_dict["input_files"]
@@ -886,6 +887,67 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     plot_filepaths["base_mods"]['title'] = "Base Modifications"
     plot_filepaths["base_mods"]['description'] = "Base modification statistics"
 
+    # Print the types of modifications
+    base_mod_types = output_data.getBaseModTypes()
+    logging.info("Modification types: ")
+    for mod_type in base_mod_types:
+        logging.info(mod_type)
+
+    # Get the read length vs. base modification rate data for each modification type
+    read_mod_data_size = output_data.getReadModDataSize()
+    read_length_mod_rates = {}
+    for i in range(read_mod_data_size):
+        for mod_type in base_mod_types:
+            if mod_type not in read_length_mod_rates:
+                read_length_mod_rates[mod_type] = []
+
+            read_length = output_data.getNthReadModLength(i)
+            mod_rate = output_data.getNthReadModRate(i, mod_type)
+            read_length_mod_rates[mod_type].append((read_length, mod_rate))
+
+    # Dictionary of modification character to full name
+    mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \
+                        'g': '5hmU', 'e': '5fu', 'b': '5caU', \
+                        'a': '6mA', 'o': '8oxoG', 'n': 'Xao', \
+                        'C': 'Amb. C', 'A': 'Amb. A', 'T': 'Amb. T', 'G': 'Amb. G',\
+                        'N': 'Amb. N'}
+
+
+    # Create a plot of read length vs. base modification rate for each
+    # modification type
+    for mod_type in base_mod_types:
+
+        # Format the data
+        mod_data = read_length_mod_rates[mod_type]
+        x_vals = [data[0] for data in mod_data]
+        read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals]
+        mod_rates = [data[1] * 100 for data in mod_data]
+
+        # Get the modification name
+        try:
+            mod_char_to_name[mod_type]
+        except KeyError:
+            logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
+            mod_name = mod_type
+
+        mod_name = mod_char_to_name[mod_type]
+
+        # Create the figure
+        fig = go.Figure()
+        fig.add_trace(go.Scatter(x=x_vals, y=mod_rates, mode='markers', name=mod_name))
+
+    # Update the layout
+    fig.update_layout(xaxis_title='Read Length',
+                      yaxis_title='Modification Rate (%)',
+                      showlegend=True,
+                      yaxis=dict(range=[0, 100]),
+                      xaxis=dict(tickvals=x_vals, ticktext=read_lengths),
+                      font=dict(size=PLOT_FONT_SIZE))
+    
+    # Generate the HTML
+    html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
+
     # Create the base modification statistics table
     table_str = "<table>\n<tbody>"
     table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count)
@@ -895,6 +957,18 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse)
     table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count)
     table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count)
+
+    # Add the modification type data
+    for mod_type in base_mod_types:
+        mod_name = mod_char_to_name[mod_type]
+        mod_count = output_data.getModTypeCount(mod_type)
+        mod_count_fwd = output_data.getModTypeCount(mod_type, 0)
+        mod_count_rev = output_data.getModTypeCount(mod_type, 1)
+        table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count)
+        table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd)
+        table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev)
+
+    # Finish the table
     table_str += "\n</tbody>\n</table>"
     plot_filepaths["base_mods"]['detail'] = table_str
 
@@ -929,23 +1003,6 @@ def create_tin_table(output_data, input_files, plot_filepaths):
     # Add the table to the plot filepaths
     plot_filepaths["tin"]['detail'] = table_str
 
-    # plot_filepaths["base_mods"] = {}
-    # plot_filepaths["base_mods"]['file'] = ""
-    # plot_filepaths["base_mods"]['title'] = "Base Modifications"
-    # plot_filepaths["base_mods"]['description'] = "Base modification statistics"
-
-    # # Create the base modification statistics table
-    # table_str = "<table>\n<tbody>"
-    # table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count)
-    # table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold)
-    # table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count)
-    # table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward)
-    # table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse)
-    # table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count)
-    # table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count)
-    # table_str += "\n</tbody>\n</table>"
-    # plot_filepaths["base_mods"]['detail'] = table_str
-
 def create_pod5_table(output_dict, plot_filepaths):
     """Create a summary table for the ONT POD5 signal data."""
     plot_filepaths["basic_st"] = {}

From 5cfcf9f4b772200f206d40cbd51509069cb3507e Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 4 Jan 2025 18:10:20 -0500
Subject: [PATCH 07/25] Work on flags

---
 src/generate_html.py | 24 ++++++++++++++++++++++--
 src/plot_utils.py    |  3 +--
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/src/generate_html.py b/src/generate_html.py
index dcc5be7..a2e061e 100644
--- a/src/generate_html.py
+++ b/src/generate_html.py
@@ -237,19 +237,39 @@ def generate_left(self):
         self.html_writer.write('<h2>Summary</h2>')
         self.html_writer.write('<ul>')
 
+        # Define ASCII/Unicode icons for different flags
+        flag_icons = {
+            "PASS": "&#10004;",
+            "WARN": "&#9888;",
+        }
+        # "WARN": "&#9888;",
+        # "PASS": "&#10004;",
+        # "FAIL": "&#10060;",
+        # "INFO": "&#8505;"
+
         # Add links to the right sections
         key_index = 0
         for plot_key in self.image_key_list:
-            self.html_writer.write('<li>')
 
+            # Determine the flag icon
+            # [TEST] Select a random flag for testing
+            flags = ["PASS", "WARN"]
+            flag = flags[key_index % 2]
+            
+            # flag = self.plot_filepaths[plot_key]['flag']
+            flag_icon = flag_icons[flag]
+            self.html_writer.write('<li>')
+            self.html_writer.write(f'{flag_icon} ')
             self.html_writer.write(
                 '<a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>')
+                # f'{flag_icon} <a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>')
+            
             key_index += 1
             self.html_writer.write('</li>')
 
         # Add the input files section link
         self.html_writer.write('<li>')
-        self.html_writer.write('<a href="#lrst' + str(key_index) + '">Input File List</a>')
+        self.html_writer.write('• <a href="#lrst' + str(key_index) + '">Input File List</a>')
         key_index += 1
         self.html_writer.write('</li>')
         
diff --git a/src/plot_utils.py b/src/plot_utils.py
index d82bffb..5accb86 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -915,6 +915,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
     # Create a plot of read length vs. base modification rate for each
     # modification type
+    fig = go.Figure()
     for mod_type in base_mod_types:
 
         # Format the data
@@ -932,8 +933,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
         mod_name = mod_char_to_name[mod_type]
 
-        # Create the figure
-        fig = go.Figure()
         fig.add_trace(go.Scatter(x=x_vals, y=mod_rates, mode='markers', name=mod_name))
 
     # Update the layout

From 13b150f529790388db10948b995c10bb8f7e6ac5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 5 Jan 2025 18:34:39 -0500
Subject: [PATCH 08/25] Work on flags and mod plots

---
 src/cli.py           |  52 +++---
 src/generate_html.py |  34 ++--
 src/hts_reader.cpp   |  18 +-
 src/output_data.cpp  |   4 +-
 src/plot_utils.py    | 426 ++++++++++++++++++++++++++++---------------
 5 files changed, 338 insertions(+), 196 deletions(-)

diff --git a/src/cli.py b/src/cli.py
index 59b8a9c..85951a6 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -155,7 +155,7 @@ def fq_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fq_output, param_dict, 'FASTQ')
             fq_html_gen = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality",
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", 
                   "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False)
             fq_html_gen.generate_html()
 
@@ -246,11 +246,12 @@ def bam_module(margs):
             plot_filepaths = plot(bam_output, param_dict, 'BAM')
 
             # Set the list of QC information to display
-            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"]
+            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality"]
 
             # If base modifications were found, add the base modification plots
             # after the first table
             if bam_output.sample_modified_base_count > 0:
+                logging.info("Base modifications found. Adding base modification plots to the HTML report.")
                 qc_info_list.insert(1, "read_length_mod_rates")  # Read length modification rates
                 qc_info_list.insert(1, "base_mods")
 
@@ -313,7 +314,7 @@ def rrms_module(margs):
                 # Generate the HTML report
                 bam_html_gen = generate_html.ST_HTML_Generator(
                     [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info",
-                    "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False)
+                    "base_quality", "read_avg_base_quality"], "BAM QC", param_dict], plot_filepaths, static=False)
                 bam_html_gen.generate_html()
                 logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
@@ -431,8 +432,7 @@ def fast5_signal_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fast5_output, param_dict, 'FAST5s')
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality",
-                  "read_avg_base_quality", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html(signal_plots=True)
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
@@ -440,25 +440,6 @@ def fast5_signal_module(margs):
             logging.error("QC did not generate.")
 
 
-def set_file_parser_defaults(file_parser):
-    """Create a parser with default arguments for a specific filetype."""
-    file_parser.add_argument("-i", "--input", type=argparse.FileType('r'), default=None,
-                        help="Single input filepath")
-    file_parser.add_argument("-I", "--inputs", type=str, default=None,
-                        help="Multiple comma-separated input filepaths")
-    file_parser.add_argument("-P", "--pattern", type=str, default=None,
-                        help="Use pattern matching (*) to specify multiple input files. Enclose the pattern in double quotes.")
-    file_parser.add_argument("-g", "--log", type=str, default="log_output.log",
-                        help="Log file")
-    file_parser.add_argument("-G", "--log-level", type=int, default=2,
-                        help="Logging level. 1: DEBUG, 2: INFO, 3: WARNING, 4: ERROR, 5: CRITICAL. Default: 2.")
-    file_parser.add_argument("-o", "--outputfolder", type=str, default="output_" + prg_name,
-                        help="The output folder.")
-    file_parser.add_argument("-t", "--threads", type=int, default=1,
-                        help="The number of threads used. Default: 1.")
-    file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_",
-                        help="The prefix for output filenames. Default: `QC_`.")
-
 def pod5_module(margs):
     """POD5 file input module."""
     # Get the filetype-specific parameters
@@ -519,13 +500,32 @@ def pod5_module(margs):
             # plot_filepaths = plot(read_signal_dict, param_dict, 'POD5')
             webpage_title = "POD5 QC"
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality",
-                  "read_avg_base_quality", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html(signal_plots=True)
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
         else:
             logging.error("QC did not generate.")
+            
+
+def set_file_parser_defaults(file_parser):
+    """Create a parser with default arguments for a specific filetype."""
+    file_parser.add_argument("-i", "--input", type=argparse.FileType('r'), default=None,
+                        help="Single input filepath")
+    file_parser.add_argument("-I", "--inputs", type=str, default=None,
+                        help="Multiple comma-separated input filepaths")
+    file_parser.add_argument("-P", "--pattern", type=str, default=None,
+                        help="Use pattern matching (*) to specify multiple input files. Enclose the pattern in double quotes.")
+    file_parser.add_argument("-g", "--log", type=str, default="log_output.log",
+                        help="Log file")
+    file_parser.add_argument("-G", "--log-level", type=int, default=2,
+                        help="Logging level. 1: DEBUG, 2: INFO, 3: WARNING, 4: ERROR, 5: CRITICAL. Default: 2.")
+    file_parser.add_argument("-o", "--outputfolder", type=str, default="output_" + prg_name,
+                        help="The output folder.")
+    file_parser.add_argument("-t", "--threads", type=int, default=1,
+                        help="The number of threads used. Default: 1.")
+    file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_",
+                        help="The prefix for output filenames. Default: `QC_`.")
 
 
 # Set up the argument parser
diff --git a/src/generate_html.py b/src/generate_html.py
index a2e061e..64f1641 100644
--- a/src/generate_html.py
+++ b/src/generate_html.py
@@ -237,39 +237,34 @@ def generate_left(self):
         self.html_writer.write('<h2>Summary</h2>')
         self.html_writer.write('<ul>')
 
-        # Define ASCII/Unicode icons for different flags
-        flag_icons = {
-            "PASS": "&#10004;",
-            "WARN": "&#9888;",
+        # Define ASCII/Unicode icons for error flags
+        error_flag_icon = {
+            True: "&#9888;",
+            False: "&#10004;",
         }
-        # "WARN": "&#9888;",
-        # "PASS": "&#10004;",
-        # "FAIL": "&#10060;",
-        # "INFO": "&#8505;"
 
         # Add links to the right sections
         key_index = 0
         for plot_key in self.image_key_list:
 
             # Determine the flag icon
-            # [TEST] Select a random flag for testing
-            flags = ["PASS", "WARN"]
-            flag = flags[key_index % 2]
+            try:
+                flag = self.plot_filepaths[plot_key]['error_flag']
+            except KeyError:
+                flag = False
             
-            # flag = self.plot_filepaths[plot_key]['flag']
-            flag_icon = flag_icons[flag]
+            flag_icon = error_flag_icon[flag]
             self.html_writer.write('<li>')
             self.html_writer.write(f'{flag_icon} ')
             self.html_writer.write(
                 '<a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>')
-                # f'{flag_icon} <a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>')
             
             key_index += 1
             self.html_writer.write('</li>')
 
         # Add the input files section link
-        self.html_writer.write('<li>')
-        self.html_writer.write('• <a href="#lrst' + str(key_index) + '">Input File List</a>')
+        self.html_writer.write('<br><li>')
+        self.html_writer.write('<a href="#lrst' + str(key_index) + '">Input File List</a>')
         key_index += 1
         self.html_writer.write('</li>')
         
@@ -297,7 +292,12 @@ def generate_right(self):
                     self.html_writer.write(dynamic_plot)
 
                 except KeyError:
-                    logging.error("Missing dynamic plot for %s", plot_key)
+                    # See if an image is available
+                    try:
+                        image_path = self.plot_filepaths[plot_key]['file']
+                        self.html_writer.write(f'<img src="{image_path}" alt="{plot_key}">')
+                    except KeyError:
+                        logging.error("Missing plot for %s", plot_key)
 
             self.html_writer.write('</div>')
 
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 8056c19..8762f45 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -378,7 +378,8 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
             // Parse the base modification tags if a primary alignment
             int read_mod_count = 0;
             int ret = bam_parse_basemod(bam_record, state);
-            if (ret >= 0 && !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP)) {
+            if (ret >= 0) {
+                bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
 
                 // Get the chromosome if alignments are present
                 bool alignments_present = true;
@@ -397,13 +398,20 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                 // but it always yields 0...)
                 int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0;
 
+                // Set strand to null (-1) if the read is not primary
+                if (!is_primary) {
+                    strand = -1;
+                }
+
                 // Iterate over the state object to get the base modification tags
                 // using bam_next_basemod
                 hts_base_mod mods[10];
                 int n = 0;
                 int32_t pos = 0;
                 std::vector<int> query_pos;
+                bool first_mod_found = false;
                 while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) {
+
                     for (int i = 0; i < n; i++) {
                         // Update the modified prediction counts
                         read_mod_count++;  // Read-specific count
@@ -411,7 +419,6 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                         char mod_type = mods[i].modified_base;
                         base_mod_counts[mod_type]++;  // Update the type-specific count
 
-
                         // Note: The modified base value can be a positive char (e.g. 'm',
                         // 'h') (DNA Mods DB) or negative integer (ChEBI ID):
                         // https://github.com/samtools/hts-specs/issues/741
@@ -430,9 +437,10 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                             if (probability >= base_mod_threshold) {
                                 final_output.updateBaseModCounts(mod_type, strand);  // Update the base modification counts
 
-                                // Store the modified positions for later CpG analysis
+                                // Store the modified positions for later CpG
+                                // analysis if this is a C modification on a primary alignment
                                 char canonical_base_char = std::toupper(mods[i].canonical_base);
-                                if (canonical_base_char == 'C' && mod_type != 'C') {
+                                if (is_primary && canonical_base_char == 'C' && mod_type != 'C') {
 
                                     // Convert the query position to reference position if available
                                     if (alignments_present) {
@@ -447,7 +455,7 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                     }
                 }
 
-                // Preprint revisions: Append the modified positions to the output data
+                // Append the modified positions to the output data
                 if (c_modified_positions.size() > 0) {
                     // Set the atomic flag and print a message if base
                     // modification tags are present in the file
diff --git a/src/output_data.cpp b/src/output_data.cpp
index c92a60d..02cd5a8 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -268,11 +268,11 @@ void Output_BAM::updateBaseModCounts(char mod_type, int strand)
     this->sample_modified_base_count++;
     this->base_mod_counts[mod_type]++;  // Update the type-specific modified base count
 
-    // Update the modified base count for the strand
+    // Update the modified base count for the strand from primary alignments
     if (strand == 0) {
         this->sample_modified_base_count_forward++;
         this->base_mod_counts_forward[mod_type]++;  // Update the type-specific modified base count
-    } else {
+    } else if (strand == 1) {
         this->sample_modified_base_count_reverse++;
         this->base_mod_counts_reverse[mod_type]++;  // Update the type-specific modified base count
     }
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 5accb86..209cfbd 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -422,7 +422,7 @@ def plot_base_modifications(base_modifications):
         mod_data = base_modifications[mod_type]
 
         # Create the trace
-        trace = go.Scatter(x=mod_data['positions'], y=mod_data['counts'], mode='markers', name=mod_type)
+        trace = go.Scattergl(x=mod_data['positions'], y=mod_data['counts'], mode='markers', name=mod_type)
 
         # Add the trace to the figure
         fig.add_trace(trace)
@@ -448,20 +448,17 @@ def plot(output_data, para_dict, file_type):
 
     # Modified base table and plots
     if file_type == 'BAM' and para_dict["mod"] > 0:
-        # Modified base table
+        # Output file for the read length vs. modification rates plot
+        output_folder = para_dict["output_folder"]
+        read_length_hist_file = os.path.join(output_folder, 'read_length_hist.png')
+        plot_filepaths['read_length_mod_rates']['file'] = read_length_hist_file
+
+        # Generate the modified base table and read length vs. modification rates plot
         base_modification_threshold = para_dict["modprob"]
         create_modified_base_table(output_data, plot_filepaths, base_modification_threshold)
         if 'base_mods' not in plot_filepaths:
             logging.warning("WARNING: Modified base table not created")
 
-        # # Print the types of modifications
-        # base_mod_types = output_data.getBaseModTypes()
-        # logging.info("Modification types: ")
-        # for mod_type in base_mod_types:
-        #     logging.info(mod_type)
-
-        
-
     # Create the TIN table if available
     if file_type == 'BAM' and para_dict["genebed"] != "":
         input_files = para_dict["input_files"]
@@ -616,7 +613,7 @@ def plot_pod5(pod5_output, para_dict, bam_output=None):
 
         # Plot the signal data
         x = np.arange(signal_length)
-        fig.add_trace(go.Scatter(
+        fig.add_trace(go.Scattergl(
             x=x, y=nth_read_data,
             mode='markers',
             marker=dict(color='LightSkyBlue',
@@ -624,7 +621,7 @@ def plot_pod5(pod5_output, para_dict, bam_output=None):
                         line=dict(color='MediumPurple', width=2)),
             opacity=0.5))
 
-        # Update the plot style
+        # Update the plot style (using 0-100 to improve performance)
         fig.update_layout(
             title=nth_read_name,
             yaxis_title="Signal",
@@ -708,7 +705,7 @@ def plot_signal(output_data, para_dict):
 
             # Plot
             x = np.arange(start_index, end_index, 1)
-            fig.add_trace(go.Scatter(
+            fig.add_trace(go.Scattergl(
                 x=x, y=base_signals,
                 mode='markers',
                 marker=dict(color='LightSkyBlue',
@@ -763,6 +760,29 @@ def plot_signal(output_data, para_dict):
 
     return output_html_plots
 
+def format_cell(value, type_str='int', error_flag=False):
+    """Format the cell value for the summary table."""
+    style = "background-color: #F88379;" if error_flag else ""
+    if type_str == 'int':
+        return "<td style=\"text-align:right;{}\">{:,d}</td>".format(style, value)
+    elif type_str == 'float':
+        return "<td style=\"text-align:right;{}\">{:.1f}</td>".format(style, value)
+    else:
+        logging.error("ERROR: Invalid type for formatting cell value")
+
+def format_row(row_name, values, type_str='int', col_ignore=None):
+    """Format the row for the summary table. Skip flagging null values in specific columns."""
+    cell_str = []
+    row_flag = False
+    for i, value in enumerate(values):
+        # Set the error flag if the value is 0 except for unmapped reads
+        error_flag = value == 0 and i != col_ignore
+        row_flag = row_flag or error_flag  # Flag for the entire row
+        cell_str.append(format_cell(value, type_str, error_flag))
+
+    return "<tr><td>{}</td>{}</tr>".format(row_name, "".join(cell_str)), row_flag
+
+
 def create_summary_table(output_data, plot_filepaths, file_type):
     """Create the summary table for the basic statistics."""
     plot_filepaths["basic_st"] = {}
@@ -777,73 +797,135 @@ def create_summary_table(output_data, plot_filepaths, file_type):
         file_type_label = 'Basecall Summary'
         
     plot_filepaths["basic_st"]['description'] = "{} Basic Statistics".format(file_type_label)
+    table_error_flag = False
 
     if file_type == 'BAM':
+
         # Add alignment statistics to the summary table
         table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Mapped</th><th>Unmapped</th><th>All</th></tr>\n" \
                     "</thead> "
         table_str += "\n<tbody>"
-        int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:," \
-                             "d}</td><td style=\"text-align:right\">{:,d}</td></tr> "
-        double_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:.1f}</td><td " \
-                                "style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr> "
-        table_str += int_str_for_format.format("#Total Reads", output_data.mapped_long_read_info.total_num_reads,
-                                               output_data.unmapped_long_read_info.total_num_reads,
-                                               output_data.long_read_info.total_num_reads)
-        table_str += int_str_for_format.format("#Total Bases",
-                                               output_data.mapped_long_read_info.total_num_bases,
-                                               output_data.unmapped_long_read_info.total_num_bases,
-                                               output_data.long_read_info.total_num_bases)
-        table_str += int_str_for_format.format("Longest Read Length",
-                                               output_data.mapped_long_read_info.longest_read_length,
-                                               output_data.unmapped_long_read_info.longest_read_length,
-                                               output_data.long_read_info.longest_read_length)
-        table_str += int_str_for_format.format("N50",
-                                               output_data.mapped_long_read_info.n50_read_length,
-                                               output_data.unmapped_long_read_info.n50_read_length,
-                                               output_data.long_read_info.n50_read_length)
-        table_str += double_str_for_format.format("GC Content(%)",
-                                                  output_data.mapped_long_read_info.gc_cnt * 100,
-                                                  output_data.unmapped_long_read_info.gc_cnt * 100,
-                                                  output_data.long_read_info.gc_cnt * 100)
-        table_str += double_str_for_format.format("Mean Read Length",
-                                                  output_data.mapped_long_read_info.mean_read_length,
-                                                  output_data.unmapped_long_read_info.mean_read_length,
-                                                  output_data.long_read_info.mean_read_length)
-        table_str += int_str_for_format.format("Median Read Length",
-                                               output_data.mapped_long_read_info.median_read_length,
-                                               output_data.unmapped_long_read_info.median_read_length,
-                                               output_data.long_read_info.median_read_length)
+
+        # Total reads
+        row_str, row_flag = format_row("Total Reads", \
+                                        [output_data.mapped_long_read_info.total_num_reads, \
+                                            output_data.unmapped_long_read_info.total_num_reads, \
+                                            output_data.long_read_info.total_num_reads], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+        
+        # Total bases
+        row_str, row_flag = format_row("Total Bases", \
+                                        [output_data.mapped_long_read_info.total_num_bases, \
+                                         output_data.unmapped_long_read_info.total_num_bases, \
+                                         output_data.long_read_info.total_num_bases], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Longest read length
+        row_str, row_flag = format_row("Longest Read Length", \
+                                        [output_data.mapped_long_read_info.longest_read_length, \
+                                         output_data.unmapped_long_read_info.longest_read_length, \
+                                         output_data.long_read_info.longest_read_length], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # N50
+        row_str, row_flag = format_row("N50", \
+                                        [output_data.mapped_long_read_info.n50_read_length, \
+                                            output_data.unmapped_long_read_info.n50_read_length, \
+                                            output_data.long_read_info.n50_read_length], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # GC content
+        row_str, row_flag = format_row("GC Content(%)", \
+                                        [output_data.mapped_long_read_info.gc_cnt * 100, \
+                                            output_data.unmapped_long_read_info.gc_cnt * 100, \
+                                            output_data.long_read_info.gc_cnt * 100], \
+                                        'float', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Mean read length
+        row_str, row_flag = format_row("Mean Read Length", \
+                                        [output_data.mapped_long_read_info.mean_read_length, \
+                                            output_data.unmapped_long_read_info.mean_read_length, \
+                                            output_data.long_read_info.mean_read_length], \
+                                        'float', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Median read length
+        row_str, row_flag = format_row("Median Read Length", \
+                                        [output_data.mapped_long_read_info.median_read_length, \
+                                            output_data.unmapped_long_read_info.median_read_length, \
+                                            output_data.long_read_info.median_read_length], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
         
     elif file_type == 'SeqTxt':
         table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Passed</th><th>Failed</th><th>All</th></tr>\n</thead>"
         table_str += "\n<tbody>"
-        int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:,d}</td></tr>"
-        double_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>"
-        table_str += int_str_for_format.format("#Total Reads",
-                                               output_data.passed_long_read_info.long_read_info.total_num_reads,
-                                               output_data.failed_long_read_info.long_read_info.total_num_reads,
-                                               output_data.all_long_read_info.long_read_info.total_num_reads)
-        table_str += int_str_for_format.format("#Total Bases",
-                                               output_data.passed_long_read_info.long_read_info.total_num_bases,
-                                               output_data.failed_long_read_info.long_read_info.total_num_bases,
-                                               output_data.all_long_read_info.long_read_info.total_num_bases)
-        table_str += int_str_for_format.format("Longest Read Length",
-                                               output_data.passed_long_read_info.long_read_info.longest_read_length,
-                                               output_data.failed_long_read_info.long_read_info.longest_read_length,
-                                               output_data.all_long_read_info.long_read_info.longest_read_length)
-        table_str += int_str_for_format.format("N50",
-                                               output_data.passed_long_read_info.long_read_info.n50_read_length,
-                                               output_data.failed_long_read_info.long_read_info.n50_read_length,
-                                               output_data.all_long_read_info.long_read_info.n50_read_length)
-        table_str += double_str_for_format.format("Mean Read Length",
-                                                  output_data.passed_long_read_info.long_read_info.mean_read_length,
-                                                  output_data.failed_long_read_info.long_read_info.mean_read_length,
-                                                  output_data.all_long_read_info.long_read_info.mean_read_length)
-        table_str += int_str_for_format.format("Median Read Length",
-                                               output_data.passed_long_read_info.long_read_info.median_read_length,
-                                               output_data.failed_long_read_info.long_read_info.median_read_length,
-                                               output_data.all_long_read_info.long_read_info.median_read_length)
+        
+        # Total reads
+        row_str, row_flag = format_row("Total Reads", \
+                                        [output_data.passed_long_read_info.long_read_info.total_num_reads, \
+                                            output_data.failed_long_read_info.long_read_info.total_num_reads, \
+                                            output_data.all_long_read_info.long_read_info.total_num_reads], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Total bases
+        row_str, row_flag = format_row("Total Bases", \
+                                        [output_data.passed_long_read_info.long_read_info.total_num_bases, \
+                                            output_data.failed_long_read_info.long_read_info.total_num_bases, \
+                                            output_data.all_long_read_info.long_read_info.total_num_bases], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Longest read length
+        row_str, row_flag = format_row("Longest Read Length", \
+                                        [output_data.passed_long_read_info.long_read_info.longest_read_length, \
+                                            output_data.failed_long_read_info.long_read_info.longest_read_length, \
+                                            output_data.all_long_read_info.long_read_info.longest_read_length], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # N50
+        row_str, row_flag = format_row("N50", \
+                                        [output_data.passed_long_read_info.long_read_info.n50_read_length, \
+                                            output_data.failed_long_read_info.long_read_info.n50_read_length, \
+                                            output_data.all_long_read_info.long_read_info.n50_read_length], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Mean read length
+        row_str, row_flag = format_row("Mean Read Length", \
+                                        [output_data.passed_long_read_info.long_read_info.mean_read_length, \
+                                            output_data.failed_long_read_info.long_read_info.mean_read_length, \
+                                            output_data.all_long_read_info.long_read_info.mean_read_length], \
+                                        'float', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Median read length
+        row_str, row_flag = format_row("Median Read Length", \
+                                        [output_data.passed_long_read_info.long_read_info.median_read_length, \
+                                            output_data.failed_long_read_info.long_read_info.median_read_length, \
+                                            output_data.all_long_read_info.long_read_info.median_read_length], \
+                                        'int', 1)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
 
     elif file_type == 'FAST5s':
         # Get values
@@ -853,32 +935,58 @@ def create_summary_table(output_data, plot_filepaths, file_type):
         # Set up the HTML table
         table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>"
         table_str += "\n<tbody>"
-        int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>"
-        table_str += int_str_for_format.format("#Total Reads", read_count)
-        table_str += int_str_for_format.format("#Total Bases", total_base_count)
+
+        # Total reads
+        row_str, row_flag = format_row("Total Reads", [read_count], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Total bases
+        row_str, row_flag = format_row("Total Bases", [total_base_count], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
 
     else:
         table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>"
         table_str += "\n<tbody>"
-        int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>"
-        double_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:.1f}</td></tr>"
-        table_str += int_str_for_format.format("#Total Reads",
-                                               output_data.long_read_info.total_num_reads)
-        table_str += int_str_for_format.format("#Total Bases",
-                                               output_data.long_read_info.total_num_bases)
-        table_str += int_str_for_format.format("Longest Read Length",
-                                               output_data.long_read_info.longest_read_length)
-        table_str += int_str_for_format.format("N50",
-                                               output_data.long_read_info.n50_read_length)
-        table_str += double_str_for_format.format("GC Content(%)",
-                                                  output_data.long_read_info.gc_cnt * 100)
-        table_str += double_str_for_format.format("Mean Read Length",
-                                                  output_data.long_read_info.mean_read_length)
-        table_str += int_str_for_format.format("Median Read Length",
-                                               output_data.long_read_info.median_read_length)
+        # Total reads
+        row_str, row_flag = format_row("Total Reads", [output_data.long_read_info.total_num_reads], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Total bases
+        row_str, row_flag = format_row("Total Bases", [output_data.long_read_info.total_num_bases], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Longest read length
+        row_str, row_flag = format_row("Longest Read Length", [output_data.long_read_info.longest_read_length], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # N50
+        row_str, row_flag = format_row("N50", [output_data.long_read_info.n50_read_length], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # GC content
+        row_str, row_flag = format_row("GC Content(%)", [output_data.long_read_info.gc_cnt * 100], 'float', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Mean read length
+        row_str, row_flag = format_row("Mean Read Length", [output_data.long_read_info.mean_read_length], 'float', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        # Median read length
+        row_str, row_flag = format_row("Median Read Length", [output_data.long_read_info.median_read_length], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
         
     table_str += "\n</tbody>\n</table>"
     plot_filepaths["basic_st"]['detail'] = table_str
+    plot_filepaths["basic_st"]['error_flag'] = table_error_flag
 
 def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold):
     """Create a summary table for the base modifications."""
@@ -888,64 +996,84 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     plot_filepaths["base_mods"]['description'] = "Base modification statistics"
 
     # Print the types of modifications
+    logging.info("Getting base modification types")
     base_mod_types = output_data.getBaseModTypes()
-    logging.info("Modification types: ")
-    for mod_type in base_mod_types:
-        logging.info(mod_type)
-
-    # Get the read length vs. base modification rate data for each modification type
-    read_mod_data_size = output_data.getReadModDataSize()
-    read_length_mod_rates = {}
-    for i in range(read_mod_data_size):
+    if base_mod_types:
+        logging.info("Modification types: ")
         for mod_type in base_mod_types:
-            if mod_type not in read_length_mod_rates:
-                read_length_mod_rates[mod_type] = []
-
-            read_length = output_data.getNthReadModLength(i)
-            mod_rate = output_data.getNthReadModRate(i, mod_type)
-            read_length_mod_rates[mod_type].append((read_length, mod_rate))
-
-    # Dictionary of modification character to full name
-    mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \
-                        'g': '5hmU', 'e': '5fu', 'b': '5caU', \
-                        'a': '6mA', 'o': '8oxoG', 'n': 'Xao', \
-                        'C': 'Amb. C', 'A': 'Amb. A', 'T': 'Amb. T', 'G': 'Amb. G',\
-                        'N': 'Amb. N'}
-
-
-    # Create a plot of read length vs. base modification rate for each
-    # modification type
-    fig = go.Figure()
-    for mod_type in base_mod_types:
-
-        # Format the data
-        mod_data = read_length_mod_rates[mod_type]
-        x_vals = [data[0] for data in mod_data]
-        read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals]
-        mod_rates = [data[1] * 100 for data in mod_data]
-
-        # Get the modification name
-        try:
-            mod_char_to_name[mod_type]
-        except KeyError:
-            logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
-            mod_name = mod_type
-
-        mod_name = mod_char_to_name[mod_type]
-
-        fig.add_trace(go.Scatter(x=x_vals, y=mod_rates, mode='markers', name=mod_name))
-
-    # Update the layout
-    fig.update_layout(xaxis_title='Read Length',
-                      yaxis_title='Modification Rate (%)',
-                      showlegend=True,
-                      yaxis=dict(range=[0, 100]),
-                      xaxis=dict(tickvals=x_vals, ticktext=read_lengths),
-                      font=dict(size=PLOT_FONT_SIZE))
-    
-    # Generate the HTML
-    html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
-    plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
+            logging.info(mod_type)
+
+        # Get the read length vs. base modification rate data for each modification type
+        read_mod_data_size = output_data.getReadModDataSize()
+        logging.info("[TEST] read_mod_data_size: {}".format(read_mod_data_size))
+        read_length_mod_rates = {}
+        for i in range(read_mod_data_size):
+            for mod_type in base_mod_types:
+                if mod_type not in read_length_mod_rates:
+                    read_length_mod_rates[mod_type] = []
+
+                read_length = output_data.getNthReadModLength(i)
+                mod_rate = output_data.getNthReadModRate(i, mod_type)
+                read_length_mod_rates[mod_type].append((read_length, mod_rate))
+
+        # Dictionary of modification character to full name
+        mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \
+                            'g': '5hmU', 'e': '5fU', 'b': '5caU', \
+                            'a': '6mA', 'o': '8oxoG', 'n': 'Xao', \
+                            'C': 'Amb. C', 'A': 'Amb. A', 'T': 'Amb. T', 'G': 'Amb. G',\
+                            'N': 'Amb. N', \
+                            'v': 'pseU'}
+
+
+        # Create a plot of read length vs. base modification rate for each
+        # modification type
+        # Make subplots vertically for each modification type
+        fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1)
+        min_x = float('inf')
+        max_x = 0
+        # for mod_type in base_mod_types:
+        for i, mod_type in enumerate(base_mod_types):
+
+            # Format the data
+            mod_data = read_length_mod_rates[mod_type]
+            x_vals = [data[0] for data in mod_data]
+            read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals]
+            mod_rates = [data[1] * 100 for data in mod_data]
+
+            # Update the min and max x values
+            min_x = min(min_x, min(x_vals))
+            max_x = max(max_x, max(x_vals))
+
+            # Get the modification name
+            try:
+                mod_name = mod_char_to_name[mod_type]
+            except KeyError:
+                logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
+                mod_name = mod_type
+
+            fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name), row=i + 1, col=1)
+
+            # Update the layout
+            max_x_range = min(max_x, 10000)  # To improve the plot performance
+            fig.update_layout(title='Read Length vs. {} Modification Rate'.format(mod_name),
+                            xaxis_title='Read Length',
+                            yaxis_title='Modification Rate (%)',
+                            showlegend=False,
+                            yaxis=dict(range=[0, 100]),
+                            xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]),
+                            font=dict(size=PLOT_FONT_SIZE))
+            
+            logging.info("Plotting read length vs. {} modification rate".format(mod_name))
+
+        # Save the plot image
+        fig_file = plot_filepaths["read_length_mod_rates"]['file']
+        fig.write_image(fig_file)
+        
+        # Generate the HTML
+        # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+        # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
+    else:
+        logging.warning("WARNING: No modification types found")
 
     # Create the base modification statistics table
     table_str = "<table>\n<tbody>"
@@ -959,7 +1087,13 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
     # Add the modification type data
     for mod_type in base_mod_types:
-        mod_name = mod_char_to_name[mod_type]
+        # mod_name = mod_char_to_name[mod_type]
+        try:
+            mod_name = mod_char_to_name[mod_type]
+        except KeyError:
+            logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
+            mod_name = mod_type
+
         mod_count = output_data.getModTypeCount(mod_type)
         mod_count_fwd = output_data.getModTypeCount(mod_type, 0)
         mod_count_rev = output_data.getModTypeCount(mod_type, 1)

From 957a8d833eed15b25c18570edad7a77801878add Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 5 Jan 2025 19:52:11 -0500
Subject: [PATCH 09/25] work on flags

---
 environment.yml   |   4 +-
 src/plot_utils.py | 164 +++++++++++++++++++++++++++++++++++++---------
 2 files changed, 136 insertions(+), 32 deletions(-)

diff --git a/environment.yml b/environment.yml
index 6a9e2af..a76645c 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,6 +4,7 @@ channels:
   - bioconda
   - defaults
   - jannessp  # for pod5
+  - plotly  # for kaleido
 dependencies:
   - python=3.9
   - numpy
@@ -14,4 +15,5 @@ dependencies:
   - plotly
   - pytest
   - pod5
-  - pyarrow
\ No newline at end of file
+  - pyarrow
+  - python-kaleido
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 209cfbd..db96017 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -376,8 +376,9 @@ def read_gc_content_histogram(data, font_size):
 
     return fig.to_html(full_html=False, default_height=500, default_width=700)
 
-# Save the 'Base quality' plot image.
-def base_quality(data, font_size):
+
+def base_quality(data, font_size, plot_filepaths):
+    """Plot the base quality distribution."""
     xd = np.arange(MAX_BASE_QUALITY)
     yd = np.array(data.base_quality_distribution)
     fig = go.Figure()
@@ -392,9 +393,19 @@ def base_quality(data, font_size):
     fig.update_yaxes(ticks="outside", title_text='Number of bases', title_standoff=0)
     fig.update_layout(font=dict(size=PLOT_FONT_SIZE))  # Set font size
 
-    return fig.to_html(full_html=False, default_height=500, default_width=700)
+    # return fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths['base_quality']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
+
+    # Set the error flag if the base quality is below 20 for more than 10% of
+    # the bases
+    error_flag = False
+    if np.sum(yd[:20]) / np.sum(yd) > 0.1:
+        error_flag = True
+
+    plot_filepaths['base_quality']['error_flag'] = error_flag
 
-def read_avg_base_quality(data, font_size):
+
+def read_avg_base_quality(data, font_size, plot_filepaths):
     """Plot the read average base quality distribution."""
     xd = np.arange(MAX_READ_QUALITY)
     yd = np.array(data.read_average_base_quality_distribution)
@@ -405,7 +416,16 @@ def read_avg_base_quality(data, font_size):
     fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0)
     fig.update_layout(font=dict(size=PLOT_FONT_SIZE))  # Set font size
 
-    return fig.to_html(full_html=False, default_height=500, default_width=700)
+    # return fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths['read_avg_base_quality']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
+
+    # Set the error flag if the average base quality is below 20 for more than
+    # 10% of the reads
+    error_flag = False
+    if np.sum(yd[:20]) / np.sum(yd) > 0.1:
+        error_flag = True
+
+    plot_filepaths['read_avg_base_quality']['error_flag'] = error_flag
 
 
 def plot_base_modifications(base_modifications):
@@ -499,16 +519,24 @@ def plot(output_data, para_dict, file_type):
         seq_quality_info = output_data.seq_quality_info
 
         # Base quality histogram
-        plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size)
+        base_quality(seq_quality_info, font_size, plot_filepaths)
+        # plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size)
 
         # Read quality histogram
-        read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size)
-        plot_filepaths['read_avg_base_quality']['dynamic'] = read_quality_dynamic
+        # read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size)
+        # plot_filepaths['read_avg_base_quality']['dynamic'] =
+        # read_quality_dynamic
+        read_avg_base_quality(seq_quality_info, font_size, plot_filepaths)
 
     if file_type == 'BAM':
         # Plot read alignment QC
-        plot_filepaths['read_alignments_bar']['dynamic'] = plot_alignment_numbers(output_data)
-        plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data)
+        plot_alignment_numbers(output_data, plot_filepaths)
+        # plot_filepaths['read_alignments_bar']['dynamic'] =
+        # plot_alignment_numbers(output_data)
+        
+        # Plot base alignment and error QC
+        plot_errors(output_data, plot_filepaths)
+        # plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data)
         
     elif file_type == 'FAST5s':
         plot_filepaths['ont_signal']['dynamic'] = plot_signal(output_data, para_dict)
@@ -988,12 +1016,14 @@ def create_summary_table(output_data, plot_filepaths, file_type):
     plot_filepaths["basic_st"]['detail'] = table_str
     plot_filepaths["basic_st"]['error_flag'] = table_error_flag
 
+
 def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold):
     """Create a summary table for the base modifications."""
     plot_filepaths["base_mods"] = {}
     plot_filepaths["base_mods"]['file'] = ""
     plot_filepaths["base_mods"]['title'] = "Base Modifications"
     plot_filepaths["base_mods"]['description'] = "Base modification statistics"
+    table_error_flag = False
 
     # Print the types of modifications
     logging.info("Getting base modification types")
@@ -1005,7 +1035,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
         # Get the read length vs. base modification rate data for each modification type
         read_mod_data_size = output_data.getReadModDataSize()
-        logging.info("[TEST] read_mod_data_size: {}".format(read_mod_data_size))
         read_length_mod_rates = {}
         for i in range(read_mod_data_size):
             for mod_type in base_mod_types:
@@ -1077,13 +1106,41 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
     # Create the base modification statistics table
     table_str = "<table>\n<tbody>"
-    table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count)
-    table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold)
-    table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count)
-    table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward)
-    table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse)
-    table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count)
-    table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count)
+    row_str, row_flag = format_row("Total Predictions", [output_data.modified_prediction_count], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    row_str, row_flag = format_row("Probability Threshold", [base_modification_threshold], 'float', 0)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    row_str, row_flag = format_row("Total Modified Bases in the Sample", [output_data.sample_modified_base_count], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    row_str, row_flag = format_row("Total in the Forward Strand", [output_data.sample_modified_base_count_forward], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    row_str, row_flag = format_row("Total in the Reverse Strand", [output_data.sample_modified_base_count_reverse], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
+
+    # table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count)
+    # table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold)
+    # table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count)
+    # table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward)
+    # table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse)
+    # table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count)
+    # table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count)
 
     # Add the modification type data
     for mod_type in base_mod_types:
@@ -1097,13 +1154,26 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         mod_count = output_data.getModTypeCount(mod_type)
         mod_count_fwd = output_data.getModTypeCount(mod_type, 0)
         mod_count_rev = output_data.getModTypeCount(mod_type, 1)
-        table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count)
-        table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd)
-        table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev)
+
+        row_str, row_flag = format_row("Total {} Sites in the Sample".format(mod_name), [mod_count], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        row_str, row_flag = format_row("Total {} Sites in the Sample (Forward Strand)".format(mod_name), [mod_count_fwd], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+
+        row_str, row_flag = format_row("Total {} Sites in the Sample (Reverse Strand)".format(mod_name), [mod_count_rev], 'int', None)
+        table_str += row_str
+        table_error_flag = table_error_flag or row_flag
+        # table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count)
+        # table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd)
+        # table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev)
 
     # Finish the table
     table_str += "\n</tbody>\n</table>"
     plot_filepaths["base_mods"]['detail'] = table_str
+    plot_filepaths["base_mods"]['error_flag'] = table_error_flag
 
 def create_tin_table(output_data, input_files, plot_filepaths):
     """Create a summary table for the RNA-Seq TIN values."""
@@ -1118,12 +1188,15 @@ def create_tin_table(output_data, input_files, plot_filepaths):
     table_str += "\n<tbody>"
     
     # Loop through each BAM file
+    error_flag = False
     for bam_file in input_files:
         # Format the filepath as filename only
         bam_filename = os.path.basename(bam_file)
 
         # Get the file data
         tin_count = output_data.getTINCount(bam_file)
+        error_flag = error_flag or tin_count == 0
+
         tin_mean = output_data.getTINMean(bam_file)
         tin_median = output_data.getTINMedian(bam_file)
         tin_std = output_data.getTINStdDev(bam_file)
@@ -1135,6 +1208,8 @@ def create_tin_table(output_data, input_files, plot_filepaths):
 
     # Add the table to the plot filepaths
     plot_filepaths["tin"]['detail'] = table_str
+    plot_filepaths["tin"]['error_flag'] = error_flag
+
 
 def create_pod5_table(output_dict, plot_filepaths):
     """Create a summary table for the ONT POD5 signal data."""
@@ -1143,26 +1218,41 @@ def create_pod5_table(output_dict, plot_filepaths):
     plot_filepaths["basic_st"]['title'] = "Summary Table"
     file_type_label = "POD5"
     plot_filepaths["basic_st"]['description'] = f"{file_type_label} Basic Statistics"
+    table_error_flag = False
     
     # Get values
-    read_count = len(output_dict.keys())
 
     # Set up the HTML table
     table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>"
     table_str += "\n<tbody>"
-    int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>"
-    table_str += int_str_for_format.format("#Total Reads", read_count)
+    # int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>"
+    # table_str += int_str_for_format.format("Total Reads", read_count)
+    read_count = len(output_dict.keys())
+    row_str, row_flag = format_row("Total Reads", [read_count], 'int', None)
+    table_str += row_str
+    table_error_flag = table_error_flag or row_flag
 
     table_str += "\n</tbody>\n</table>"
     plot_filepaths["basic_st"]['detail'] = table_str
+    plot_filepaths["basic_st"]['error_flag'] = table_error_flag
 
 
-def plot_alignment_numbers(data):
+def plot_alignment_numbers(data, plot_filepaths):
     category = ['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments',
                 'Reads with Supplementary Alignments', 'Reads with Secondary Alignments',
                 'Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments']
     category = [wrap(x) for x in category]
 
+    # Set the error flag if any alignment count is zero
+    error_flag = False
+    for value in [data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment,
+                  data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment,
+                  data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment,
+                  data.reverse_alignment]:
+        if value == 0:
+            error_flag = True
+            break
+    
     # Create a horizontally aligned bar plot trace from the data using plotly
     trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment,
                       data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment,
@@ -1179,13 +1269,18 @@ def plot_alignment_numbers(data):
     fig = go.Figure(data=[trace], layout=layout)
 
     # Generate the HTML object for the plot
-    html_obj = fig.to_html(full_html=False, default_height=500, default_width=1000)
+    # html_obj = fig.to_html(full_html=False, default_height=500,
+    # default_width=1000)
 
-    return html_obj
+    # return html_obj, error_flag
 
+    # Update the HTML data for the plot
+    plot_filepaths['read_alignments_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1000)
+    plot_filepaths['read_alignments_bar']['error_flag'] = error_flag
 
-# Plot base alignment statistics
-def plot_errors(output_data):
+
+def plot_errors(output_data, plot_filepaths):
+    """Plot the error statistics for the alignment data."""
     category = \
         ['Matched Bases', 'Mismatched Bases', 'Inserted Bases', 'Deleted Bases', 'Clipped Bases\n(Primary Alignments)']
     category = [wrap(x) for x in category]
@@ -1204,7 +1299,14 @@ def plot_errors(output_data):
     fig = go.Figure(data=[trace], layout=layout)
 
     # Generate the HTML object for the plot
-    html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+    # html_obj = fig.to_html(full_html=False, default_height=500,
+    # default_width=700)
+    plot_filepaths['base_alignments_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
 
-    return html_obj
+    # Set the error flag if mismatch or clipped bases > matched bases
+    error_flag = output_data.num_mismatched_bases > output_data.num_matched_bases or \
+                 output_data.num_clip_bases > output_data.num_matched_bases
+    plot_filepaths['base_alignments_bar']['error_flag'] = error_flag
+
+    # return html_obj
 

From 8b2a913d233a9273e17fe554e3b040cafebccee8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 6 Jan 2025 15:45:49 -0500
Subject: [PATCH 10/25] Work on flags

---
 src/plot_utils.py | 196 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 145 insertions(+), 51 deletions(-)

diff --git a/src/plot_utils.py b/src/plot_utils.py
index db96017..c6dd718 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -72,11 +72,12 @@ def wrap(label):
     return new_label
 
 # Plot the read alignment numbers
-def plot_read_length_stats(output_data, file_type):
+def plot_read_length_stats(output_data, file_type, plot_filepaths):
 
     # Define the three categories
     category = ['N50', 'Mean', 'Median']
     all_traces = []
+    error_flag = False
 
     if file_type == 'BAM':
         # Create a bar trace for each type of read length statistic
@@ -89,6 +90,10 @@ def plot_read_length_stats(output_data, file_type):
             trace = go.Bar(x=category, y=values, name=plot_title)
             all_traces.append(trace)
 
+            # Set the error flag if any of the values are zero (except for unmapped reads)
+            if i != 2 and (values[0] == 0 or values[1] == 0 or values[2] == 0):
+                error_flag = True
+
     elif file_type == 'SeqTxt':
         # Create a bar trace for each type of read length statistic
         bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads']
@@ -100,6 +105,10 @@ def plot_read_length_stats(output_data, file_type):
             trace = go.Bar(x=category, y=values, name=plot_title)
             all_traces.append(trace)
 
+            # Set the error flag if any of the values are zero (except for failed reads)
+            if i != 2 and (values[0] == 0 or values[1] == 0 or values[2] == 0):
+                error_flag = True
+
     else:
         # Get the data for all reads
         key_list = ['n50_read_length', 'mean_read_length', 'median_read_length']
@@ -111,6 +120,11 @@ def plot_read_length_stats(output_data, file_type):
         trace = go.Bar(x=category, y=values, name=bar_title)
         all_traces.append(trace)
 
+        # Set the error flag if any of the values are zero
+        if values[0] == 0 or values[1] == 0 or values[2] == 0:
+            error_flag = True
+
+
     # Create the layout
     layout = go.Layout(title='', xaxis=dict(title='Statistics'), yaxis=dict(title='Length (bp)'), barmode='group', font=dict(size=PLOT_FONT_SIZE))
 
@@ -118,16 +132,19 @@ def plot_read_length_stats(output_data, file_type):
     fig = go.Figure(data=all_traces, layout=layout)
 
     # Generate the HTML
-    html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+    # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths['read_length_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
+
+    # Set the error flag
+    plot_filepaths['read_length_bar']['error_flag'] = error_flag
 
-    return html_obj
 
 # Plot the base counts
-def plot_base_counts(output_data, filetype):
-    # Define the five categories
-    category = ['A', 'C', 'G', 'T/U', 'N']
+def plot_base_counts(output_data, filetype, plot_filepaths):
 
-    # Create a bar trace for each type of data
+    # Create a bar trace for each base
+    error_flag = False
+    category = ['A', 'C', 'G', 'T/U', 'N']
     all_traces = []
     if filetype == 'BAM':
         bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads']
@@ -139,6 +156,15 @@ def plot_base_counts(output_data, filetype):
             trace = go.Bar(x=category, y=values, name=plot_title)
             all_traces.append(trace)
 
+            # Set the error flag if there are no bases, the N proportion
+            # exceeds 10%, or any of the A, C, G, or T/U counts are zero
+            if data.total_num_bases == 0:
+                error_flag = True
+            elif data.total_n_cnt / data.total_num_bases > 0.1:
+                error_flag = True
+            elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0:
+                error_flag = True
+
     elif filetype == 'SeqTxt':
         bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads']
         data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info]
@@ -149,6 +175,15 @@ def plot_base_counts(output_data, filetype):
             trace = go.Bar(x=category, y=values, name=plot_title)
             all_traces.append(trace)
 
+            # Set the error flag if there are no bases, the N proportion
+            # exceeds 10%, or any of the A, C, G, or T/U counts are zero
+            if data.total_num_bases == 0:
+                error_flag = True
+            elif data.total_n_cnt / data.total_num_bases > 0.1:
+                error_flag = True
+            elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0:
+                error_flag = True
+
     else:
         plot_title = 'All Reads'
         data = output_data.long_read_info
@@ -156,19 +191,30 @@ def plot_base_counts(output_data, filetype):
         trace = go.Bar(x=category, y=values, name=plot_title)
         all_traces.append(trace)
 
-    # Create the layout
-    layout = go.Layout(title='', xaxis=dict(title='Base'), yaxis=dict(title='Counts'), barmode='group', font=dict(size=PLOT_FONT_SIZE))
+        # Set the error flag if there are no bases, the N proportion
+        # exceeds 10%, or any of the A, C, G, or T/U counts are zero
+        if data.total_num_bases == 0:
+            error_flag = True
+        elif data.total_n_cnt / data.total_num_bases > 0.1:
+            error_flag = True
+        elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0:
+            error_flag = True
 
     # Create the figure and add the traces
+    layout = go.Layout(title='', xaxis=dict(title='Base'), yaxis=dict(title='Counts'), barmode='group', font=dict(size=PLOT_FONT_SIZE))
     fig = go.Figure(data=all_traces, layout=layout)
 
     # Generate the HTML
-    html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+    # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
 
-    return html_obj
+    # return html_obj
+
+    # Generate the HTML
+    plot_filepaths['base_counts']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths['base_counts']['error_flag'] = error_flag
 
 # Plot basic information about the reads in bar chart format
-def plot_basic_info(output_data, file_type):
+def plot_basic_info(output_data, file_type, plot_filepaths):
     html_obj = ''
     if file_type == 'BAM':
 
@@ -181,6 +227,7 @@ def plot_basic_info(output_data, file_type):
 
         # Add traces for each category
         key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length', 'gc_cnt']
+        error_flag = False
         for i in range(4):
             # Get the data for this category
             key_name = key_list[i]
@@ -188,6 +235,10 @@ def plot_basic_info(output_data, file_type):
             # Add the traces for each type of data
             data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)]
 
+            # Set the error flag if any of the values are zero
+            if data[0] == 0 or data[1] == 0 or data[2] == 0:
+                error_flag = True
+
             # Create the trace
             trace = go.Bar(x=data, y=bar_titles, orientation='h')
 
@@ -199,7 +250,11 @@ def plot_basic_info(output_data, file_type):
         fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE))
 
         # Generate the HTML
-        html_obj = fig.to_html(full_html=False, default_height=800, default_width=1200)
+        # html_obj = fig.to_html(full_html=False, default_height=800,
+        # default_width=1200)
+        plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=800, default_width=1200)
+        plot_filepaths['basic_info']['error_flag'] = error_flag
+
 
     elif file_type == 'SeqTxt':
 
@@ -212,6 +267,7 @@ def plot_basic_info(output_data, file_type):
 
         # Add traces for each category
         key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length']
+        error_flag = False
         for i in range(3):
             # Get the data for this category
             key_name = key_list[i]
@@ -219,6 +275,10 @@ def plot_basic_info(output_data, file_type):
             # Add the traces for each type of data
             data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)]
 
+            # Set the error flag if any of the values are zero
+            if data[0] == 0 or data[1] == 0 or data[2] == 0:
+                error_flag = True
+
             # Create the trace
             trace = go.Bar(x=data, y=bar_titles, orientation='h')
 
@@ -229,13 +289,16 @@ def plot_basic_info(output_data, file_type):
         fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE))
 
         # Generate the HTML
-        html_obj = fig.to_html(full_html=False, default_height=500, default_width=1600)
+        # html_obj = fig.to_html(full_html=False, default_height=500,
+        # default_width=1600)
+        plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1600)
+        plot_filepaths['basic_info']['error_flag'] = error_flag
 
     return html_obj
 
 
 # Plot the read length histograms
-def read_lengths_histogram(data, font_size):
+def read_lengths_histogram(data, font_size, plot_filepaths):
     linear_bin_count = 10
     log_bin_count = 10
 
@@ -262,7 +325,6 @@ def read_lengths_histogram(data, font_size):
     log_col=2
 
     linear_bindata = np.dstack((edges[:-1], edges[1:], hist))[0, :, :]
-    # linear_bin_centers = np.round((linear_bindata[:, 0] + linear_bindata[:, 1]) / 2, 0)
     fig.add_trace(go.Bar(x=edges, y=hist, customdata=linear_bindata,
                          hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp<br>Counts:%{customdata[2]:.0f}<extra></extra>',
                          marker_color='#36a5c7'), row=1, col=linear_col)
@@ -274,8 +336,7 @@ def read_lengths_histogram(data, font_size):
     fig.add_vline(n50, line_width=1, line_dash="dash", annotation_text='N50', annotation_bgcolor="green",
                   annotation_textangle=90, row=1, col=linear_col)
 
-    # Log histogram
-    # Get the log10 histogram of read lengths
+    # Log scale histogram
     read_lengths_log = np.log10(read_lengths, out=np.zeros_like(read_lengths), where=(read_lengths != 0))
     log_edges = np.linspace(0, np.max(read_lengths_log), num=log_bin_count + 1)
     log_hist, _ = np.histogram(read_lengths_log, bins=log_edges)
@@ -333,18 +394,26 @@ def read_lengths_histogram(data, font_size):
 
     # Update the layout
     fig.update_layout(showlegend=False, autosize=True, font=dict(size=PLOT_FONT_SIZE))
-
     fig.update_annotations(font_size=annotation_size)
-    html_obj = fig.to_html(full_html=False, default_height=500, default_width=1200)
+
+    # Generate the HTML
+    # html_obj = fig.to_html(full_html=False, default_height=500,
+    # default_width=1200)
+    plot_filepaths['read_length_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1200)
                            
-    return html_obj
 
-def read_gc_content_histogram(data, font_size):
+def read_gc_content_histogram(data, font_size, plot_filepaths):
     """Plot the per-read GC content histogram."""
     bin_size = 1
+    gc_content = np.array(data.read_gc_content_count)
+
+    # Set the error flag if more than 10% of reads have a GC content
+    # below 20%
+    error_flag = False
+    if np.sum(gc_content[:20]) / np.sum(gc_content) > 0.1:
+        error_flag = True
 
     # Bin the GC content if the bin size is greater than 1
-    gc_content = np.array(data.read_gc_content_count)
     if bin_size > 1:
         gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)])
 
@@ -374,7 +443,9 @@ def read_gc_content_histogram(data, font_size):
     fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0)
     fig.update_layout(font=dict(size=PLOT_FONT_SIZE))  # Set font size
 
-    return fig.to_html(full_html=False, default_height=500, default_width=700)
+    # return fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths['gc_content_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
+    plot_filepaths['gc_content_hist']['error_flag'] = error_flag
 
 
 def base_quality(data, font_size, plot_filepaths):
@@ -456,22 +527,18 @@ def plot_base_modifications(base_modifications):
     return html_obj
 
 
-# Main plot function
 def plot(output_data, para_dict, file_type):
+    """Generate the plots for the output data."""
     plot_filepaths = getDefaultPlotFilenames()
-
-    # Get the font size for plotly plots
-    font_size = 14
-
-    # Create the summary table
-    create_summary_table(output_data, plot_filepaths, file_type)
+    font_size = 14  # Font size for the plots
+    create_summary_table(output_data, plot_filepaths, file_type)  # Create the summary table
 
     # Modified base table and plots
     if file_type == 'BAM' and para_dict["mod"] > 0:
         # Output file for the read length vs. modification rates plot
         output_folder = para_dict["output_folder"]
-        read_length_hist_file = os.path.join(output_folder, 'read_length_hist.png')
-        plot_filepaths['read_length_mod_rates']['file'] = read_length_hist_file
+        read_length_mod_rate_file = os.path.join(output_folder, 'read_length_hist.png')
+        plot_filepaths['read_length_mod_rates']['file'] = read_length_mod_rate_file
 
         # Generate the modified base table and read length vs. modification rates plot
         base_modification_threshold = para_dict["modprob"]
@@ -491,8 +558,14 @@ def plot(output_data, para_dict, file_type):
             logging.warning("WARNING: TIN table not created")
 
     # Generate plots
-    plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data, file_type)
-    plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data, file_type)
+    # plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data,
+    # file_type)
+    plot_base_counts(output_data, file_type, plot_filepaths)
+
+    # Plot basic information
+    # plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data,
+    # file_type)
+    plot_basic_info(output_data, file_type, plot_filepaths)
 
     # Read length histogram
     if file_type == 'SeqTxt':
@@ -501,18 +574,30 @@ def plot(output_data, para_dict, file_type):
         long_read_data = output_data.long_read_info
 
     if file_type != 'FAST5s':
-        plot_filepaths['read_length_hist']['dynamic'] = read_lengths_histogram(long_read_data, font_size)
+        # plot_filepaths['read_length_hist']['dynamic'] =
+        # read_lengths_histogram(long_read_data, font_size)
+        read_lengths_histogram(long_read_data, font_size, plot_filepaths)
 
-        plot_filepaths['read_length_bar']['dynamic'] = plot_read_length_stats(output_data, file_type)
+        # plot_filepaths['read_length_bar']['dynamic'] =
+        # plot_read_length_stats(output_data, file_type)
+        plot_read_length_stats(output_data, file_type, plot_filepaths)
 
     # GC content histogram
     if file_type != 'FAST5s' and file_type != 'SeqTxt':
         if file_type == 'BAM':
-            plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.mapped_long_read_info, font_size)
+            # plot_filepaths['gc_content_hist']['dynamic'] =
+            # read_gc_content_histogram(output_data.mapped_long_read_info,
+            # font_size)
+            read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths)
         elif file_type == 'SeqTxt':
-            plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size)
+            # plot_filepaths['gc_content_hist']['dynamic'] =
+            # read_gc_content_histogram(output_data.passed_long_read_info.long_read_info,
+            # font_size)
+            read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths)
         else:
-            plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.long_read_info, font_size)
+            # plot_filepaths['gc_content_hist']['dynamic'] =
+            # read_gc_content_histogram(output_data.long_read_info, font_size)
+            read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths)
 
     # Quality plots
     if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt':
@@ -681,6 +766,8 @@ def plot_signal(output_data, para_dict):
     
     # Get read and base counts
     read_count = output_data.getReadCount()
+    if read_count == 0:
+        raise ValueError("No reads found in the dataset")
 
     # Randomly sample a small set of reads if it is a large dataset
     read_sample_size = min(read_count_max, read_count)
@@ -1033,7 +1120,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         for mod_type in base_mod_types:
             logging.info(mod_type)
 
-        # Get the read length vs. base modification rate data for each modification type
+        logging.info("Getting base modification statistics")
+
+        # Get the read length vs. base modification rate data for each
+        # modification type
+        logging.info("Getting mod data size")
         read_mod_data_size = output_data.getReadModDataSize()
         read_length_mod_rates = {}
         for i in range(read_mod_data_size):
@@ -1041,8 +1132,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                 if mod_type not in read_length_mod_rates:
                     read_length_mod_rates[mod_type] = []
 
+                logging.info("Getting read length for read {}".format(i))
                 read_length = output_data.getNthReadModLength(i)
+                logging.info("Getting read length vs. {} modification rate".format(mod_type))
                 mod_rate = output_data.getNthReadModRate(i, mod_type)
+                logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate))
                 read_length_mod_rates[mod_type].append((read_length, mod_rate))
 
         # Dictionary of modification character to full name
@@ -1092,12 +1186,12 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                             xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]),
                             font=dict(size=PLOT_FONT_SIZE))
             
-            logging.info("Plotting read length vs. {} modification rate".format(mod_name))
-
         # Save the plot image
-        fig_file = plot_filepaths["read_length_mod_rates"]['file']
-        fig.write_image(fig_file)
-        
+        if len(base_mod_types) > 0:
+            fig_file = plot_filepaths["read_length_mod_rates"]['file']
+            logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file))
+            fig.write_image(fig_file, format='png', width=700, height=500)
+            
         # Generate the HTML
         # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
         # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
@@ -1195,14 +1289,16 @@ def create_tin_table(output_data, input_files, plot_filepaths):
 
         # Get the file data
         tin_count = output_data.getTINCount(bam_file)
-        error_flag = error_flag or tin_count == 0
-
         tin_mean = output_data.getTINMean(bam_file)
         tin_median = output_data.getTINMedian(bam_file)
         tin_std = output_data.getTINStdDev(bam_file)
 
         # Add the data to the table
-        table_str += "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>".format(bam_filename, tin_count, tin_mean, tin_median, tin_std)
+        row_str, row_flag = format_row(bam_filename, [tin_count, tin_mean, tin_median, tin_std], 'float', None)
+        table_str += row_str
+        error_flag = error_flag or row_flag
+
+        # table_str += "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>".format(bam_filename, tin_count, tin_mean, tin_median, tin_std)
 
     table_str += "\n</tbody>\n</table>"
 
@@ -1220,8 +1316,6 @@ def create_pod5_table(output_dict, plot_filepaths):
     plot_filepaths["basic_st"]['description'] = f"{file_type_label} Basic Statistics"
     table_error_flag = False
     
-    # Get values
-
     # Set up the HTML table
     table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>"
     table_str += "\n<tbody>"

From 1456cd7e7cc752d61b16d5224dd5bcc2e2d25e07 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 6 Jan 2025 18:34:19 -0500
Subject: [PATCH 11/25] subsample read length vs mod rate plot

---
 src/cli.py           |   8 +--
 src/generate_html.py |   8 ++-
 src/plot_utils.py    | 127 +++++++++++++++++++++++++++----------------
 3 files changed, 90 insertions(+), 53 deletions(-)

diff --git a/src/cli.py b/src/cli.py
index 85951a6..cbe2a7d 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -246,7 +246,7 @@ def bam_module(margs):
             plot_filepaths = plot(bam_output, param_dict, 'BAM')
 
             # Set the list of QC information to display
-            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality"]
+            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"]
 
             # If base modifications were found, add the base modification plots
             # after the first table
@@ -314,7 +314,7 @@ def rrms_module(margs):
                 # Generate the HTML report
                 bam_html_gen = generate_html.ST_HTML_Generator(
                     [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info",
-                    "base_quality", "read_avg_base_quality"], "BAM QC", param_dict], plot_filepaths, static=False)
+                    "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False)
                 bam_html_gen.generate_html()
                 logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
@@ -386,8 +386,8 @@ def fast5_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fast5_output, param_dict, 'FAST5')
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality",
-                  "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False)
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"], 
+                 "FAST5 QC", param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html()
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
diff --git a/src/generate_html.py b/src/generate_html.py
index 64f1641..4cc3cf0 100644
--- a/src/generate_html.py
+++ b/src/generate_html.py
@@ -304,10 +304,12 @@ def generate_right(self):
             key_index += 1
 
         self.html_writer.write('<div class="module">')
-        self.html_writer.write('<h2 id="lrst' + str(key_index) + '">File count = ' + str(
+        self.html_writer.write('<h2 id="lrst' + str(key_index) + '">File Count = ' + str(
             len(self.input_para["input_files"])) + '</h2><p>')
-        for _af in self.input_para["input_files"]:
-            self.html_writer.write("<br/>" + _af)
+        # for _af in self.input_para["input_files"]:
+        #     self.html_writer.write("<br/>" + _af)
+        # Write the input files in format "1.\tfile1\n2.\tfile2\n..."
+        self.html_writer.write("<br/>" + "<br/>".join([f"{i+1}.\t{af}" for i, af in enumerate(self.input_para["input_files"])]))
         self.html_writer.write('</p></div>')
         key_index += 1
 
diff --git a/src/plot_utils.py b/src/plot_utils.py
index c6dd718..2f25c7d 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -139,8 +139,8 @@ def plot_read_length_stats(output_data, file_type, plot_filepaths):
     plot_filepaths['read_length_bar']['error_flag'] = error_flag
 
 
-# Plot the base counts
 def plot_base_counts(output_data, filetype, plot_filepaths):
+    """Plot overall base counts for the reads."""
 
     # Create a bar trace for each base
     error_flag = False
@@ -157,13 +157,14 @@ def plot_base_counts(output_data, filetype, plot_filepaths):
             all_traces.append(trace)
 
             # Set the error flag if the N count is greater than 10% or the A, C,
-            # G, or T/U counts are zero
-            if data.total_num_bases == 0:
-                error_flag = True
-            elif data.total_n_cnt / data.total_num_bases > 0.1:
-                error_flag = True
-            elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0:
-                error_flag = True
+            # G, or T/U counts are zero (except for unmapped reads)
+            if i != 2:
+                if data.total_num_bases == 0:
+                    error_flag = True
+                elif data.total_n_cnt / data.total_num_bases > 0.1:
+                    error_flag = True
+                elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0:
+                    error_flag = True
 
     elif filetype == 'SeqTxt':
         bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads']
@@ -213,8 +214,8 @@ def plot_base_counts(output_data, filetype, plot_filepaths):
     plot_filepaths['base_counts']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
     plot_filepaths['base_counts']['error_flag'] = error_flag
 
-# Plot basic information about the reads in bar chart format
 def plot_basic_info(output_data, file_type, plot_filepaths):
+    """Plot basic information about the reads in bar chart format."""
     html_obj = ''
     if file_type == 'BAM':
 
@@ -235,8 +236,9 @@ def plot_basic_info(output_data, file_type, plot_filepaths):
             # Add the traces for each type of data
             data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)]
 
-            # Set the error flag if any of the values are zero
-            if data[0] == 0 or data[1] == 0 or data[2] == 0:
+            # Set the error flag if any of the values are zero (except for unmapped reads)
+            # if data[0] == 0 or data[1] == 0 or data[2] == 0:
+            if data[0] == 0 or data[1] == 0:
                 error_flag = True
 
             # Create the trace
@@ -410,7 +412,9 @@ def read_gc_content_histogram(data, font_size, plot_filepaths):
     # Set the error flag if the GC content is below 20% for more than 10% of the
     # reads
     error_flag = False
-    if np.sum(gc_content[:20]) / np.sum(gc_content) > 0.1:
+    if np.sum(gc_content) == 0:
+        error_flag = True
+    elif np.sum(gc_content[:20]) / np.sum(gc_content) > 0.1:
         error_flag = True
 
     # Bin the GC content if the bin size is greater than 1
@@ -470,7 +474,9 @@ def base_quality(data, font_size, plot_filepaths):
     # Set the error flag if the base quality is below 20 for more than 10% of
     # the bases
     error_flag = False
-    if np.sum(yd[:20]) / np.sum(yd) > 0.1:
+    if np.sum(yd) == 0:
+        error_flag = True
+    elif np.sum(yd[:20]) / np.sum(yd) > 0.1:
         error_flag = True
 
     plot_filepaths['base_quality']['error_flag'] = error_flag
@@ -493,7 +499,9 @@ def read_avg_base_quality(data, font_size, plot_filepaths):
     # Set the error flag if the average base quality is below 20 for more than
     # 10% of the reads
     error_flag = False
-    if np.sum(yd[:20]) / np.sum(yd) > 0.1:
+    if np.sum(yd) == 0:
+        error_flag = True
+    elif np.sum(yd[:20]) / np.sum(yd) > 0.1:
         error_flag = True
 
     plot_filepaths['read_avg_base_quality']['error_flag'] = error_flag
@@ -599,15 +607,17 @@ def plot(output_data, para_dict, file_type):
             # read_gc_content_histogram(output_data.long_read_info, font_size)
             read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths)
 
-    # Quality plots
+    # Base quality histogram
     if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt':
         seq_quality_info = output_data.seq_quality_info
 
         # Base quality histogram
         base_quality(seq_quality_info, font_size, plot_filepaths)
-        # plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size)
-
-        # Read quality histogram
+        # plot_filepaths['base_quality']['dynamic'] =
+        # base_quality(seq_quality_info, font_size)
+        
+    # Read average base quality histogram
+    if file_type == 'FASTQ':
         # read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size)
         # plot_filepaths['read_avg_base_quality']['dynamic'] =
         # read_quality_dynamic
@@ -1126,17 +1136,31 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         # modification type
         logging.info("Getting mod data size")
         read_mod_data_size = output_data.getReadModDataSize()
+        logging.info("Mod data size: {}".format(read_mod_data_size))
+
+        # Choose a maximum of 10,000 reads to randomly sample for the plot
+        max_reads = min(read_mod_data_size, 10000)        
+        # read_indices = set(sample(range(read_mod_data_size), max_reads))
+        read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False)
         read_length_mod_rates = {}
-        for i in range(read_mod_data_size):
+
+        # Get the read length vs. base modification rate data for each
+        # modification type in the sampled reads
+        # for i in range(read_mod_data_size):
+        #     if i not in read_indices:
+        #         continue
+        for i in read_indices:
             for mod_type in base_mod_types:
                 if mod_type not in read_length_mod_rates:
                     read_length_mod_rates[mod_type] = []
 
-                logging.info("Getting read length for read {}".format(i))
-                read_length = output_data.getNthReadModLength(i)
-                logging.info("Getting read length vs. {} modification rate".format(mod_type))
-                mod_rate = output_data.getNthReadModRate(i, mod_type)
-                logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate))
+                # logging.info("Getting read length for read {}".format(i))
+                # read_length = output_data.getNthReadModLength(i)
+                read_length = output_data.getNthReadModLength(int(i))
+                # logging.info("Getting read length vs. {} modification rate".format(mod_type))
+                # mod_rate = output_data.getNthReadModRate(i, mod_type)
+                mod_rate = output_data.getNthReadModRate(int(i), mod_type)
+                # logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate))
                 read_length_mod_rates[mod_type].append((read_length, mod_rate))
 
         # Dictionary of modification character to full name
@@ -1147,7 +1171,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                             'N': 'Amb. N', \
                             'v': 'pseU'}
 
-
         # Create a plot of read length vs. base modification rate for each
         # modification type
         # Make subplots vertically for each modification type
@@ -1159,13 +1182,19 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
             # Format the data
             mod_data = read_length_mod_rates[mod_type]
-            x_vals = [data[0] for data in mod_data]
-            read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals]
             mod_rates = [data[1] * 100 for data in mod_data]
+            x_vals = [data[0] for data in mod_data]
+
+            # Generate evenly-spaced x values and labels (10 ticks across the
+            # range) with the read lengths being a multiple of 1000
+            x_tick_values = np.linspace(0, max(x_vals), num=10)
+            read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values]
+
+            # read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals]
 
             # Update the min and max x values
-            min_x = min(min_x, min(x_vals))
-            max_x = max(max_x, max(x_vals))
+            # min_x = min(min_x, *x_vals)
+            # max_x = max(max_x, *x_vals)
 
             # Get the modification name
             try:
@@ -1177,23 +1206,36 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
             fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name), row=i + 1, col=1)
 
             # Update the layout
-            max_x_range = min(max_x, 10000)  # To improve the plot performance
             fig.update_layout(title='Read Length vs. {} Modification Rate'.format(mod_name),
                             xaxis_title='Read Length',
                             yaxis_title='Modification Rate (%)',
                             showlegend=False,
                             yaxis=dict(range=[0, 100]),
-                            xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]),
+                            xaxis=dict(tickvals=x_tick_values, ticktext=read_lengths, range=[0, max(x_vals)]),
                             font=dict(size=PLOT_FONT_SIZE))
             
+            # Get the X tick values generated by Plotly and format the read lengths
+            # x_tick_values = fig.layout.x
+            # if x_tick_values:
+            #     read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values]
+
+            # # Update the X tick labels
+            # fig.update_xaxes(tickvals=x_tick_values, ticktext=read_lengths, row=i + 1, col=1)
+            
+            # xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]),
+
         # Save the plot image
-        if len(base_mod_types) > 0:
-            fig_file = plot_filepaths["read_length_mod_rates"]['file']
-            logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file))
-            fig.write_image(fig_file, format='png', width=700, height=500)
+        # if len(base_mod_types) > 0:
+        #     fig_file = plot_filepaths["read_length_mod_rates"]['file']
+        #     logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file))
+        #     fig.write_image(fig_file, format='png', width=700, height=500)
             
         # Generate the HTML
-        # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
+        # html_obj = fig.to_html(full_html=False, default_height=500,
+        # default_width=700)
+        if len(base_mod_types) > 0:
+            logging.info("Saving the read length vs. modification rates plot")
+            plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
         # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
     else:
         logging.warning("WARNING: No modification types found")
@@ -1337,16 +1379,9 @@ def plot_alignment_numbers(data, plot_filepaths):
                 'Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments']
     category = [wrap(x) for x in category]
 
-    # Identify null values
-    error_flag = False
-    for value in [data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment,
-                  data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment,
-                  data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment,
-                  data.reverse_alignment]:
-        if value == 0:
-            error_flag = True
-            break
-    
+    # Set the error flag if primary alignments equal 0
+    error_flag = data.num_primary_alignment == 0
+
     # Create a horizontally aligned bar plot trace from the data using plotly
     trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment,
                       data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment,

From 330ba933b4fcb3604a045aaa828e7cf6a13e6834 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 12 Jan 2025 15:43:46 -0500
Subject: [PATCH 12/25] Update base mod rate and remove basic statistics bar
 graphs

---
 include/output_data.h |   4 +-
 src/bam_module.cpp    |  10 +-
 src/cli.py            |  39 +++++--
 src/hts_reader.cpp    | 114 +++++++++++++++----
 src/output_data.cpp   |  34 ++++--
 src/plot_utils.py     | 252 ++++++++++++------------------------------
 6 files changed, 228 insertions(+), 225 deletions(-)

diff --git a/include/output_data.h b/include/output_data.h
index 06608d6..fa66cdb 100644
--- a/include/output_data.h
+++ b/include/output_data.h
@@ -161,7 +161,6 @@ class Base_Move_Table
 struct ReadModData
 {
    int read_length;
-   double mod_rate;
    std::unordered_map<char, double> base_mod_rates;  // Type-specific base modification rates
 };
 
@@ -229,7 +228,6 @@ class Output_BAM : public Output_FQ
       std::vector<char> getBaseModTypes();  // Get the types of base modifications found
       int getReadModDataSize();  // Get the number of read length vs. base modification rate data points
       int getNthReadModLength(int read_index);  // Get the read length for the nth read
-      double getNthReadModRate(int read_index);  // Get the base modification rate for the nth read
       double getNthReadModRate(int read_index, char mod_type);  // Get the base modification rate for the nth read for a specific base modification type
       uint64_t getModTypeCount(char mod_type);  // Get the count of a specific base modification type
       uint64_t getModTypeCount(char mod_type, int strand);  // Get the count of a specific base modification type for a specific strand
@@ -243,7 +241,7 @@ class Output_BAM : public Output_FQ
       int getReadSequenceEnd(std::string read_id);
 
       void updateBaseModCounts(char mod_type, int strand);  // Update base modification counts for predictions exceeding the threshold
-      void updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates);  // Update read length vs. base modification rate data
+      void updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates);  // Update read length vs. base modification rate data
 
       // Add TIN data for a single BAM file
       void addTINData(std::string &bam_file, TINStats &tin_data);
diff --git a/src/bam_module.cpp b/src/bam_module.cpp
index 2d47cb4..058f831 100644
--- a/src/bam_module.cpp
+++ b/src/bam_module.cpp
@@ -146,6 +146,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
         }
 
          // Calculate statistics in batches
+         printMemoryUsage("Before batch processing");
          while (reader.hasNextRecord()){
             std::cout << "Generating " << thread_count << " thread(s)..." << std::endl;
             std::vector<std::thread> thread_vector;
@@ -169,6 +170,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
                 if (t.joinable()){
                     t.join();
                 }
+                printMemoryUsage("After thread " + std::to_string(thread_index));
                 thread_index++;
             }
             std::cout << "All threads joined." << std::endl;
@@ -245,6 +247,7 @@ void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unorder
     // Update the final output
     std::lock_guard<std::mutex> lock(output_mutex);
     final_output.add(record_output);
+    printMemoryUsage("After record processing");
 }
 
 std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_file, bool accepted_reads)
@@ -262,7 +265,10 @@ std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_fi
     std::stringstream ss(header);
     std::string field;
     // std::cout << "RRMS CSV header:" << std::endl;
-    while (std::getline(ss, field, ',')){
+
+    // Split the header fields
+    char delimiter = ',';
+    while (std::getline(ss, field, delimiter)){
         header_fields.push_back(field);
         // std::cout << field << std::endl;
     }
@@ -297,7 +303,7 @@ std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_fi
         std::vector<std::string> fields;
         std::string field;
         std::stringstream ss(line);
-        while (std::getline(ss, field, ',')){
+        while (std::getline(ss, field, delimiter)){
             fields.push_back(field);
         }
 
diff --git a/src/cli.py b/src/cli.py
index cbe2a7d..ebf10d6 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -222,9 +222,16 @@ def bam_module(margs):
         param_dict["ref"] = input_para.ref_genome = ref_genome
 
         # Set the base modification flag, and filtering threshold
-        param_dict["mod"] = input_para.mod_analysis = margs.mod
+        # param_dict["mod"] = input_para.mod_analysis = margs.mod
+        if margs.mod:
+            param_dict["mod"] = input_para.mod_analysis = True
+        else:
+            param_dict["mod"] = input_para.mod_analysis = False
+            
         mod_prob = margs.modprob
-        param_dict["modprob"] = input_para.base_mod_threshold = mod_prob
+        param_dict["modprob"] = mod_prob
+        input_para.base_mod_threshold = mod_prob
+        logging.info("Base modification threshold is set to " + str(input_para.base_mod_threshold))
 
         # Set the gene BED file for RNA-seq transcript analysis
         input_para.gene_bed = margs.genebed if margs.genebed != "" or margs.genebed is not None else ""
@@ -246,7 +253,7 @@ def bam_module(margs):
             plot_filepaths = plot(bam_output, param_dict, 'BAM')
 
             # Set the list of QC information to display
-            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"]
+            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"]
 
             # If base modifications were found, add the base modification plots
             # after the first table
@@ -301,6 +308,7 @@ def rrms_module(margs):
 
             # Set the output prefix
             param_dict["out_prefix"] = output_prefix + "rrms_" + ("accepted" if filter_type else "rejected")
+            param_dict["mod"] = input_para.mod_analysis = False  # Disable base modification analysis for RRMS (use BAM module for this)
 
             # Run the QC module
             logging.info("Running QC for " + ("accepted" if filter_type else "rejected") + " reads...")
@@ -311,10 +319,19 @@ def rrms_module(margs):
                 logging.info("Generating HTML report...")
                 plot_filepaths = plot(bam_output, param_dict, 'BAM')
 
+                # Set the list of QC information to display
+                qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"]
+
+                # If base modifications were found, add the base modification
+                # plots
+                if bam_output.sample_modified_base_count > 0:
+                    logging.info("Base modifications found. Adding base modification plots to the HTML report.")
+                    qc_info_list.insert(1, "read_length_mod_rates")
+                    qc_info_list.insert(1, "base_mods")
+
                 # Generate the HTML report
                 bam_html_gen = generate_html.ST_HTML_Generator(
-                    [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info",
-                    "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False)
+                    [qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False)
                 bam_html_gen.generate_html()
                 logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
@@ -350,7 +367,7 @@ def seqtxt_module(margs):
 
             report_title = "Basecall Summary QC"
             seqtxt_html_gen = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "basic_info"],
+                [["basic_st", "read_length_bar", "read_length_hist"],
                     report_title, param_dict], plot_filepaths, static=False)
                 
             seqtxt_html_gen.generate_html()
@@ -386,7 +403,7 @@ def fast5_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fast5_output, param_dict, 'FAST5')
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"], 
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"], 
                  "FAST5 QC", param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html()
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
@@ -432,7 +449,7 @@ def fast5_signal_module(margs):
             logging.info("Generating HTML report...")
             plot_filepaths = plot(fast5_output, param_dict, 'FAST5s')
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html(signal_plots=True)
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
@@ -500,7 +517,7 @@ def pod5_module(margs):
             # plot_filepaths = plot(read_signal_dict, param_dict, 'POD5')
             webpage_title = "POD5 QC"
             fast5_html_obj = generate_html.ST_HTML_Generator(
-                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
+                [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False)
             fast5_html_obj.generate_html(signal_plots=True)
             logging.info("Done. Output files are in %s", param_dict["output_folder"])
 
@@ -637,8 +654,8 @@ def set_file_parser_defaults(file_parser):
 bam_parser.add_argument("--genebed", type=str, default="",
                         help="Gene BED12 file required for calculating TIN scores from RNA-seq BAM files. Default: None.")
 
-bam_parser.add_argument("--modprob", type=float, default=0.8,
-                        help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.8.")
+bam_parser.add_argument("--modprob", type=float, default=0.5,
+                        help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.5.")
 
 bam_parser.add_argument("--ref", type=str, default="",
                         help="The reference genome FASTA file to use for identifying CpG sites.")
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 8762f45..585254d 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -104,10 +104,10 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
     bool read_ids_present = false;
     if (read_ids.size() > 0){
         read_ids_present = true;
-        printMessage("Filtering reads by read ID");
+        // printMessage("Filtering reads by read ID");
 
-        printMessage("Number of read IDs: " + std::to_string(read_ids.size()));
-        printMessage("First read ID: " + *read_ids.begin());
+        // printMessage("Number of read IDs: " + std::to_string(read_ids.size()));
+        // printMessage("First read ID: " + *read_ids.begin());
         // Check if the first read ID has any newlines, carriage returns, tabs,
         // or spaces
         if (read_ids.begin()->find_first_of("\n\r\t ") != std::string::npos) {
@@ -361,6 +361,14 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
     std::vector<int> read_lengths;  // Read lengths
     std::vector<double> read_mod_rates;  // Total base modification rate for each read length
     std::vector<std::unordered_map<char, double>> read_base_mod_rates;  // Type-specific base modification rates for each read length
+
+    // Keep track of number of modified bases on the primary alignment vs other
+    // alignments (secondary, supplementary, unmapped)
+    int num_modified_bases_primary = 0;
+    int num_modified_bases_unmapped = 0;
+    int num_modified_bases_secondary = 0;
+    int num_modified_bases_supplementary = 0;
+
     while (sam_read1(bam_file, bam_header, bam_record) >= 0) {
         num_reads++;
 
@@ -373,13 +381,32 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
             int read_length = bam_record->core.l_qseq;
             hts_base_mod_state *state = hts_base_mod_state_alloc();
             std::vector<std::pair<int32_t, int>> c_modified_positions;  // C-modified positions for CpG analysis (chr->(position, strand))
-            std::unordered_map<char, int> base_mod_counts;  // Type-specific base modification counts for the read
+            // std::unordered_map<char, int> base_mod_counts;  // Type-specific
+            // base modification counts for the alignment
+            std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts;  // Type-specific base modification counts (canonical base -> modified base -> count)
+            std::unordered_map<char, int> base_primary_count;  // Total base counts for the alignment
 
             // Parse the base modification tags if a primary alignment
             int read_mod_count = 0;
             int ret = bam_parse_basemod(bam_record, state);
+            bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
+
+            // Update the number of reads with base modifications for the
+            // primary alignment vs other alignments
             if (ret >= 0) {
-                bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
+                if (is_primary) {
+                    num_modified_bases_primary++;
+                } else if (bam_record->core.flag & BAM_FUNMAP) {
+                    num_modified_bases_unmapped++;
+                } else if (bam_record->core.flag & BAM_FSECONDARY) {
+                    num_modified_bases_secondary++;
+                } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) {
+                    num_modified_bases_supplementary++;
+                }
+            }
+
+            if (ret >= 0 && is_primary) {
+                // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
 
                 // Get the chromosome if alignments are present
                 bool alignments_present = true;
@@ -398,9 +425,11 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                 // but it always yields 0...)
                 int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0;
 
-                // Set strand to null (-1) if the read is not primary
-                if (!is_primary) {
-                    strand = -1;
+                // Get the number of each type of base for the read
+                uint8_t *seq = bam_get_seq(bam_record);
+                for (int i = 0; i < read_length; i++) {
+                    char base = seq_nt16_str[bam_seqi(seq, i)];
+                    base_primary_count[std::toupper(base)]++;
                 }
 
                 // Iterate over the state object to get the base modification tags
@@ -416,8 +445,9 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                         // Update the modified prediction counts
                         read_mod_count++;  // Read-specific count
                         final_output.modified_prediction_count++;  // Cumulative count
+                        char canonical_base_char = std::toupper(mods[i].canonical_base);
                         char mod_type = mods[i].modified_base;
-                        base_mod_counts[mod_type]++;  // Update the type-specific count
+                        // base_mod_counts[mod_type]++;  // Update the type-specific count
 
                         // Note: The modified base value can be a positive char (e.g. 'm',
                         // 'h') (DNA Mods DB) or negative integer (ChEBI ID):
@@ -436,11 +466,13 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                             // Update counts for predictions exceeding the threshold
                             if (probability >= base_mod_threshold) {
                                 final_output.updateBaseModCounts(mod_type, strand);  // Update the base modification counts
+                                // base_mod_counts[mod_type]++;  // Update the
+                                // type-specific count
+                                base_mod_counts[canonical_base_char][mod_type]++;  // Update the type-specific count
 
                                 // Store the modified positions for later CpG
                                 // analysis if a C modification on a primary alignment
-                                char canonical_base_char = std::toupper(mods[i].canonical_base);
-                                if (is_primary && canonical_base_char == 'C' && mod_type != 'C') {
+                                if (canonical_base_char == 'C' && mod_type != 'C') {
 
                                     // Convert the query position to reference position if available
                                     if (alignments_present) {
@@ -451,6 +483,9 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
                                     }
                                 }
                             }
+                            // } else {
+                            //     base_primary_count[mod_type]++;  // Update the type-specific unmodified count
+                            // }
                         }
                     }
                 }
@@ -474,26 +509,63 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
             hts_base_mod_state_free(state);  // Deallocate the base modification state object
 
             // Calculate the base modification rate for the read
-            double read_mod_rate = 0.0;
-            if (read_length > 0) {
-                read_mod_rate = (double) read_mod_count / read_length;
-            }
+            // double read_mod_rate = 0.0;
+            // if (read_length > 0) {
+            //     read_mod_rate = (double) read_mod_count / read_length;
+            // }
 
             // Calculate the type-specific base modification rates for the read
             std::unordered_map<char, double> base_mod_rates;
             for (auto const &it : base_mod_counts) {
-                char mod_type = it.first;
-                int mod_count = it.second;
+                char canonical_base = it.first;
+                const std::unordered_map<char, int> &mod_counts = it.second;
                 double mod_rate = 0.0;
-                if (read_length > 0) {
-                    mod_rate = (double) mod_count / read_length;
+                int total_base_count = base_primary_count[canonical_base];
+
+                // Calculate the modification rate for each modification type
+                for (auto const &it2 : mod_counts) {
+                    char mod_type = it2.first;
+                    int mod_count = it2.second;
+                    double mod_rate = 0.0;
+                    if (mod_count + total_base_count > 0) {
+                        mod_rate = (double) mod_count / total_base_count;
+                    }
+                    base_mod_rates[mod_type] = mod_rate;
                 }
-                base_mod_rates[mod_type] = mod_rate;
+                // for (auto const &it2 : mod_counts) {
+                //     total_mod_count += it2.second;
+                // }
+                // if (total_mod_count + total_base_count > 0) {
+                //     mod_rate = (double) total_mod_count / (total_mod_count + total_base_count);
+                // }
+                // base_mod_rates[canonical_base] = mod_rate;
             }
-            final_output.updateReadModRate(read_length, read_mod_rate, base_mod_rates);  // Update the output data
+            // for (auto const &it : base_mod_counts) {
+            //     char mod_type = it.first;
+            //     int mod_count = it.second;
+            //     double mod_rate = 0.0;
+            //     int total_base_count = base_primary_count[mod_type];
+            //     if (mod_count + unmod_count > 0) {
+            //         mod_rate = (double) mod_count / (mod_count + unmod_count);
+            //     }
+            //     // if (read_length > 0) {
+            //     //     mod_rate = (double) mod_count / read_length;
+            //     // }
+            //     base_mod_rates[mod_type] = mod_rate;
+            // }
+            final_output.updateReadModRate(read_length, base_mod_rates);  // Update the output data
         }
     }
 
+    // Summary of base modification counts
+    if (mod_analysis) {
+        printMessage("Base modification counts:");
+        printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary));
+        printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped));
+        printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary));
+        printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary));
+    }
+
     bam_destroy1(bam_record);
     bam_hdr_destroy(bam_header);
     sam_close(bam_file);
diff --git a/src/output_data.cpp b/src/output_data.cpp
index 02cd5a8..148fbeb 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -257,6 +257,18 @@ void Basic_Seq_Quality_Statistics::global_sum(){
 
 // BAM output constructor
 Output_BAM::Output_BAM(){
+    this->num_primary_alignment = 0;
+    this->num_secondary_alignment = 0;
+    this->num_supplementary_alignment = 0;
+    this->num_clip_bases = 0;
+    this->sample_modified_base_count = 0;
+    this->sample_modified_base_count_forward = 0;
+    this->sample_modified_base_count_reverse = 0;
+    this->forward_alignment = 0;
+    this->reverse_alignment = 0;
+    this->base_mod_counts = std::unordered_map<char, uint64_t>();
+    this->base_mod_counts_forward = std::unordered_map<char, uint64_t>();
+    this->base_mod_counts_reverse = std::unordered_map<char, uint64_t>();
 }
 
 Output_BAM::~Output_BAM(){
@@ -278,20 +290,29 @@ void Output_BAM::updateBaseModCounts(char mod_type, int strand)
     }
 }
 
-void Output_BAM::updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates) {
+void Output_BAM::updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates) {
     ReadModData read_mod_data;
     read_mod_data.read_length = read_length;
-    read_mod_data.mod_rate = read_mod_rate;
     read_mod_data.base_mod_rates = base_mod_rates;
     this->read_mod_data.push_back(read_mod_data);
 }
 
 std::vector<char> Output_BAM::getBaseModTypes()
 {
+    // Collect the distinct modification type characters observed in the sample
     std::vector<char> base_mod_types;
-    for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) {
-        base_mod_types.push_back(it->first);
+    if (this->base_mod_counts.empty()) {
+        printError("No base modification counts found.");
+        return base_mod_types;
     }
+
+    // Copy each modification type key into the result vector
+    for (const auto& it : this->base_mod_counts) {
+        base_mod_types.push_back(it.first);
+    }
+    // for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) {
+    //     base_mod_types.push_back(it->first);
+    // }
     return base_mod_types;
 }
 
@@ -305,11 +326,6 @@ int Output_BAM::getNthReadModLength(int read_index)
     return this->read_mod_data[read_index].read_length;
 }
 
-double Output_BAM::getNthReadModRate(int read_index)
-{
-    return this->read_mod_data[read_index].mod_rate;
-}
-
 double Output_BAM::getNthReadModRate(int read_index, char mod_type)
 {
     double mod_rate = 0.0;
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 2f25c7d..a47b958 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -20,8 +20,9 @@
 MAX_READ_QUALITY = 100
 PLOT_FONT_SIZE = 16
 
-# Return a dictionary of default plot filenames
+
 def getDefaultPlotFilenames():
+    """Create a default HTML plot data structure."""
     plot_filenames = {  # for fq/fa
         "read_length_distr": {'title': "Read Length", 'description': "Read Length Distribution"},  # for bam
         "read_alignments_bar": {'title': "Read Alignments",
@@ -31,8 +32,6 @@ def getDefaultPlotFilenames():
         "read_length_bar": {'title': "Read Length Statistics", 'description': "Read Length Statistics"},
         "base_counts": {'title': "Base Counts",
                     'description': "Base Counts", 'summary': ""},
-        "basic_info": {'title': "Basic Statistics",
-                       'description': "Basic Statistics", 'summary': ""},
         "read_length_hist": {'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""},
         
         "gc_content_hist": {'title': "GC Content Histogram", 'description': "GC Content Histogram", 'summary': ""},
@@ -49,8 +48,9 @@ def getDefaultPlotFilenames():
 
     return plot_filenames
 
-# Wrap the text in the table 
+
 def wrap(label):
+    """Wrap the label text."""
     # First split the string into a list of words
     words = label.split(' ')
 
@@ -71,8 +71,9 @@ def wrap(label):
 
     return new_label
 
-# Plot the read alignment numbers
+
 def plot_read_length_stats(output_data, file_type, plot_filepaths):
+    """Plot the read length statistics."""
 
     # Define the three categories
     category = ['N50', 'Mean', 'Median']
@@ -205,99 +206,10 @@ def plot_base_counts(output_data, filetype, plot_filepaths):
     layout = go.Layout(title='', xaxis=dict(title='Base'), yaxis=dict(title='Counts'), barmode='group', font=dict(size=PLOT_FONT_SIZE))
     fig = go.Figure(data=all_traces, layout=layout)
 
-    # Generate the HTML
-    # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
-
-    # return html_obj
-
     # Generate the HTML
     plot_filepaths['base_counts']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
     plot_filepaths['base_counts']['error_flag'] = error_flag
 
-def plot_basic_info(output_data, file_type, plot_filepaths):
-    """Plot basic information about the reads in bar chart format."""
-    html_obj = ''
-    if file_type == 'BAM':
-
-        # Create a bar trace for each type of data
-        bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads']
-        data_objects = [output_data.long_read_info, output_data.mapped_long_read_info, output_data.unmapped_long_read_info]
-
-        # Create subplots for each category
-        fig = make_subplots(rows=2, cols=2, subplot_titles=("Number of Reads", "Number of Bases", "Longest Read", "GC Content"), horizontal_spacing=0.3, vertical_spacing=0.2)
-
-        # Add traces for each category
-        key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length', 'gc_cnt']
-        error_flag = False
-        for i in range(4):
-            # Get the data for this category
-            key_name = key_list[i]
-
-            # Add the traces for each type of data
-            data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)]
-
-            # Set the error flag if any of the values are zero (except for unmapped reads)
-            # if data[0] == 0 or data[1] == 0 or data[2] == 0:
-            if data[0] == 0 or data[1] == 0:
-                error_flag = True
-
-            # Create the trace
-            trace = go.Bar(x=data, y=bar_titles, orientation='h')
-
-            # Add the trace to the figure
-            fig.add_trace(trace, row=(i // 2) + 1, col=(i % 2) + 1)
-            fig.update_layout(showlegend=False)
-
-        # Update the layout
-        fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE))
-
-        # Generate the HTML
-        # html_obj = fig.to_html(full_html=False, default_height=800,
-        # default_width=1200)
-        plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=800, default_width=1200)
-        plot_filepaths['basic_info']['error_flag'] = error_flag
-
-
-    elif file_type == 'SeqTxt':
-
-        # Create a bar trace for each type of data
-        bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads']
-        data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info]
-
-        # Create subplots for each category
-        fig = make_subplots(rows=1, cols=3, subplot_titles=("Number of Reads", "Number of Bases", "Longest Read"), horizontal_spacing=0.1)
-
-        # Add traces for each category
-        key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length']
-        error_flag = False
-        for i in range(3):
-            # Get the data for this category
-            key_name = key_list[i]
-
-            # Add the traces for each type of data
-            data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)]
-
-            # Set the error flag if any of the values are zero
-            if data[0] == 0 or data[1] == 0 or data[2] == 0:
-                error_flag = True
-
-            # Create the trace
-            trace = go.Bar(x=data, y=bar_titles, orientation='h')
-
-            # Add the trace to the figure
-            fig.add_trace(trace, row=1, col=i + 1)
-
-        # Update the layout
-        fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE))
-
-        # Generate the HTML
-        # html_obj = fig.to_html(full_html=False, default_height=500,
-        # default_width=1600)
-        plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1600)
-        plot_filepaths['basic_info']['error_flag'] = error_flag
-
-    return html_obj
-
 
 # Plot the read length histograms
 def read_lengths_histogram(data, font_size, plot_filepaths):
@@ -399,8 +311,6 @@ def read_lengths_histogram(data, font_size, plot_filepaths):
     fig.update_annotations(font_size=annotation_size)
 
     # Generate the HTML
-    # html_obj = fig.to_html(full_html=False, default_height=500,
-    # default_width=1200)
     plot_filepaths['read_length_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1200)
                            
 
@@ -542,7 +452,12 @@ def plot(output_data, para_dict, file_type):
     create_summary_table(output_data, plot_filepaths, file_type)  # Create the summary table
 
     # Modified base table and plots
-    if file_type == 'BAM' and para_dict["mod"] > 0:
+    # Default the modification flag if the caller did not provide it
+    # (older callers may omit the "mod" key entirely)
+    if "mod" not in para_dict:
+        para_dict["mod"] = False
+
+    if file_type == 'BAM' and para_dict["mod"]:
         # Output file for the read length vs. modification rates plot
         output_folder = para_dict["output_folder"]
         read_length_mod_rate_file = os.path.join(output_folder, 'read_length_hist.png')
@@ -555,6 +470,11 @@ def plot(output_data, para_dict, file_type):
             logging.warning("WARNING: Modified base table not created")
 
     # Create the TIN table if available
+    # Default the gene BED path if the caller did not provide it.
+    # An empty string disables TIN table creation below.
+    if "genebed" not in para_dict:
+        para_dict["genebed"] = ""
+
     if file_type == 'BAM' and para_dict["genebed"] != "":
         input_files = para_dict["input_files"]
         create_tin_table(output_data, input_files, plot_filepaths)
@@ -565,16 +485,8 @@ def plot(output_data, para_dict, file_type):
         else:
             logging.warning("WARNING: TIN table not created")
 
-    # Generate plots
-    # plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data,
-    # file_type)
     plot_base_counts(output_data, file_type, plot_filepaths)
 
-    # Plot basic information
-    # plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data,
-    # file_type)
-    plot_basic_info(output_data, file_type, plot_filepaths)
-
     # Read length histogram
     if file_type == 'SeqTxt':
         long_read_data = output_data.all_long_read_info.long_read_info
@@ -582,29 +494,16 @@
         long_read_data = output_data.long_read_info
 
     if file_type != 'FAST5s':
-        # plot_filepaths['read_length_hist']['dynamic'] =
-        # read_lengths_histogram(long_read_data, font_size)
         read_lengths_histogram(long_read_data, font_size, plot_filepaths)
-
-        # plot_filepaths['read_length_bar']['dynamic'] =
-        # plot_read_length_stats(output_data, file_type)
         plot_read_length_stats(output_data, file_type, plot_filepaths)
 
     # GC content histogram
     if file_type != 'FAST5s' and file_type != 'SeqTxt':
         if file_type == 'BAM':
-            # plot_filepaths['gc_content_hist']['dynamic'] =
-            # read_gc_content_histogram(output_data.mapped_long_read_info,
-            # font_size)
             read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths)
         elif file_type == 'SeqTxt':
-            # plot_filepaths['gc_content_hist']['dynamic'] =
-            # read_gc_content_histogram(output_data.passed_long_read_info.long_read_info,
-            # font_size)
             read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths)
         else:
-            # plot_filepaths['gc_content_hist']['dynamic'] =
-            # read_gc_content_histogram(output_data.long_read_info, font_size)
             read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths)
 
     # Base quality histogram
@@ -613,25 +511,17 @@ def plot(output_data, para_dict, file_type):
 
         # Base quality histogram
         base_quality(seq_quality_info, font_size, plot_filepaths)
-        # plot_filepaths['base_quality']['dynamic'] =
-        # base_quality(seq_quality_info, font_size)
         
     # Read average base quality histogram
     if file_type == 'FASTQ':
-        # read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size)
-        # plot_filepaths['read_avg_base_quality']['dynamic'] =
-        # read_quality_dynamic
         read_avg_base_quality(seq_quality_info, font_size, plot_filepaths)
 
     if file_type == 'BAM':
         # Plot read alignment QC
         plot_alignment_numbers(output_data, plot_filepaths)
-        # plot_filepaths['read_alignments_bar']['dynamic'] =
-        # plot_alignment_numbers(output_data)
         
         # Plot base alignment and error QC
         plot_errors(output_data, plot_filepaths)
-        # plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data)
         
     elif file_type == 'FAST5s':
         plot_filepaths['ont_signal']['dynamic'] = plot_signal(output_data, para_dict)
@@ -1114,6 +1004,12 @@ def create_summary_table(output_data, plot_filepaths, file_type):
     plot_filepaths["basic_st"]['error_flag'] = table_error_flag
 
 
+def get_axis_name(row, axis_type='x'):
+    """Get the axis name for the plot."""
+    axis_number = row + 1
+    return f"{axis_type}axis{axis_number}" if axis_number > 1 else f"{axis_type}axis"
+
+
 def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold):
     """Create a summary table for the base modifications."""
     plot_filepaths["base_mods"] = {}
@@ -1125,6 +1021,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     # Print the types of modifications
     logging.info("Getting base modification types")
     base_mod_types = output_data.getBaseModTypes()
+    # Log the detected modification types below
     if base_mod_types:
         logging.info("Modification types: ")
         for mod_type in base_mod_types:
@@ -1146,9 +1043,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
         # Get the read length vs. base modification rate data for each
         # modification type in the sampled reads
-        # for i in range(read_mod_data_size):
-        #     if i not in read_indices:
-        #         continue
         for i in read_indices:
             for mod_type in base_mod_types:
                 if mod_type not in read_length_mod_rates:
@@ -1174,10 +1068,20 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         # Create a plot of read length vs. base modification rate for each
         # modification type
         # Make subplots vertically for each modification type
-        fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1)
+        subplot_titles = []
+        for mod_type in base_mod_types:
+            try:
+                mod_name = mod_char_to_name[mod_type]
+            except KeyError:
+                logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
+                mod_name = mod_type
+
+            subplot_titles.append('Read Length vs. {} Modification Rate'.format(mod_name))
+            
+            
+        fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=subplot_titles)
         min_x = float('inf')
         max_x = 0
-        # for mod_type in base_mod_types:
         for i, mod_type in enumerate(base_mod_types):
 
             # Format the data
@@ -1185,17 +1089,24 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
             mod_rates = [data[1] * 100 for data in mod_data]
             x_vals = [data[0] for data in mod_data]
 
+            # Remove outlier read lengths using the IQR method
+            if len(x_vals) > 1:
+                x_vals_np = np.array(x_vals)
+                q1 = np.percentile(x_vals_np, 25)
+                q3 = np.percentile(x_vals_np, 75)
+                iqr = q3 - q1
+                lower_bound = q1 - 1.5 * iqr
+                upper_bound = q3 + 1.5 * iqr
+
+                # Filter the data to remove outliers
+                filtered_data = [(x, y) for x, y in zip(x_vals, mod_rates) if lower_bound <= x <= upper_bound]
+                x_vals, mod_rates = zip(*filtered_data)
+            
             # Generate evenly-spaced x values and labels (10 ticks across the
             # range) with the read lengths being a multiple of 1000
             x_tick_values = np.linspace(0, max(x_vals), num=10)
             read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values]
 
-            # read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals]
-
-            # Update the min and max x values
-            # min_x = min(min_x, *x_vals)
-            # max_x = max(max_x, *x_vals)
-
             # Get the modification name
             try:
                 mod_name = mod_char_to_name[mod_type]
@@ -1203,26 +1114,19 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                 logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
                 mod_name = mod_type
 
-            fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name), row=i + 1, col=1)
+            fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1)
 
             # Update the layout
-            fig.update_layout(title='Read Length vs. {} Modification Rate'.format(mod_name),
-                            xaxis_title='Read Length',
-                            yaxis_title='Modification Rate (%)',
-                            showlegend=False,
-                            yaxis=dict(range=[0, 100]),
-                            xaxis=dict(tickvals=x_tick_values, ticktext=read_lengths, range=[0, max(x_vals)]),
-                            font=dict(size=PLOT_FONT_SIZE))
+            x_axis_name = get_axis_name(i)
+            y_axis_name = get_axis_name(i, 'y')
+            logging.info("Index: {}, Y index: {}".format(i, y_axis_name))
             
-            # Get the X tick values generated by Plotly and format the read lengths
-            # x_tick_values = fig.layout.x
-            # if x_tick_values:
-            #     read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values]
-
-            # # Update the X tick labels
-            # fig.update_xaxes(tickvals=x_tick_values, ticktext=read_lengths, row=i + 1, col=1)
+            # Auto range the axes
+            fig.update_layout(
+                **{f"{x_axis_name}_title": 'Read Length (bp)',
+                    f"{y_axis_name}_title": 'Modification Rate (%)'})
             
-            # xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]),
+        fig.update_layout(font=dict(size=PLOT_FONT_SIZE))
 
         # Save the plot image
         # if len(base_mod_types) > 0:
@@ -1234,15 +1138,16 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         # html_obj = fig.to_html(full_html=False, default_height=500,
         # default_width=700)
         if len(base_mod_types) > 0:
+            plot_height = 500 * len(base_mod_types)
             logging.info("Saving the read length vs. modification rates plot")
-            plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
+            plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=700)
         # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
     else:
         logging.warning("WARNING: No modification types found")
 
     # Create the base modification statistics table
     table_str = "<table>\n<tbody>"
-    row_str, row_flag = format_row("Total Predictions", [output_data.modified_prediction_count], 'int', None)
+    row_str, row_flag = format_row("Total Unfiltered Predictions", [output_data.modified_prediction_count], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
@@ -1262,22 +1167,14 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None)
+    row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None)
+    row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    # table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count)
-    # table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold)
-    # table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count)
-    # table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward)
-    # table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse)
-    # table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count)
-    # table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count)
-
     # Add the modification type data
     for mod_type in base_mod_types:
         # mod_name = mod_char_to_name[mod_type]
@@ -1291,20 +1188,17 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         mod_count_fwd = output_data.getModTypeCount(mod_type, 0)
         mod_count_rev = output_data.getModTypeCount(mod_type, 1)
 
-        row_str, row_flag = format_row("Total {} Sites in the Sample".format(mod_name), [mod_count], 'int', None)
+        row_str, row_flag = format_row("Total {} Counts in the Sample".format(mod_name), [mod_count], 'int', None)
         table_str += row_str
         table_error_flag = table_error_flag or row_flag
 
-        row_str, row_flag = format_row("Total {} Sites in the Sample (Forward Strand)".format(mod_name), [mod_count_fwd], 'int', None)
+        row_str, row_flag = format_row("Total {} Counts in the Sample (Forward Strand)".format(mod_name), [mod_count_fwd], 'int', None)
         table_str += row_str
         table_error_flag = table_error_flag or row_flag
 
-        row_str, row_flag = format_row("Total {} Sites in the Sample (Reverse Strand)".format(mod_name), [mod_count_rev], 'int', None)
+        row_str, row_flag = format_row("Total {} Counts in the Sample (Reverse Strand)".format(mod_name), [mod_count_rev], 'int', None)
         table_str += row_str
         table_error_flag = table_error_flag or row_flag
-        # table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count)
-        # table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd)
-        # table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev)
 
     # Finish the table
     table_str += "\n</tbody>\n</table>"
@@ -1320,7 +1214,7 @@ def create_tin_table(output_data, input_files, plot_filepaths):
 
     # Create a table with the first column showing the BAM filepath, and the
     # following columns showing TIN count, mean, median, and standard deviation
-    table_str = "<table>\n<thead>\n<tr><th>BAM File</th><th>Count</th><th>Mean</th><th>Median</th><th>StdDev</th></tr>\n</thead>"
+    table_str = "<table>\n<thead>\n<tr><th>BAM File</th><th>Median TIN Score</th><th>Number of Transcripts</th></tr>\n</thead>"
     table_str += "\n<tbody>"
     
     # Loop through each BAM file
@@ -1330,18 +1224,18 @@ def create_tin_table(output_data, input_files, plot_filepaths):
         bam_filename = os.path.basename(bam_file)
 
         # Get the file data
-        tin_count = output_data.getTINCount(bam_file)
-        tin_mean = output_data.getTINMean(bam_file)
+        # tin_count = output_data.getTINCount(bam_file)
+        # tin_mean = output_data.getTINMean(bam_file)
         tin_median = output_data.getTINMedian(bam_file)
-        tin_std = output_data.getTINStdDev(bam_file)
+        # tin_std = output_data.getTINStdDev(bam_file)
 
         # Add the data to the table
-        row_str, row_flag = format_row(bam_filename, [tin_count, tin_mean, tin_median, tin_std], 'float', None)
+        # row_str, row_flag = format_row(bam_filename, [tin_count, tin_mean,
+        # tin_median, tin_std], 'float', None)
+        row_str, row_flag = format_row(bam_filename, [tin_median, output_data.getTINCount(bam_file)], 'float', None)
         table_str += row_str
         error_flag = error_flag or row_flag
 
-        # table_str += "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>".format(bam_filename, tin_count, tin_mean, tin_median, tin_std)
-
     table_str += "\n</tbody>\n</table>"
 
     # Add the table to the plot filepaths

From bf61e23c7546fbeb5e546879d66a39abcad3eb58 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 12 Jan 2025 19:04:37 -0500
Subject: [PATCH 13/25] Work on help icon and fix forward vs reverse counts

---
 src/generate_html.py |  45 +++++++++++++++--
 src/hts_reader.cpp   |   4 +-
 src/plot_utils.py    | 112 +++++++++++++++++++++++++++++--------------
 3 files changed, 121 insertions(+), 40 deletions(-)

diff --git a/src/generate_html.py b/src/generate_html.py
index 4cc3cf0..dce7237 100644
--- a/src/generate_html.py
+++ b/src/generate_html.py
@@ -218,6 +218,48 @@ def generate_header(self):
   li {
   margin: 10px 0;
   }
+.help-icon {
+    position: relative;
+    display: inline-block;
+    cursor: pointer;
+    color: #555;
+    font-size: 18px; /* Adjust size of the icon */
+    margin-top: 10px; /* Adjust spacing if needed */
+}
+
+.help-icon:hover .tooltip {
+    visibility: visible;
+    opacity: 1;
+}
+
+.tooltip {
+    visibility: hidden;
+    width: 200px;
+    background-color: #333;
+    color: #fff;
+    text-align: left;
+    border-radius: 4px;
+    padding: 8px;
+    font-size: 14px;
+    position: absolute;
+    top: 50%; /* Position the tooltip */
+    left: 120%; /* Position the tooltip */
+    transform: translateY(-50%);
+    opacity: 0;
+    transition: opacity 0.3s;
+    z-index: 1;
+}
+
+.tooltip::after {
+    content: '';
+    position: absolute;
+    top: 50%; /* Position the arrow in the middle of the tooltip */
+    left: 0; /* Position the arrow on the left edge of the tooltip */
+    transform: translateY(-50%);
+    border-width: 5px;
+    border-style: solid;
+    border-color: #333 transparent transparent transparent;
+}
       </style>''')
 
         self.html_writer.write("</head>")
@@ -306,9 +348,6 @@ def generate_right(self):
         self.html_writer.write('<div class="module">')
         self.html_writer.write('<h2 id="lrst' + str(key_index) + '">File Count = ' + str(
             len(self.input_para["input_files"])) + '</h2><p>')
-        # for _af in self.input_para["input_files"]:
-        #     self.html_writer.write("<br/>" + _af)
-        # Write the input files in format "1.\tfile1\n2.\tfile2\n..."
         self.html_writer.write("<br/>" + "<br/>".join([f"{i+1}.\t{af}" for i, af in enumerate(self.input_para["input_files"])]))
         self.html_writer.write('</p></div>')
         key_index += 1
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 585254d..6c4db22 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -218,9 +218,9 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
 
                 // Determine if this is a forward or reverse read
                 if (record->core.flag & BAM_FREVERSE) {
-                    output_data.forward_alignment++;
-                } else {
                     output_data.reverse_alignment++;
+                } else {
+                    output_data.forward_alignment++;
                 }
 
                 // Loop through the cigar string and count the number of insertions, deletions, and matches
diff --git a/src/plot_utils.py b/src/plot_utils.py
index a47b958..f9b8239 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -211,8 +211,8 @@ def plot_base_counts(output_data, filetype, plot_filepaths):
     plot_filepaths['base_counts']['error_flag'] = error_flag
 
 
-# Plot the read length histograms
 def read_lengths_histogram(data, font_size, plot_filepaths):
+    """Plot the read length histogram."""
     linear_bin_count = 10
     log_bin_count = 10
 
@@ -319,6 +319,22 @@ def read_gc_content_histogram(data, font_size, plot_filepaths):
     bin_size = 1
     gc_content = np.array(data.read_gc_content_count)
 
+    # Calculate the percentage of reads with a GC content of <30%
+    gc_content_below_30 = np.sum(gc_content[:30])
+    logging.info("[TEST] Percentage of reads with GC content <30%: {}".format(gc_content_below_30 / np.sum(gc_content)))
+
+    # Calculate the percentage of reads with a GC content of >70%
+    gc_content_above_70 = np.sum(gc_content[70:])
+    logging.info("[TEST] Percentage of reads with GC content >70%: {}".format(gc_content_above_70 / np.sum(gc_content)))
+
+    # Calculate the percentage of reads with a GC content of <20%
+    gc_content_below_20 = np.sum(gc_content[:20])
+    logging.info("[TEST] Percentage of reads with GC content <20%: {}".format(gc_content_below_20 / np.sum(gc_content)))
+
+    # Calculate the percentage of reads with a GC content of >60%
+    gc_content_above_60 = np.sum(gc_content[60:])
+    logging.info("[TEST] Percentage of reads with GC content >60%: {}".format(gc_content_above_60 / np.sum(gc_content)))
+
     # Set the error flag if the GC content is below 20% for more than 10% of the
     # reads
     error_flag = False
@@ -357,7 +373,6 @@ def read_gc_content_histogram(data, font_size, plot_filepaths):
     fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0)
     fig.update_layout(font=dict(size=PLOT_FONT_SIZE))  # Set font size
 
-    # return fig.to_html(full_html=False, default_height=500, default_width=700)
     plot_filepaths['gc_content_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700)
     plot_filepaths['gc_content_hist']['error_flag'] = error_flag
 
@@ -447,6 +462,7 @@ def plot_base_modifications(base_modifications):
 
 def plot(output_data, para_dict, file_type):
     """Generate the plots for the output data."""
+    logging.info("Generating plots for file type: {}".format(file_type))
     plot_filepaths = getDefaultPlotFilenames()
     font_size = 14  # Font size for the plots
     create_summary_table(output_data, plot_filepaths, file_type)  # Create the summary table
@@ -498,12 +514,12 @@ def plot(output_data, para_dict, file_type):
         plot_read_length_stats(output_data, file_type, plot_filepaths)
 
     # GC content histogram
-    if file_type != 'FAST5s' and file_type != 'SeqTxt':
-            read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths)
-        elif file_type == 'SeqTxt':
-            read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths)
-        else:
-            read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths)
+    if file_type == 'BAM':
+        read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths)
+    elif file_type == 'SeqTxt':
+        read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths)
+    elif file_type == 'FASTQ' or file_type == 'FASTA':
+        read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths)
 
     # Base quality histogram
     if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt':
@@ -1000,6 +1016,12 @@ def create_summary_table(output_data, plot_filepaths, file_type):
         table_error_flag = table_error_flag or row_flag
         
     table_str += "\n</tbody>\n</table>"
+    # table_str += """
+    #     <div class="help-icon">
+    #         💡
+    #         <div class="tooltip">This is your help text explaining the feature!</div>
+    #     </div>
+    #     """
     plot_filepaths["basic_st"]['detail'] = table_str
     plot_filepaths["basic_st"]['error_flag'] = table_error_flag
 
@@ -1012,6 +1034,13 @@ def get_axis_name(row, axis_type='x'):
 
 def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold):
     """Create a summary table for the base modifications."""
+    help_text = "Total unfiltered predictions are all predictions prior to applying the base modification probability threshold.\n" \
+                "This threshold is set by the user (default: 0.5) and is used to filter out low-confidence base modifications.\n" \
+                "Total modification counts are the number of base modifications that pass the threshold.\n" \
+                "These counts are also separated by forward and reverse strand predictions.\n" \
+                "CpG modification counts are the total CpG modifications that pass the threshold.\n" \
+                "These are total counts and not site-specific counts."
+
     plot_filepaths["base_mods"] = {}
     plot_filepaths["base_mods"]['file'] = ""
     plot_filepaths["base_mods"]['title'] = "Base Modifications"
@@ -1101,11 +1130,13 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                 # Filter the data to remove outliers
                 filtered_data = [(x, y) for x, y in zip(x_vals, mod_rates) if lower_bound <= x <= upper_bound]
                 x_vals, mod_rates = zip(*filtered_data)
+
+            # Normalize the read lengths to the maximum read length (0-100)
+            x_vals = [100 * x / max(x_vals) for x in x_vals]
             
-            # Generate evenly-spaced x values and labels (10 ticks across the
-            # range) with the read lengths being a multiple of 1000
-            x_tick_values = np.linspace(0, max(x_vals), num=10)
-            read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values]
+            # Use 0-100 for the x-axis ticks and labels
+            x_tick_values = np.arange(0, 101, 10)
+            x_tick_labels = ['{:,}%'.format(int(val)) for val in x_tick_values]
 
             # Get the modification name
             try:
@@ -1114,7 +1145,10 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                 logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
                 mod_name = mod_type
 
-            fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1)
+            # fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1)
+
+            # Create a heatmap plot
+            fig.add_trace(go.Histogram2dContour(x=x_vals, y=mod_rates, colorscale='Viridis', showlegend=False), row=i + 1, col=1)
 
             # Update the layout
             x_axis_name = get_axis_name(i)
@@ -1123,25 +1157,21 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
             
             # Auto range the axes
             fig.update_layout(
-                **{f"{x_axis_name}_title": 'Read Length (bp)',
-                    f"{y_axis_name}_title": 'Modification Rate (%)'})
+                **{f"{x_axis_name}_title": 'Normalized Read Length (%)',
+                    f"{y_axis_name}_title": 'Modification Rate (%)'},
+                **{f"{x_axis_name}_tickmode": 'array',
+                    f"{x_axis_name}_tickvals": x_tick_values,
+                    f"{x_axis_name}_ticktext": x_tick_labels},
+                **{f"{y_axis_name}_range": [0, 100]}
+            )
             
         fig.update_layout(font=dict(size=PLOT_FONT_SIZE))
-
-        # Save the plot image
-        # if len(base_mod_types) > 0:
-        #     fig_file = plot_filepaths["read_length_mod_rates"]['file']
-        #     logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file))
-        #     fig.write_image(fig_file, format='png', width=700, height=500)
             
         # Generate the HTML
-        # html_obj = fig.to_html(full_html=False, default_height=500,
-        # default_width=700)
         if len(base_mod_types) > 0:
             plot_height = 500 * len(base_mod_types)
             logging.info("Saving the read length vs. modification rates plot")
             plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=700)
-        # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj
     else:
         logging.warning("WARNING: No modification types found")
 
@@ -1155,23 +1185,23 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total Modified Bases in the Sample", [output_data.sample_modified_base_count], 'int', None)
+    row_str, row_flag = format_row("Total Modification Counts", [output_data.sample_modified_base_count], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total in the Forward Strand", [output_data.sample_modified_base_count_forward], 'int', None)
+    row_str, row_flag = format_row("Total Modification Counts (Forward Strand Only)", [output_data.sample_modified_base_count_forward], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total in the Reverse Strand", [output_data.sample_modified_base_count_reverse], 'int', None)
+    row_str, row_flag = format_row("Total Modification Counts (Reverse Strand Only)", [output_data.sample_modified_base_count_reverse], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None)
+    row_str, row_flag = format_row("Total CpG Modification Counts (Forward Strand Only)", [output_data.sample_cpg_forward_count], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
-    row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None)
+    row_str, row_flag = format_row("Total CpG Modification Counts (Reverse Strand Only)", [output_data.sample_cpg_reverse_count], 'int', None)
     table_str += row_str
     table_error_flag = table_error_flag or row_flag
 
@@ -1202,6 +1232,21 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
     # Finish the table
     table_str += "\n</tbody>\n</table>"
+
+    # Add the help text
+    table_str += """
+        <div class="help-icon">
+            💡
+            <div class="tooltip">{}</div>
+        </div>
+        """.format(help_text)
+    
+    # Add text below the table suggesting the user to use Modkit for more
+    # detailed analysis on per-site modification rates
+    table_str += "<p><i>For per-site modification rates, please use \
+        <a href=\"https://github.com/nanoporetech/modkit\">Modkit</a> by Oxford Nanopore Technologies.</i></p>"
+
+
     plot_filepaths["base_mods"]['detail'] = table_str
     plot_filepaths["base_mods"]['error_flag'] = table_error_flag
 
@@ -1276,6 +1321,9 @@ def plot_alignment_numbers(data, plot_filepaths):
     # Set the error flag if primary alignments equal 0
     error_flag = data.num_primary_alignment == 0
 
+    logging.info("[TEST] Number of reverse alignments: {}".format(data.reverse_alignment))
+    logging.info("[TEST] Number of forward alignments: {}".format(data.forward_alignment))
+
     # Create a horizontally aligned bar plot trace from the data using plotly
     trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment,
                       data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment,
@@ -1291,12 +1339,6 @@ def plot_alignment_numbers(data, plot_filepaths):
     # Create the figure object
     fig = go.Figure(data=[trace], layout=layout)
 
-    # Generate the HTML object for the plot
-    # html_obj = fig.to_html(full_html=False, default_height=500,
-    # default_width=1000)
-
-    # return html_obj, error_flag
-
     # Update the HTML data for the plot
     plot_filepaths['read_alignments_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1000)
     plot_filepaths['read_alignments_bar']['error_flag'] = error_flag

From 66e2b8a74afc4dfba924abee2e3ff67233459673 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 14 Jan 2025 17:07:57 -0500
Subject: [PATCH 14/25] Work on pct read length vs mod prob

---
 include/hts_reader.h  |   5 +-
 include/output_data.h |   5 +
 include/tin.h         |   2 +-
 src/bam_module.cpp    |  25 ++-
 src/hts_reader.cpp    | 389 +++++++++++++++++++++++-------------------
 src/output_data.cpp   |  40 +++++
 src/plot_utils.py     | 157 +++++++----------
 src/tin.cpp           |  12 +-
 8 files changed, 353 insertions(+), 282 deletions(-)

diff --git a/include/hts_reader.h b/include/hts_reader.h
index 00900b2..5d3a628 100644
--- a/include/hts_reader.h
+++ b/include/hts_reader.h
@@ -47,7 +47,10 @@ class HTSReader {
         bool hasNextRecord();
 
         // Return the number of records in the BAM file using the BAM index
-        int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold);
+        int getNumRecords(const std::string &bam_file_name, int thread_count);
+
+        // Run base modification analysis
+        void runBaseModificationAnalysis(const std::string &bam_filename, Output_BAM& final_output, double base_mod_threshold, int read_count, int sample_count, int thread_count);
 
         std::map<int, int> getQueryToRefMap(bam1_t* record);
 
diff --git a/include/output_data.h b/include/output_data.h
index fa66cdb..bca521f 100644
--- a/include/output_data.h
+++ b/include/output_data.h
@@ -207,6 +207,8 @@ class Output_BAM : public Output_FQ
       std::unordered_map<char, uint64_t> base_mod_counts_forward;  // Counts for each base modification type exceeding the threshold on the forward strand
       std::unordered_map<char, uint64_t> base_mod_counts_reverse;  // Counts for each base modification type exceeding the threshold on the reverse strand
 
+      std::unordered_map<char, std::vector<std::pair<double, double>>> read_pct_len_vs_mod_prob;  // Read length (%) vs. base modification probability for each base modification type
+
       // Signal data section
       int read_count = ZeroDefault;
       int base_count = ZeroDefault;
@@ -231,6 +233,8 @@ class Output_BAM : public Output_FQ
       double getNthReadModRate(int read_index, char mod_type);  // Get the base modification rate for the nth read for a specific base modification type
       uint64_t getModTypeCount(char mod_type);  // Get the count of a specific base modification type
       uint64_t getModTypeCount(char mod_type, int strand);  // Get the count of a specific base modification type for a specific strand
+      double getNthReadLenPct(int read_index, char mod_type);  // Get the read length percentage for the nth read for a specific base modification type
+      double getNthReadModProb(int read_index, char mod_type);  // Get the base modification probability for the nth read for a specific base modification type
 
       // POD5 signal data functions
       int getReadCount();
@@ -241,6 +245,7 @@ class Output_BAM : public Output_FQ
       int getReadSequenceEnd(std::string read_id);
 
       void updateBaseModCounts(char mod_type, int strand);  // Update base modification counts for predictions exceeding the threshold
+      void updateBaseModProbabilities(char mod_type, double pct_len, double probability);  // Update base modification probabilities
       void updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates);  // Update read length vs. base modification rate data
 
       // Add TIN data for a single BAM file
diff --git a/include/tin.h b/include/tin.h
index b7b73f6..195596e 100644
--- a/include/tin.h
+++ b/include/tin.h
@@ -15,7 +15,7 @@ typedef std::unordered_map<std::string, std::tuple<std::string, int, int, double
 
 // Calculate the TIN score for each transcript in the gene BED file
 // (Reference: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0922-z#Sec11)
-void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder);
+void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count);
 
 std::unordered_map<int, int> getReadDepths(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::string chr, int start, int end);
 
diff --git a/src/bam_module.cpp b/src/bam_module.cpp
index 058f831..d2a96eb 100644
--- a/src/bam_module.cpp
+++ b/src/bam_module.cpp
@@ -80,7 +80,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
             std::cout << "Calculating TIN scores for file: " << filepath << std::endl;
 
             TINStats tin_stats;
-            calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder);
+            calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder, input_params.threads);
 
             // Print the TIN stats
             std::cout << "Number of transcripts: " << tin_stats.num_transcripts << std::endl;
@@ -113,7 +113,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
         // process base modifications and TINs if available.
         // Note: This section utilizes one thread.
         std::cout << "Getting number of records..." << std::endl;
-        int num_records = reader.getNumRecords(filepath, final_output, mod_analysis, base_mod_threshold);
+        int num_records = reader.getNumRecords(filepath, thread_count);
         std::cout << "Number of records = " << num_records << std::endl;
 
         // Exit if there are no records
@@ -123,6 +123,13 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
             return exit_code;
         }
 
+        // Run base modification analysis if the flag is set
+        if (mod_analysis){
+            std::cout << "Running base modification analysis..." << std::endl;
+            int sample_count = 10000;
+            reader.runBaseModificationAnalysis(filepath, final_output, base_mod_threshold, num_records, sample_count, thread_count);
+        }
+
         // Determine the batch sizes if the user-specified thread count is greater than 1
         int batch_size = 0;
         if (thread_count > 1) {
@@ -147,7 +154,14 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
 
          // Calculate statistics in batches
          printMemoryUsage("Before batch processing");
+
+        // TEST
+        // int max_reads = 10;
+        // int current_reads = 0;
+         
          while (reader.hasNextRecord()){
+        // while (current_reads < max_reads && reader.hasNextRecord()){
+            // Read the next batch of records
             std::cout << "Generating " << thread_count << " thread(s)..." << std::endl;
             std::vector<std::thread> thread_vector;
             for (int thread_index=0; thread_index<thread_count; thread_index++){
@@ -172,6 +186,9 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
                 }
                 printMemoryUsage("After thread " + std::to_string(thread_index));
                 thread_index++;
+
+                // TEST - Increment the current reads
+                // current_reads += batch_size;
             }
             std::cout << "All threads joined." << std::endl;
         }
@@ -219,15 +236,13 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
     std::cout << "Calculating summary QC..." << std::endl;
     final_output.global_sum();
     std::cout << "QC complete" << std::endl;
-
-    // Save the summary statistics to a file
     std::cout << "Saving summary statistics to file..." << std::endl;
 
     // If in RRMS mode, append RRMS accepted/rejected to the output prefix
     std::string output_prefix = "bam";
     if (input_params.rrms_csv != ""){
         output_prefix += input_params.rrms_filter ? "_rrms_accepted" : "_rrms_rejected";
-    } 
+    }
     std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.txt";
     final_output.save_summary(summary_filepath, input_params, final_output);
     std::cout << "Saved file: " << summary_filepath << std::endl;
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 6c4db22..01e0524 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -351,16 +351,40 @@ bool HTSReader::hasNextRecord(){
 }
 
 // Return the number of records in the BAM file using the BAM index
-int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold) {
+int HTSReader::getNumRecords(const std::string& bam_filename, int thread_count) {
     samFile* bam_file = sam_open(bam_filename.c_str(), "r");
+    hts_set_threads(bam_file, thread_count);  // Enable multi-threading
+    bam_hdr_t* bam_header = sam_hdr_read(bam_file);
+    bam1_t* bam_record = bam_init1();
+    int num_reads = 0;
+    while (sam_read1(bam_file, bam_header, bam_record) >= 0) {
+        num_reads++;
+    }
+
+    return num_reads;
+}
+
+void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Output_BAM &final_output, double base_mod_threshold, int read_count, int sample_count, int thread_count)
+{
+    samFile* bam_file = sam_open(bam_filename.c_str(), "r");
+    hts_set_threads(bam_file, thread_count);  // Enable multi-threading
     bam_hdr_t* bam_header = sam_hdr_read(bam_file);
     bam1_t* bam_record = bam_init1();
     int64_t num_reads = 0;
 
-    // Data structure for storing read length vs. base modification rate
-    std::vector<int> read_lengths;  // Read lengths
-    std::vector<double> read_mod_rates;  // Total base modification rate for each read length
-    std::vector<std::unordered_map<char, double>> read_base_mod_rates;  // Type-specific base modification rates for each read length
+    // Create a list of read indices to sample, and only keep the first
+    // sample_count reads
+    std::vector<int> read_indices;
+    for (int i = 0; i < read_count; i++) {
+        read_indices.push_back(i);
+    }
+    std::random_shuffle(read_indices.begin(), read_indices.end());
+    read_indices.resize(sample_count);
+
+    // Convert to a set for fast lookup
+    std::unordered_set<int> read_indices_set(read_indices.begin(), read_indices.end());
+
+    std::cout << "Number of sampled reads = " << read_indices_set.size() << std::endl;
 
     // Keep track of number of modified bases on the primary alignment vs other
     // alignments (secondary, supplementary, unmapped)
@@ -370,207 +394,214 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f
     int num_modified_bases_supplementary = 0;
 
     while (sam_read1(bam_file, bam_header, bam_record) >= 0) {
+
+        if (read_indices_set.find(num_reads) == read_indices_set.end()) {
+            num_reads++;
+            continue;
+        }
         num_reads++;
 
-        if (mod_analysis) {
-
-            // Base modification tag analysis
-            // Follow here to get base modification tags:
-            // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c
-            // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274
-            int read_length = bam_record->core.l_qseq;
-            hts_base_mod_state *state = hts_base_mod_state_alloc();
-            std::vector<std::pair<int32_t, int>> c_modified_positions;  // C-modified positions for CpG analysis (chr->(position, strand))
-            // std::unordered_map<char, int> base_mod_counts;  // Type-specific
-            // base modification counts for the alignment
-            std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts;  // Type-specific base modification counts (canonical base -> modified base -> count)
-            std::unordered_map<char, int> base_primary_count;  // Total base counts for the alignment
-
-            // Parse the base modification tags if a primary alignment
-            int read_mod_count = 0;
-            int ret = bam_parse_basemod(bam_record, state);
-            bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
-
-            // Update the number of reads with base modifications for the
-            // primary alignment vs other alignments
-            if (ret >= 0) {
-                if (is_primary) {
-                    num_modified_bases_primary++;
-                } else if (bam_record->core.flag & BAM_FUNMAP) {
-                    num_modified_bases_unmapped++;
-                } else if (bam_record->core.flag & BAM_FSECONDARY) {
-                    num_modified_bases_secondary++;
-                } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) {
-                    num_modified_bases_supplementary++;
-                }
+        // Base modification tag analysis
+        // Follow here to get base modification tags:
+        // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c
+        // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274
+        int read_length = bam_record->core.l_qseq;
+        hts_base_mod_state *state = hts_base_mod_state_alloc();
+        std::vector<std::pair<int32_t, int>> c_modified_positions;  // C-modified positions for CpG analysis, as (reference position, strand) pairs; grouped by chromosome later
+        // std::unordered_map<char, int> base_mod_counts;  // Type-specific
+        // base modification counts for the alignment
+        // std::unordered_map<char, std::unordered_map<char, int>>
+        // base_mod_counts;  // Type-specific base modification counts
+        // (canonical base -> modified base -> count)
+        std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts;  // Type-specific base modification counts (canonical base -> modified base -> count)
+        std::unordered_map<char, int> base_primary_count;  // Total base counts for the alignment
+
+        // Parse the base modification tags if a primary alignment
+        int read_mod_count = 0;
+        int ret = bam_parse_basemod(bam_record, state);
+        bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
+
+        // Update the number of reads with base modifications for the
+        // primary alignment vs other alignments
+        if (ret >= 0) {
+            if (is_primary) {
+                num_modified_bases_primary++;
+            } else if (bam_record->core.flag & BAM_FUNMAP) {
+                num_modified_bases_unmapped++;
+            } else if (bam_record->core.flag & BAM_FSECONDARY) {
+                num_modified_bases_secondary++;
+            } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) {
+                num_modified_bases_supplementary++;
             }
+        }
 
-            if (ret >= 0 && is_primary) {
-                // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
+        if (ret >= 0 && is_primary) {
+            // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
 
-                // Get the chromosome if alignments are present
-                bool alignments_present = true;
-                std::string chr;
-                std::map<int, int> query_to_ref_map;
-                if (bam_record->core.tid < 0) {
-                    alignments_present = false;
-                } else {
-                    chr = bam_header->target_name[bam_record->core.tid];
+            // Get the chromosome if alignments are present
+            bool alignments_present = true;
+            std::string chr;
+            std::map<int, int> query_to_ref_map;
+            if (bam_record->core.tid < 0) {
+                alignments_present = false;
+            } else {
+                chr = bam_header->target_name[bam_record->core.tid];
 
-                    // Get the query to reference position mapping
-                    query_to_ref_map = this->getQueryToRefMap(bam_record);
-                }
+                // Get the query to reference position mapping
+                query_to_ref_map = this->getQueryToRefMap(bam_record);
+            }
 
-                // Get the strand from the alignment flag (hts_base_mod uses 0 for positive and 1 for negative,
-                // but it always yields 0...)
-                int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0;
+            // Get the strand from the alignment flag (hts_base_mod uses 0 for positive and 1 for negative,
+            // but it always yields 0...)
+            int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0;
 
-                // Get the number of each type of base for the read
-                uint8_t *seq = bam_get_seq(bam_record);
-                for (int i = 0; i < read_length; i++) {
-                    char base = seq_nt16_str[bam_seqi(seq, i)];
-                    base_primary_count[std::toupper(base)]++;
-                }
+            // Get the number of each type of base for the read
+            uint8_t *seq = bam_get_seq(bam_record);
+            for (int i = 0; i < read_length; i++) {
+                char base = seq_nt16_str[bam_seqi(seq, i)];
+                base_primary_count[std::toupper(base)]++;
+            }
 
-                // Iterate over the state object to get the base modification tags
-                // using bam_next_basemod
-                hts_base_mod mods[10];
-                int n = 0;
-                int32_t pos = 0;
-                std::vector<int> query_pos;
-                bool first_mod_found = false;
-                while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) {
-
-                    for (int i = 0; i < n; i++) {
-                        // Update the modified prediction counts
-                        read_mod_count++;  // Read-specific count
-                        final_output.modified_prediction_count++;  // Cumulative count
-                        char canonical_base_char = std::toupper(mods[i].canonical_base);
-                        char mod_type = mods[i].modified_base;
-                        // base_mod_counts[mod_type]++;  // Update the type-specific count
-
-                        // Note: The modified base value can be a positive char (e.g. 'm',
-                        // 'h') (DNA Mods DB) or negative integer (ChEBI ID):
-                        // https://github.com/samtools/hts-specs/issues/741
-                        // DNA Mods: https://dnamod.hoffmanlab.org/
-                        // ChEBI: https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:21839
-                        // Header line:
-                        // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2215
-
-                        // Determine the probability of the modification (-1 if
-                        // unknown)
-                        double probability = -1;
-                        if (mods[i].qual != -1) {
-                            probability = mods[i].qual / 256.0;
-
-                            // Update counts for predictions exceeding the threshold
-                            if (probability >= base_mod_threshold) {
-                                final_output.updateBaseModCounts(mod_type, strand);  // Update the base modification counts
-                                // base_mod_counts[mod_type]++;  // Update the
-                                // type-specific count
-                                base_mod_counts[canonical_base_char][mod_type]++;  // Update the type-specific count
-
-                                // Store the modified positions for later CpG
-                                // analysis if a C modification on a primary alignment
-                                if (canonical_base_char == 'C' && mod_type != 'C') {
-
-                                    // Convert the query position to reference position if available
-                                    if (alignments_present) {
-                                        if (query_to_ref_map.find(pos) != query_to_ref_map.end()) {
-                                            int32_t ref_pos = query_to_ref_map[pos];
-                                            c_modified_positions.push_back(std::make_pair(ref_pos, strand));
-                                        }
+            // Iterate over the state object to get the base modification tags
+            // using bam_next_basemod
+            hts_base_mod mods[10];
+            int n = 0;
+            int32_t pos = 0;
+            std::vector<int> query_pos;
+            bool first_mod_found = false;
+            while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) {
+
+                for (int i = 0; i < n; i++) {
+                    // Update the modified prediction counts
+                    read_mod_count++;  // Read-specific count
+                    final_output.modified_prediction_count++;  // Cumulative count
+                    char canonical_base_char = std::toupper(mods[i].canonical_base);
+                    char mod_type = mods[i].modified_base;
+                    // base_mod_counts[mod_type]++;  // Update the type-specific count
+
+                    // Note: The modified base value can be a positive char (e.g. 'm',
+                    // 'h') (DNA Mods DB) or negative integer (ChEBI ID):
+                    // https://github.com/samtools/hts-specs/issues/741
+                    // DNA Mods: https://dnamod.hoffmanlab.org/
+                    // ChEBI: https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:21839
+                    // Header line:
+                    // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2215
+
+                    // Determine the probability of the modification (-1 if
+                    // unknown)
+                    double probability = -1;
+                    if (mods[i].qual != -1) {
+                        probability = mods[i].qual / 256.0;
+
+                        // Update the read length % and probability for the
+                        // modification
+                        double read_len_pct = (double) (pos + 1) / read_length;
+                        std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl;
+                        final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability);  // Update the base modification probabilities
+
+                        // Update counts for predictions exceeding the threshold
+                        if (probability >= base_mod_threshold) {
+                            final_output.updateBaseModCounts(mod_type, strand);  // Update the base modification counts
+                            // base_mod_counts[mod_type]++;  // Update the
+                            // type-specific count
+                            // base_mod_counts[canonical_base_char][mod_type]++;  // Update the type-specific count
+
+                            // Store the modified positions for later CpG
+                            // analysis if a C modification on a primary alignment
+                            if (canonical_base_char == 'C' && mod_type != 'C') {
+
+                                // Convert the query position to reference position if available
+                                if (alignments_present) {
+                                    if (query_to_ref_map.find(pos) != query_to_ref_map.end()) {
+                                        int32_t ref_pos = query_to_ref_map[pos];
+                                        c_modified_positions.push_back(std::make_pair(ref_pos, strand));
                                     }
                                 }
                             }
-                            // } else {
-                            //     base_primary_count[mod_type]++;  // Update the type-specific unmodified count
-                            // }
                         }
+                        // } else {
+                        //     base_primary_count[mod_type]++;  // Update the type-specific unmodified count
+                        // }
                     }
                 }
+            }
 
-                // Append the modified positions to the output data
-                if (c_modified_positions.size() > 0) {
-                    // Set the atomic flag and print a message if base
-                    // modification tags are present in the file
-                    if (!this->has_mm_ml_tags.test_and_set()) {
-                        printMessage("Base modification data found (MM, ML tags)");
-                    }
-
-                    // Add the modified positions to the output data
-                    if (final_output.sample_c_modified_positions.find(chr) == final_output.sample_c_modified_positions.end()) {
-                        final_output.sample_c_modified_positions[chr] = c_modified_positions;
-                    } else {
-                        final_output.sample_c_modified_positions[chr].insert(final_output.sample_c_modified_positions[chr].end(), c_modified_positions.begin(), c_modified_positions.end());
-                    }
+            // Append the modified positions to the output data
+            if (c_modified_positions.size() > 0) {
+                // Set the atomic flag and print a message if base
+                // modification tags are present in the file
+                if (!this->has_mm_ml_tags.test_and_set()) {
+                    printMessage("Base modification data found (MM, ML tags)");
                 }
-            }
-            hts_base_mod_state_free(state);  // Deallocate the base modification state object
-
-            // Calculate the base modification rate for the read
-            // double read_mod_rate = 0.0;
-            // if (read_length > 0) {
-            //     read_mod_rate = (double) read_mod_count / read_length;
-            // }
-
-            // Calculate the type-specific base modification rates for the read
-            std::unordered_map<char, double> base_mod_rates;
-            for (auto const &it : base_mod_counts) {
-                char canonical_base = it.first;
-                std::unordered_map<char, int> mod_counts = it.second;
-                double mod_rate = 0.0;
-                int total_base_count = base_primary_count[canonical_base];
-
-                // Calculate the modification rate for each modification type
-                for (auto const &it2 : mod_counts) {
-                    char mod_type = it2.first;
-                    int mod_count = it2.second;
-                    double mod_rate = 0.0;
-                    if (mod_count + total_base_count > 0) {
-                        mod_rate = (double) mod_count / total_base_count;
-                    }
-                    base_mod_rates[mod_type] = mod_rate;
+
+                // Add the modified positions to the output data
+                if (final_output.sample_c_modified_positions.find(chr) == final_output.sample_c_modified_positions.end()) {
+                    final_output.sample_c_modified_positions[chr] = c_modified_positions;
+                } else {
+                    final_output.sample_c_modified_positions[chr].insert(final_output.sample_c_modified_positions[chr].end(), c_modified_positions.begin(), c_modified_positions.end());
                 }
-                // for (auto const &it2 : mod_counts) {
-                //     total_mod_count += it2.second;
-                // }
-                // if (total_mod_count + total_base_count > 0) {
-                //     mod_rate = (double) total_mod_count / (total_mod_count + total_base_count);
-                // }
-                // base_mod_rates[canonical_base] = mod_rate;
             }
-            // for (auto const &it : base_mod_counts) {
-            //     char mod_type = it.first;
-            //     int mod_count = it.second;
-            //     double mod_rate = 0.0;
-            //     int total_base_count = base_primary_count[mod_type];
-            //     if (mod_count + unmod_count > 0) {
-            //         mod_rate = (double) mod_count / (mod_count + unmod_count);
-            //     }
-            //     // if (read_length > 0) {
-            //     //     mod_rate = (double) mod_count / read_length;
-            //     // }
-            //     base_mod_rates[mod_type] = mod_rate;
-            // }
-            final_output.updateReadModRate(read_length, base_mod_rates);  // Update the output data
         }
+        hts_base_mod_state_free(state);  // Deallocate the base modification state object
+
+        // Calculate the base modification rate for the read
+        // double read_mod_rate = 0.0;
+        // if (read_length > 0) {
+        //     read_mod_rate = (double) read_mod_count / read_length;
+        // }
+
+        // Calculate the type-specific base modification rates for the read
+        // std::unordered_map<char, double> base_mod_rates;
+        // for (auto const &it : base_mod_counts) {
+        //     char canonical_base = it.first;
+        //     std::unordered_map<char, int> mod_counts = it.second;
+        //     double mod_rate = 0.0;
+        //     int total_base_count = base_primary_count[canonical_base];
+
+        //     // Calculate the modification rate for each modification type
+        //     for (auto const &it2 : mod_counts) {
+        //         char mod_type = it2.first;
+        //         int mod_count = it2.second;
+        //         double mod_rate = 0.0;
+        //         if (mod_count + total_base_count > 0) {
+        //             mod_rate = (double) mod_count / total_base_count;
+        //         }
+        //         base_mod_rates[mod_type] = mod_rate;
+        //     }
+        //     // for (auto const &it2 : mod_counts) {
+        //     //     total_mod_count += it2.second;
+        //     // }
+        //     // if (total_mod_count + total_base_count > 0) {
+        //     //     mod_rate = (double) total_mod_count / (total_mod_count + total_base_count);
+        //     // }
+        //     // base_mod_rates[canonical_base] = mod_rate;
+        // }
+        // for (auto const &it : base_mod_counts) {
+        //     char mod_type = it.first;
+        //     int mod_count = it.second;
+        //     double mod_rate = 0.0;
+        //     int total_base_count = base_primary_count[mod_type];
+        //     if (mod_count + unmod_count > 0) {
+        //         mod_rate = (double) mod_count / (mod_count + unmod_count);
+        //     }
+        //     // if (read_length > 0) {
+        //     //     mod_rate = (double) mod_count / read_length;
+        //     // }
+        //     base_mod_rates[mod_type] = mod_rate;
+        // }
+        // final_output.updateReadModRate(read_length, base_mod_rates);  // Update the output data
     }
 
     // Summary of base modification counts
-    if (mod_analysis) {
-        printMessage("Base modification counts:");
-        printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary));
-        printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped));
-        printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary));
-        printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary));
-    }
+    printMessage("Base modification counts:");
+    printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary));
+    printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped));
+    printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary));
+    printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary));
 
     bam_destroy1(bam_record);
     bam_hdr_destroy(bam_header);
     sam_close(bam_file);
-
-    return num_reads;
 }
 
 // Get the mapping of query positions to reference positions for a given alignment record
diff --git a/src/output_data.cpp b/src/output_data.cpp
index 148fbeb..edf85d2 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -290,6 +290,12 @@ void Output_BAM::updateBaseModCounts(char mod_type, int strand)
     }
 }
 
+void Output_BAM::updateBaseModProbabilities(char mod_type, double pct_len, double probability)
+{
+    // Update the base modification probabilities
+    this->read_pct_len_vs_mod_prob[mod_type].push_back(std::make_pair(pct_len, probability));
+}
+
 void Output_BAM::updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates) {
     ReadModData read_mod_data;
     read_mod_data.read_length = read_length;
@@ -357,6 +363,40 @@ uint64_t Output_BAM::getModTypeCount(char mod_type, int strand)
     }
 }
 
+double Output_BAM::getNthReadLenPct(int read_index, char mod_type)
+{
+    double read_len_pct = 0.0;
+    try {
+        this->read_pct_len_vs_mod_prob.at(mod_type);
+    } catch (const std::out_of_range& oor) {
+        std::cerr << "Error: Read length percentage not found for type " << mod_type << std::endl;
+    }
+    try {
+        read_len_pct = this->read_pct_len_vs_mod_prob[mod_type].at(read_index).first;
+    } catch (const std::out_of_range& oor) {
+        std::cerr << "Error: Read length percentage not found for read index " << read_index << " and type " << mod_type << std::endl;
+        return 0.0;
+    }
+    return read_len_pct;
+}
+
+double Output_BAM::getNthReadModProb(int read_index, char mod_type)
+{
+    double mod_prob = 0.0;
+    try {
+        this->read_pct_len_vs_mod_prob.at(mod_type);
+    } catch (const std::out_of_range& oor) {
+        std::cerr << "Error: Modification probability not found for type " << mod_type << std::endl;
+    }
+    try {
+        mod_prob = this->read_pct_len_vs_mod_prob[mod_type].at(read_index).second;
+    } catch (const std::out_of_range& oor) {
+        std::cerr << "Error: Modification probability not found for read index " << read_index << " and type " << mod_type << std::endl;
+        return 0.0;
+    }
+    return mod_prob;
+}
+
 int Output_BAM::getReadCount()
 {
     return this->read_move_table.size();
diff --git a/src/plot_utils.py b/src/plot_utils.py
index f9b8239..f6bc76f 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -1059,33 +1059,32 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         logging.info("Getting base modification statistics")
 
         # Get the read length vs. base modification rate data for each
-        # modification type
-        logging.info("Getting mod data size")
-        read_mod_data_size = output_data.getReadModDataSize()
-        logging.info("Mod data size: {}".format(read_mod_data_size))
-
-        # Choose a maximum of 10,000 reads to randomly sample for the plot
-        max_reads = min(read_mod_data_size, 10000)        
-        # read_indices = set(sample(range(read_mod_data_size), max_reads))
-        read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False)
-        read_length_mod_rates = {}
-
-        # Get the read length vs. base modification rate data for each
-        # modification type in the sampled reads
-        for i in read_indices:
-            for mod_type in base_mod_types:
-                if mod_type not in read_length_mod_rates:
-                    read_length_mod_rates[mod_type] = []
-
-                # logging.info("Getting read length for read {}".format(i))
-                # read_length = output_data.getNthReadModLength(i)
-                read_length = output_data.getNthReadModLength(int(i))
-                # logging.info("Getting read length vs. {} modification rate".format(mod_type))
-                # mod_rate = output_data.getNthReadModRate(i, mod_type)
-                mod_rate = output_data.getNthReadModRate(int(i), mod_type)
-                # logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate))
-                read_length_mod_rates[mod_type].append((read_length, mod_rate))
-
+        # # modification type
+        # logging.info("Getting mod data size")
+        # read_mod_data_size = output_data.getReadModDataSize()
+        # logging.info("Mod data size: {}".format(read_mod_data_size))
+
+        # # Choose a maximum of 10,000 reads to randomly sample for the plot
+        # max_reads = min(read_mod_data_size, 10000)        
+        # # read_indices = set(sample(range(read_mod_data_size), max_reads))
+        # read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False)
+        # read_length_mod_rates = {}
+
+        # Get the read length (%) vs. base modification probability data for
+        # each sampled read
+        sample_count = 10000
+        read_len_pct = []
+        mod_prob = []
+        for mod_type in base_mod_types:
+            for i in range(sample_count):
+                try:
+                    pct = output_data.getNthReadLenPct(i, mod_type)
+                    prob = output_data.getNthReadModProb(i, mod_type)
+                    read_len_pct.append(pct)
+                    mod_prob.append(prob)
+                except Exception as e:
+                    logging.error(f"Error getting read length vs. base modification probability data: {e}")
+        
         # Dictionary of modification character to full name
         mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \
                             'g': '5hmU', 'e': '5fu', 'b': '5caU', \
@@ -1094,78 +1093,46 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                             'N': 'Amb. N', \
                             'v': 'pseU'}
 
-        # Create a plot of read length vs. base modification rate for each
-        # modification type
-        # Make subplots vertically for each modification type
-        subplot_titles = []
-        for mod_type in base_mod_types:
-            try:
-                mod_name = mod_char_to_name[mod_type]
-            except KeyError:
-                logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
-                mod_name = mod_type
+        # Create a plot of pct read length vs. base modification probability for
+        # each modification type, as well as a histogram of the average base
+        # modification probability for 100 bins of the read length
+
+        # Make a subplot of two columns for the read length vs. base
+        # modification probability and the histogram of the average base
+        # modification probability for each modification type
+        fig = make_subplots(rows=len(base_mod_types), cols=2, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=[f"{mod_char_to_name[mod_type]} Modification Probability" for mod_type in base_mod_types])
 
-            subplot_titles.append('Read Length vs. {} Modification Rate'.format(mod_name))
-            
-            
-        fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=subplot_titles)
-        min_x = float('inf')
-        max_x = 0
         for i, mod_type in enumerate(base_mod_types):
+            logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}")
 
-            # Format the data
-            mod_data = read_length_mod_rates[mod_type]
-            mod_rates = [data[1] * 100 for data in mod_data]
-            x_vals = [data[0] for data in mod_data]
-
-            # Remove outlier read lengths using the IQR method
-            if len(x_vals) > 1:
-                x_vals_np = np.array(x_vals)
-                q1 = np.percentile(x_vals_np, 25)
-                q3 = np.percentile(x_vals_np, 75)
-                iqr = q3 - q1
-                lower_bound = q1 - 1.5 * iqr
-                upper_bound = q3 + 1.5 * iqr
-
-                # Filter the data to remove outliers
-                filtered_data = [(x, y) for x, y in zip(x_vals, mod_rates) if lower_bound <= x <= upper_bound]
-                x_vals, mod_rates = zip(*filtered_data)
-
-            # Normalize the read lengths to the maximum read length (0-100)
-            x_vals = [100 * x / max(x_vals) for x in x_vals]
-            
-            # Use 0-100 for the x-axis ticks and labels
-            x_tick_values = np.arange(0, 101, 10)
-            x_tick_labels = ['{:,}%'.format(int(val)) for val in x_tick_values]
-
-            # Get the modification name
-            try:
-                mod_name = mod_char_to_name[mod_type]
-            except KeyError:
-                logging.warning("WARNING: Unknown modification type: {}".format(mod_type))
-                mod_name = mod_type
-
-            # fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1)
-
-            # Create a heatmap plot
-            fig.add_trace(go.Histogram2dContour(x=x_vals, y=mod_rates, colorscale='Viridis', showlegend=False), row=i + 1, col=1)
-
-            # Update the layout
-            x_axis_name = get_axis_name(i)
-            y_axis_name = get_axis_name(i, 'y')
-            logging.info("Index: {}, Y index: {}".format(i, y_axis_name))
+            # Add the trace for the read length vs. base modification
+            # probability scatter plot
+            fig.add_trace(go.Scatter
+                (x=read_len_pct, y=mod_prob, mode='markers', name=mod_char_to_name[mod_type], marker=dict(size=5), showlegend=False),
+                row=i + 1, col=1)
             
-            # Auto range the axes
-            fig.update_layout(
-                **{f"{x_axis_name}_title": 'Normalized Read Length (%)',
-                    f"{y_axis_name}_title": 'Modification Rate (%)'},
-                **{f"{x_axis_name}_tickmode": 'array',
-                    f"{x_axis_name}_tickvals": x_tick_values,
-                    f"{x_axis_name}_ticktext": x_tick_labels},
-                **{f"{y_axis_name}_range": [0, 100]}
-            )
-            
-        fig.update_layout(font=dict(size=PLOT_FONT_SIZE))
+            # Add a bar plot of the average base modification probability for
+            # 100 bins of the read length
+            bins = np.linspace(0, 100, 101)
+            bin_indices = np.digitize(read_len_pct, bins)
+            avg_prob_per_bin = np.zeros(100)
+            bin_centers = (bins[:-1] + bins[1:]) / 2
+
+            for j in range(100):
+                bin_mask = bin_indices == j
+                avg_prob_per_bin[j] = np.mean([mod_prob[k] for k in range(len(read_len_pct)) if bin_mask[k]])
+
+            # Create the bar plot
+            fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2)
+
+            # Update the plot style
+            fig.update_xaxes(title="Read Length (%)", row=i + 1, col=1)
+            fig.update_yaxes(title="Modification Probability", row=i + 1, col=1)
+            fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2)
+            fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2)
+
+        # Update the plot layout
+        fig.update_layout(title="Read Length vs. Base Modification Probability", font=dict(size=PLOT_FONT_SIZE))
             
         # Generate the HTML
         if len(base_mod_types) > 0:
diff --git a/src/tin.cpp b/src/tin.cpp
index 0c2d0c4..3beae15 100644
--- a/src/tin.cpp
+++ b/src/tin.cpp
@@ -171,7 +171,7 @@ bool checkMinReads(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::st
     return min_reads_met;
 }
 
-void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder)
+void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count)
 {
     std::cout << "Using TIN minimum coverage " << min_cov << " and sample size " << sample_size << std::endl;
 
@@ -182,6 +182,9 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
         exit(1);
     }
 
+    // Enable multi-threading
+    hts_set_threads(bam_file, thread_count);
+
     // Read the BAM header
     bam_hdr_t* header = sam_hdr_read(bam_file);
     if (header == NULL) {
@@ -206,6 +209,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
 
     // Loop through the gene BED file and calculate the TIN score for each
     // transcript
+    std::cout << "Calculating TIN scores for each transcript..." << std::endl;
     std::vector<double> TIN_scores;
     std::vector<std::string> gene_ids;
     std::string line;
@@ -396,6 +400,11 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
 
         // Store the TIN score for the transcript
         tin_map[name] = std::make_tuple(chrom, start, end, TIN);
+
+        // Log every 1000 transcripts
+        if (gene_ids.size() % 1000 == 0) {
+            std::cout << "Processed " << gene_ids.size() << " transcripts" << std::endl;
+        }
     }
 
     // Close the BAM file
@@ -413,6 +422,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s
     if (TIN_scores.size() == 0) {
         std::cerr << "No TIN scores calculated" << std::endl;
     } else {
+        std::cout << "Calculating TIN summary for " << TIN_scores.size() << " transcripts..." << std::endl;
 
         // Print the TIN mean, median, and standard deviation
         double TIN_sum = 0;

From 1607fb075fc537bb1b98db2f13abdd4dbfeef74d Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 15 Jan 2025 18:36:23 -0500
Subject: [PATCH 15/25] Fix bam cleanup error

---
 src/hts_reader.cpp | 21 +++++++++++++--
 src/plot_utils.py  | 64 +++++++++++++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 01e0524..f46606d 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -12,6 +12,7 @@ Class for reading a set number of records from a BAM file. Used for multi-thread
 #include <fstream>
 #include <math.h>
 #include <algorithm>  // std::find
+#include <random>
 #include <htslib/sam.h>
 
 #include "utils.h"
@@ -361,6 +362,11 @@ int HTSReader::getNumRecords(const std::string& bam_filename, int thread_count)
         num_reads++;
     }
 
+    // Close the BAM file
+    bam_destroy1(bam_record);
+    bam_hdr_destroy(bam_header);
+    sam_close(bam_file);
+
     return num_reads;
 }
 
@@ -372,15 +378,26 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
     bam1_t* bam_record = bam_init1();
     int64_t num_reads = 0;
 
+    // Create a random number generator and seed it with the current time
+    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+    std::default_random_engine generator(seed);
+
     // Create a list of read indices to sample, and only keep the first
     // sample_count reads
     std::vector<int> read_indices;
     for (int i = 0; i < read_count; i++) {
         read_indices.push_back(i);
     }
-    std::random_shuffle(read_indices.begin(), read_indices.end());
+    std::shuffle(read_indices.begin(), read_indices.end(), generator);
     read_indices.resize(sample_count);
 
+    // Print first 100 read indices sorted
+    // std::sort(read_indices.begin(), read_indices.end());
+    // std::cout << "First 100 read indices: " << std::endl;
+    // for (int i = 0; i < 100; i++) {
+    //     std::cout << read_indices[i] << std::endl;
+    // }
+
     // Convert to a set for fast lookup
     std::unordered_set<int> read_indices_set(read_indices.begin(), read_indices.end());
 
@@ -496,7 +513,7 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
                         // Update the read length % and probability for the
                         // modification
                         double read_len_pct = (double) (pos + 1) / read_length;
-                        std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl;
+                        // std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl;
                         final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability);  // Update the base modification probabilities
 
                         // Update counts for predictions exceeding the threshold
diff --git a/src/plot_utils.py b/src/plot_utils.py
index f6bc76f..7686d1f 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -1084,6 +1084,10 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                     mod_prob.append(prob)
                 except Exception as e:
                     logging.error(f"Error getting read length vs. base modification probability data: {e}")
+
+        # Convert the lists to numpy arrays
+        read_len_pct = np.array(read_len_pct) * 100  # Convert to percentage
+        mod_prob = np.array(mod_prob)
         
         # Dictionary of modification character to full name
         mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \
@@ -1111,25 +1115,53 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                 (x=read_len_pct, y=mod_prob, mode='markers', name=mod_char_to_name[mod_type], marker=dict(size=5), showlegend=False),
                 row=i + 1, col=1)
             
+            # Print the first 50 pairs sorted by read length for debugging
+            # read_len_pct, mod_prob = zip(*sorted(zip(read_len_pct, mod_prob)))
+            # if i == 0:
+            #     for j in range(50):
+            #         logging.info(f"Read length: {read_len_pct[j]}, Modification probability: {mod_prob[j]}")
+            
+            # # Create a histogram of the base modification probabilities
+            # base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20)
+            # fig.add_trace(base_mod_prob_hist, row=i + 1, col=2)
+            
             # Add a bar plot of the average base modification probability for
             # 100 bins of the read length
-            bins = np.linspace(0, 100, 101)
-            bin_indices = np.digitize(read_len_pct, bins)
-            avg_prob_per_bin = np.zeros(100)
-            bin_centers = (bins[:-1] + bins[1:]) / 2
-
-            for j in range(100):
-                bin_mask = bin_indices == j
-                avg_prob_per_bin[j] = np.mean([mod_prob[k] for k in range(len(read_len_pct)) if bin_mask[k]])
-
-            # Create the bar plot
-            fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2)
+            # bins = np.linspace(0, 100, 11)  # 10 bins (0-10%, 10-20%, ..., 90-100%)
+            # bin_centers = (bins[:-1] + bins[1:]) / 2  # Bin centers for plotting
+
+            # # Get the average probability per bin
+            # avg_prob_per_bin = np.zeros(10)
+            # bin_indices = np.digitize(read_len_pct, bins) - 1
+            # for j in range(10):  # Loop over bins
+            #     bin_mask = (bin_indices == j)
+            #     if np.any(bin_mask):
+            #         avg_prob_per_bin[j] = np.mean(mod_prob[bin_mask])
+            #         logging.info(f"Bin {j}: {avg_prob_per_bin[j]}")
+
+            # # Create the bar plot
+
+            # # Print the bins and read length percentages for the first 10 reads
+            # # for debugging
+            # if i == 0:
+            #     logging.info("Bins: {}".format(bins))
+            #     logging.info("Bin indices: {}".format(bin_indices[:10]))
+            #     logging.info("Read length percentages: {}".format(read_len_pct[:10]))
+
+            # # Create the bar plot
+            # fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2)
 
             # Update the plot style
             fig.update_xaxes(title="Read Length (%)", row=i + 1, col=1)
             fig.update_yaxes(title="Modification Probability", row=i + 1, col=1)
-            fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2)
-            fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2)
+            fig.update_xaxes(title="Modification Probability", row=i + 1, col=2)
+            fig.update_yaxes(title="Frequency", row=i + 1, col=2)
+            # fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2)
+            # fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2)
+
+            # Set the range of the y-axis to 0-1
+            fig.update_yaxes(range=[0, 1], row=i + 1, col=1)
+            # fig.update_yaxes(range=[0, 1], row=i + 1, col=2)
 
         # Update the plot layout
         fig.update_layout(title="Read Length vs. Base Modification Probability", font=dict(size=PLOT_FONT_SIZE))
@@ -1137,12 +1169,14 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         # Generate the HTML
         if len(base_mod_types) > 0:
             plot_height = 500 * len(base_mod_types)
+            plot_width = 700 * 2
             logging.info("Saving the read length vs. modification rates plot")
-            plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=700)
+            plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=plot_width)
     else:
         logging.warning("WARNING: No modification types found")
 
-    # Create the base modification statistics table
+    // Create the base modification statistics table
+    logging.info("Creating the base modification statistics table")
     table_str = "<table>\n<tbody>"
     row_str, row_flag = format_row("Total Unfiltered Predictions", [output_data.modified_prediction_count], 'int', None)
     table_str += row_str

From 3e2efdd0853cbefae9a660d6d1eca7cead0f410f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 16 Jan 2025 16:17:55 -0500
Subject: [PATCH 16/25] Fix Q-score distribution

---
 src/fastq_module.cpp | 66 ++++++++++++++++++++++++++++++++++++++------
 src/output_data.cpp  | 43 +++++++++++++++++++++++------
 src/plot_utils.py    |  6 ++--
 3 files changed, 96 insertions(+), 19 deletions(-)

diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp
index e16dadd..44e70bb 100644
--- a/src/fastq_module.cpp
+++ b/src/fastq_module.cpp
@@ -9,6 +9,7 @@
 
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -20,7 +21,7 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
     int exit_code = 0;
     int read_len;
     double read_gc_cnt;
-    double read_mean_base_qual;
+    // double read_mean_base_qual;
     Basic_Seq_Statistics &long_read_info = output_data.long_read_info;
     Basic_Seq_Quality_Statistics &seq_quality_info = output_data.seq_quality_info;
     long_read_info.total_num_reads = ZeroDefault; // total number of long reads
@@ -62,10 +63,33 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                 // Store the read length
                 long_read_info.read_lengths.push_back(read_len);
 
+                // Access base quality data
+                // printMessage("[TEST1] Base quality string: " + raw_read_qual);
+                char value;
+                std::vector<int> base_quality_values;
+                // std::string base_quality_str = raw_read_qual;
+                std::istringstream iss(raw_read_qual);
+                while (iss >> value)
+                {
+                    int base_quality_value = value - '!';
+                    base_quality_values.push_back(base_quality_value);
+                    // printMessage("[TEST1] Base quality value: " + std::to_string(base_quality_value));
+                }
+
+                // Ensure that the base quality string has the same length as
+                // the read sequence
+                if (base_quality_values.size() != read_len)
+                {
+                    printError("Error: Base quality string length does not match read sequence length");
+                    exit_code = 1;
+                    break;
+                }
+
                 // Process base and quality information
                 read_gc_cnt = 0;
-                read_mean_base_qual = 0;
-                uint64_t base_quality_value;
+                // read_mean_base_qual = 0;
+                int base_quality_value;
+                double cumulative_base_prob = 0;  // Read cumulative base quality probability
                 for (int i = 0; i < read_len; i++)
                 {
                     if (read_seq[i] == 'A' || read_seq[i] == 'a')
@@ -86,15 +110,30 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                     {
                         long_read_info.total_tu_cnt += 1;
                     }
-                    base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset;
+
+                    // Get the base quality (Phred) value
+                    base_quality_value = base_quality_values[i];
+                    // base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset;
                     try {
                         seq_quality_info.base_quality_distribution[base_quality_value] += 1;
                     } catch (const std::out_of_range& oor) {
                         printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value");
                     }
-                    read_mean_base_qual += (double) base_quality_value;
+                    // read_mean_base_qual += (double) base_quality_value;
+
+                    // Convert the Phred quality value to a probability
+                    double base_quality_prob = pow(10, -base_quality_value / 10.0);
+                    cumulative_base_prob += base_quality_prob;
                 }
 
+                // Calculate the mean base quality probability
+                cumulative_base_prob /= (double)read_len;
+
+                // Convert the mean base quality probability to a Phred quality
+                // value
+                double read_mean_base_qual = -10.0 * log10(cumulative_base_prob);
+                // printMessage("Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual));
+
                 // Update the per-read GC content distribution
                 double gc_content_pct = (100.0 * read_gc_cnt) / static_cast<double>(read_len);
                 int gc_content_int = static_cast<int>(std::round(gc_content_pct));
@@ -105,13 +144,24 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                 }
                 
                 // Update the per-read base quality distribution
-                double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len);
-                unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct));
+                // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len);
+                // unsigned int read_mean_base_qual_int = static_cast<unsigned
+                // int>(std::round(read_mean_base_qual_pct));
+                int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual));
+
+                // printMessage("Rounded Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual_int));
+
                 try {
-                    seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
+                    seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1;
                 } catch (const std::out_of_range& oor) {
                     printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
                 }
+                
+                // try {
+                //     seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
+                // } catch (const std::out_of_range& oor) {
+                //     printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
+                // }
 
                 fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual);  // Write to file
             }
diff --git a/src/output_data.cpp b/src/output_data.cpp
index edf85d2..2401847 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -707,11 +707,17 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
         base_quality_values.push_back(base_quality_value);
     }
 
+    // Ensure the base quality values match the sequence length
+    if (base_quality_values.size() != base_count) {
+        printError("Warning: Base quality values do not match the sequence length for read ID " + std::string(read_name));
+    }
+
     // Update the base quality and GC content information
     int gc_count = 0;
-    double read_mean_base_qual = 0;
+    // double read_mean_base_qual = 0;
+    double cumulative_base_prob = 0;  // Read cumulative base quality probability
     char current_base;
-    uint64_t base_quality_value;
+    int base_quality_value;
     for (int i = 0; i < base_count; i++)
     {
         current_base = sequence_data_str[i];
@@ -733,16 +739,30 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
         {
             long_read_info.total_tu_cnt += 1;
         }
-        // Get the base quality
-        base_quality_value = (uint64_t)base_quality_values[i];
+        // Get the base quality (Phred) value
+        base_quality_value = base_quality_values[i];
+
+        // Update the per-base quality distribution
         try {
             seq_quality_info.base_quality_distribution[base_quality_value] += 1;
         } catch (const std::out_of_range& oor) {
             printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value");
         }
-        read_mean_base_qual += (double)base_quality_value;
+
+        // Convert the Phred quality value to a probability
+        double base_quality_prob = pow(10, -base_quality_value / 10.0);
+        // read_mean_base_qual += (double)base_quality_value;
+        cumulative_base_prob += base_quality_prob;
     }
 
+    // Calculate the mean base quality probability
+    cumulative_base_prob /= (double)base_count;
+
+    // Convert the mean base quality probability to a Phred quality value
+    double read_mean_base_qual = -10.0 * log10(cumulative_base_prob);
+
+    // printMessage("Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual));
+
     // Calculate percent guanine & cytosine
     // gc_content_pct = 100.0 *( (double)gc_count / (double)base_count );
 
@@ -756,10 +776,17 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
     }
 
     // Update the per-read base quality distribution
-    double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count);
-    unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct));
+    // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count);
+    // unsigned int read_mean_base_qual_int = static_cast<unsigned
+    // int>(std::round(read_mean_base_qual_pct));
+    int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual));
+
+    // printMessage("Rounded Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual_int));
+
     try {
-        seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
+        // seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int]
+        // += 1;
+        seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1;
     } catch (const std::out_of_range& oor) {
         printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
     }
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 7686d1f..472f935 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -1121,9 +1121,9 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
             #     for j in range(50):
             #         logging.info(f"Read length: {read_len_pct[j]}, Modification probability: {mod_prob[j]}")
             
-            # # Create a histogram of the base modification probabilities
-            # base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20)
-            # fig.add_trace(base_mod_prob_hist, row=i + 1, col=2)
+            # Create a histogram of the base modification probabilities
+            base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20)
+            fig.add_trace(base_mod_prob_hist, row=i + 1, col=2)
             
             # Add a bar plot of the average base modification probability for
             # 100 bins of the read length

From e9006da117abf33227b45be46787130031c43797 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 20 Jan 2025 15:11:42 -0500
Subject: [PATCH 17/25] Add bam read avg base quality plots

---
 README.md             |   7 +-
 include/hts_reader.h  |   2 +-
 include/output_data.h |   2 +-
 src/bam_module.cpp    |  23 ++----
 src/cli.py            |   4 +-
 src/fast5_module.cpp  |  12 +---
 src/fastq_module.cpp  |  41 ++++-------
 src/hts_reader.cpp    | 158 +++++++++++-------------------------------
 src/output_data.cpp   |  37 +++-------
 src/plot_utils.py     | 122 ++++++++------------------------
 10 files changed, 110 insertions(+), 298 deletions(-)

diff --git a/README.md b/README.md
index 1ce0df6..8184491 100644
--- a/README.md
+++ b/README.md
@@ -258,7 +258,12 @@ longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY
 # ONT POD5
 
 This section describes how to generate QC reports for ONT POD5 (signal) files and their corresponding basecalled BAM files (data shown is HG002 using ONT
-R10.4.1 and LSK114 downloaded from the tutorial https://github.com/epi2me-labs/wf-basecalling).
+R10.4.1 and LSK114 downloaded from the tutorial
+https://github.com/epi2me-labs/wf-basecalling).
+
+> [!NOTE]
+> This requires generating basecalled BAM files with the move table output. For
+> example, for [dorado](https://github.com/nanoporetech/dorado), the parameter is `--emit-moves`
 
 ![image](https://github.com/user-attachments/assets/62c3c810-5c1a-4124-816b-74245af8b57c)
 
diff --git a/include/hts_reader.h b/include/hts_reader.h
index 5d3a628..bc8f5d8 100644
--- a/include/hts_reader.h
+++ b/include/hts_reader.h
@@ -38,7 +38,7 @@ class HTSReader {
         bool reading_complete = false;
 
         // Update read and base counts
-        int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution, bool is_primary);
+        int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, Basic_Seq_Quality_Statistics& seq_quality_info, bool is_primary);
 
         // Read the next batch of records from the BAM file
         int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold);
diff --git a/include/output_data.h b/include/output_data.h
index bca521f..2e1610d 100644
--- a/include/output_data.h
+++ b/include/output_data.h
@@ -79,7 +79,7 @@ class Basic_Seq_Quality_Statistics
    //std::vector<uint64_t> base_quality_distribution;
    // Array of base quality distribution initialized to 0
    uint64_t base_quality_distribution[MAX_BASE_QUALITY] = {ZeroDefault};
-   std::vector<int> read_average_base_quality_distribution;
+   std::vector<int> read_average_base_quality_distribution;  // Read average base quality distribution
    int min_base_quality = MoneDefault; // minimum base quality;
    int max_base_quality = MoneDefault; // maximum base quality;
    std::vector<int> pos_quality_distribution;
diff --git a/src/bam_module.cpp b/src/bam_module.cpp
index d2a96eb..0014509 100644
--- a/src/bam_module.cpp
+++ b/src/bam_module.cpp
@@ -154,41 +154,27 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_
 
          // Calculate statistics in batches
          printMemoryUsage("Before batch processing");
-
-        // TEST
-        // int max_reads = 10;
-        // int current_reads = 0;
          
          while (reader.hasNextRecord()){
-        // while (current_reads < max_reads && reader.hasNextRecord()){
             // Read the next batch of records
-            std::cout << "Generating " << thread_count << " thread(s)..." << std::endl;
+            // std::cout << "Generating " << thread_count << " thread(s)..." <<
+            // std::endl;
+            printMessage("Generating " + std::to_string(thread_count) + " thread(s)...");
             std::vector<std::thread> thread_vector;
             for (int thread_index=0; thread_index<thread_count; thread_index++){
-
-                // Copy the input read IDs to a new vector
                 std::unordered_set<std::string> rrms_read_ids_copy = input_params.rrms_read_ids;
-
-                // Create a thread
                 std::thread t((BAM_Module::batchStatistics), std::ref(reader), batch_size, rrms_read_ids_copy,std::ref(final_output), std::ref(bam_mutex), std::ref(output_mutex), std::ref(cout_mutex), base_mod_threshold);
-
-                // Add the thread to the vector
                 thread_vector.push_back(std::move(t));
             }
 
             // Join the threads in thread_vector
-            std::cout<<"Joining threads..."<<std::endl;
+            // std::cout<<"Joining threads..."<<std::endl;
             int thread_index = 0;
             for (auto& t : thread_vector){
-                // Join the thread if it is joinable
                 if (t.joinable()){
                     t.join();
                 }
-                printMemoryUsage("After thread " + std::to_string(thread_index));
                 thread_index++;
-
-                // TEST - Increment the current reads
-                // current_reads += batch_size;
             }
             std::cout << "All threads joined." << std::endl;
         }
@@ -257,6 +243,7 @@ void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unorder
 {
     // Read the next N records
     Output_BAM record_output;
+    printMessage("Reading next batch of records... " + std::to_string(batch_size));
     reader.readNextRecords(batch_size, record_output, bam_mutex, read_ids, base_mod_threshold);
 
     // Update the final output
diff --git a/src/cli.py b/src/cli.py
index ebf10d6..80b23ba 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -165,7 +165,7 @@ def fq_module(margs):
 
 
 def fa_module(margs):
-    # Run the FASTA filetype module.
+    """FASTA file input module."""
 
     # Get the filetype-specific parameters
     param_dict = get_common_param(margs)
@@ -253,7 +253,7 @@ def bam_module(margs):
             plot_filepaths = plot(bam_output, param_dict, 'BAM')
 
             # Set the list of QC information to display
-            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"]
+            qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", "read_avg_base_quality"]
 
             # If base modifications were found, add the base modification plots
             # after the first table
diff --git a/src/fast5_module.cpp b/src/fast5_module.cpp
index ceab46f..11b2909 100644
--- a/src/fast5_module.cpp
+++ b/src/fast5_module.cpp
@@ -470,12 +470,6 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat
 {
     int exit_code = 0;
 
-//    // Open the CSV files
-//    std::ofstream raw_csv;
-//    raw_csv.open(signal_raw_csv);
-//    std::ofstream qc_csv;
-//    qc_csv.open(signal_qc_csv);
-
     // Run QC on the HDF5 file
     //H5::Exception::dontPrint();  // Disable error printing
     try {
@@ -554,11 +548,7 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat
     catch (std::exception& e) {
         std::cerr << "Exception caught : " << e.what() << std::endl;
     }
-
-//    // Close the CSV files
-//    raw_csv.close();
-//    qc_csv.close();
-
+    
     return exit_code;
 }
 
diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp
index 44e70bb..3d87a32 100644
--- a/src/fastq_module.cpp
+++ b/src/fastq_module.cpp
@@ -21,13 +21,13 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
     int exit_code = 0;
     int read_len;
     double read_gc_cnt;
-    // double read_mean_base_qual;
     Basic_Seq_Statistics &long_read_info = output_data.long_read_info;
     Basic_Seq_Quality_Statistics &seq_quality_info = output_data.seq_quality_info;
     long_read_info.total_num_reads = ZeroDefault; // total number of long reads
     long_read_info.longest_read_length = ZeroDefault; // the length of longest reads
 
     std::ifstream input_file_stream(input_file);
+    int count = 0;
     if (!input_file_stream.is_open())
     {
         fprintf(stderr, "Failed to open file for reading: %s\n", input_file);
@@ -38,6 +38,7 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
         {
             if (line[0] == '@')
             {
+            	count++;
                 read_name = line.substr(1);
                 read_name = read_name.substr(0, read_name.find_first_of(" \t"));
                 std::getline(input_file_stream, read_seq);
@@ -64,16 +65,13 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                 long_read_info.read_lengths.push_back(read_len);
 
                 // Access base quality data
-                // printMessage("[TEST1] Base quality string: " + raw_read_qual);
                 char value;
                 std::vector<int> base_quality_values;
-                // std::string base_quality_str = raw_read_qual;
                 std::istringstream iss(raw_read_qual);
                 while (iss >> value)
                 {
                     int base_quality_value = value - '!';
                     base_quality_values.push_back(base_quality_value);
-                    // printMessage("[TEST1] Base quality value: " + std::to_string(base_quality_value));
                 }
 
                 // Ensure that the base quality string has the same length as
@@ -87,7 +85,6 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
 
                 // Process base and quality information
                 read_gc_cnt = 0;
-                // read_mean_base_qual = 0;
                 int base_quality_value;
                 double cumulative_base_prob = 0;  // Read cumulative base quality probability
                 for (int i = 0; i < read_len; i++)
@@ -113,13 +110,11 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
 
                     // Get the base quality (Phred) value
                     base_quality_value = base_quality_values[i];
-                    // base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset;
                     try {
                         seq_quality_info.base_quality_distribution[base_quality_value] += 1;
                     } catch (const std::out_of_range& oor) {
                         printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value");
                     }
-                    // read_mean_base_qual += (double) base_quality_value;
 
                     // Convert the Phred quality value to a probability
                     double base_quality_prob = pow(10, -base_quality_value / 10.0);
@@ -132,7 +127,14 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                 // Convert the mean base quality probability to a Phred quality
                 // value
                 double read_mean_base_qual = -10.0 * log10(cumulative_base_prob);
-                // printMessage("Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual));
+
+                // Update the per-read base quality distribution
+                int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual));
+                try {  // NOTE(review): operator[] does not throw std::out_of_range unless this is a checked container — verify, or use an explicit bounds check
+                    seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
+                } catch (const std::out_of_range& oor) {
+                    printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
+                }
 
                 // Update the per-read GC content distribution
                 double gc_content_pct = (100.0 * read_gc_cnt) / static_cast<double>(read_len);
@@ -142,28 +144,9 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out
                 } catch (const std::out_of_range& oor) {
                     printError("Warning: Invalid GC content value " + std::to_string(gc_content_int));
                 }
-                
-                // Update the per-read base quality distribution
-                // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len);
-                // unsigned int read_mean_base_qual_int = static_cast<unsigned
-                // int>(std::round(read_mean_base_qual_pct));
-                int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual));
-
-                // printMessage("Rounded Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual_int));
 
-                try {
-                    seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1;
-                } catch (const std::out_of_range& oor) {
-                    printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
-                }
-                
-                // try {
-                //     seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1;
-                // } catch (const std::out_of_range& oor) {
-                //     printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
-                // }
-
-                fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual);  // Write to file
+                // Write read details to file
+                fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual);
             }
         }
         input_file_stream.close();
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index f46606d..1a7e53f 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -36,7 +36,7 @@ HTSReader::~HTSReader(){
 }
 
 // Update read and base counts
-int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution, bool is_primary) {
+int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, Basic_Seq_Quality_Statistics& seq_quality_info, bool is_primary) {
 
     // Update read QC
     basic_qc.total_num_reads++;  // Update the total number of reads
@@ -47,11 +47,16 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& bas
     // Get base counts, quality, and GC content
     double read_gc_count = 0.0;  // For GC content calculation
     double read_base_total = 0.0;  // For GC content calculation
+    double cumulative_base_prob = 0.0;  // For mean base quality probability calculation
     uint8_t *seq = bam_get_seq(record);
     for (int i = 0; i < read_length; i++) {
         // Get the base quality and update the base quality histogram
-        uint64_t base_quality = (uint64_t)bam_get_qual(record)[i];
-        base_quality_distribution[base_quality]++;
+        int base_quality = (int)bam_get_qual(record)[i];
+        seq_quality_info.base_quality_distribution[(uint64_t)base_quality]++;
+
+        // Convert the Phred quality value to a probability
+        double base_quality_prob = pow(10, -base_quality / 10.0);
+        cumulative_base_prob += base_quality_prob;
 
         // Get the base and update the base count
         char base = seq_nt16_str[bam_seqi(seq, i)];
@@ -84,6 +89,20 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& bas
         }
     }
 
+    // Calculate the mean base quality probability. NOTE(review): if read_length == 0 this divides by zero and log10() below returns -Inf — guard upstream if empty records are possible.
+    cumulative_base_prob /= (double)read_length;
+
+    // Convert the mean base quality probability to a Phred quality value
+    double read_mean_base_qual = -10.0 * log10(cumulative_base_prob);
+
+    // Update the per-read mean base quality distribution
+    int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual));
+    try {  // NOTE(review): if the distribution is a plain array, operator[] never throws — prefer an explicit bounds check
+        seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int]++;
+    } catch (const std::out_of_range& oor) {
+        printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
+    }
+
     // Calculate the read GC content percentage if a primary alignment
     if (is_primary) {
         double gc_content = read_gc_count / read_base_total;
@@ -117,9 +136,6 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
         }
     }
 
-    // Access the base quality histogram from the output_data object
-    uint64_t *base_quality_distribution = output_data.seq_quality_info.base_quality_distribution;
-
     // Do QC on each record and store the results in the output_data object
     while ((record_count < batch_size) && (exit_code >= 0)) {
         // Create a record object
@@ -210,11 +226,13 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
         // Unmapped reads
         if (record->core.flag & BAM_FUNMAP) {
             Basic_Seq_Statistics& basic_qc = output_data.unmapped_long_read_info;
-            this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, false);
+            Basic_Seq_Quality_Statistics& seq_quality_info = output_data.unmapped_seq_quality_info;
+            this->updateReadAndBaseCounts(record, basic_qc, seq_quality_info, false);
 
         } else {
             // Calculate base alignment statistics on non-secondary alignments
             Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info;
+            Basic_Seq_Quality_Statistics& seq_quality_info = output_data.seq_quality_info;
             if (!(record->core.flag & BAM_FSECONDARY)) {
 
                 // Determine if this is a forward or reverse read
@@ -328,9 +346,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
                             break;
                     }
                 }
-
-                // Update read and base QC
-                this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, true);
+                this->updateReadAndBaseCounts(record, basic_qc, seq_quality_info, true);
 
             } else {
                 printError("Error: Unknown alignment type with flag " + std::to_string(record->core.flag));
@@ -376,7 +392,7 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
     hts_set_threads(bam_file, thread_count);  // Enable multi-threading
     bam_hdr_t* bam_header = sam_hdr_read(bam_file);
     bam1_t* bam_record = bam_init1();
-    int64_t num_reads = 0;
+    int64_t read_index = 0;
 
     // Create a random number generator and seed it with the current time
     unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
@@ -390,33 +406,15 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
     }
     std::shuffle(read_indices.begin(), read_indices.end(), generator);
     read_indices.resize(sample_count);
-
-    // Print first 100 read indices sorted
-    // std::sort(read_indices.begin(), read_indices.end());
-    // std::cout << "First 100 read indices: " << std::endl;
-    // for (int i = 0; i < 100; i++) {
-    //     std::cout << read_indices[i] << std::endl;
-    // }
-
-    // Convert to a set for fast lookup
     std::unordered_set<int> read_indices_set(read_indices.begin(), read_indices.end());
-
-    std::cout << "Number of sampled reads = " << read_indices_set.size() << std::endl;
-
-    // Keep track of number of modified bases on the primary alignment vs other
-    // alignments (secondary, supplementary, unmapped)
-    int num_modified_bases_primary = 0;
-    int num_modified_bases_unmapped = 0;
-    int num_modified_bases_secondary = 0;
-    int num_modified_bases_supplementary = 0;
+    printMessage("Number of sampled reads for base modification analysis = " + std::to_string(read_indices_set.size()));
 
     while (sam_read1(bam_file, bam_header, bam_record) >= 0) {
 
-        if (read_indices_set.find(num_reads) == read_indices_set.end()) {
-            num_reads++;
-            continue;
-        }
-        num_reads++;
+        // if (read_indices_set.find(read_index) == read_indices_set.end()) {
+        //     read_index++;
+        //     continue;
+        // }
 
         // Base modification tag analysis
         // Follow here to get base modification tags:
@@ -425,11 +423,6 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
         int read_length = bam_record->core.l_qseq;
         hts_base_mod_state *state = hts_base_mod_state_alloc();
         std::vector<std::pair<int32_t, int>> c_modified_positions;  // C-modified positions for CpG analysis (chr->(position, strand))
-        // std::unordered_map<char, int> base_mod_counts;  // Type-specific
-        // base modification counts for the alignment
-        // std::unordered_map<char, std::unordered_map<char, int>>
-        // base_mod_counts;  // Type-specific base modification counts
-        // (canonical base -> modified base -> count)
         std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts;  // Type-specific base modification probabilities (canonical base -> modified base -> [read length %, probability])
         std::unordered_map<char, int> base_primary_count;  // Total base counts for the alignment
 
@@ -438,23 +431,7 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
         int ret = bam_parse_basemod(bam_record, state);
         bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
 
-        // Update the number of reads with base modifications for the
-        // primary alignment vs other alignments
-        if (ret >= 0) {
-            if (is_primary) {
-                num_modified_bases_primary++;
-            } else if (bam_record->core.flag & BAM_FUNMAP) {
-                num_modified_bases_unmapped++;
-            } else if (bam_record->core.flag & BAM_FSECONDARY) {
-                num_modified_bases_secondary++;
-            } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) {
-                num_modified_bases_supplementary++;
-            }
-        }
-
         if (ret >= 0 && is_primary) {
-            // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP);
-
             // Get the chromosome if alignments are present
             bool alignments_present = true;
             std::string chr;
@@ -513,15 +490,18 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
                         // Update the read length % and probability for the
                         // modification
                         double read_len_pct = (double) (pos + 1) / read_length;
-                        // std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl;
-                        final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability);  // Update the base modification probabilities
+                        // std::cout << "Read length %: " << read_len_pct
+                        //           << ", probability: " << probability << std::endl;
+
+                        // Update the base modification probabilities for
+                        // sampled reads only (10,000 maximum)
+                        if (read_indices_set.find(read_index) != read_indices_set.end()) {
+                            final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability);  // Update the base modification probabilities
+                        }
 
                         // Update counts for predictions exceeding the threshold
                         if (probability >= base_mod_threshold) {
                             final_output.updateBaseModCounts(mod_type, strand);  // Update the base modification counts
-                            // base_mod_counts[mod_type]++;  // Update the
-                            // type-specific count
-                            // base_mod_counts[canonical_base_char][mod_type]++;  // Update the type-specific count
 
                             // Store the modified positions for later CpG
                             // analysis if a C modification on a primary alignment
@@ -536,9 +516,6 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
                                 }
                             }
                         }
-                        // } else {
-                        //     base_primary_count[mod_type]++;  // Update the type-specific unmodified count
-                        // }
                     }
                 }
             }
@@ -561,61 +538,9 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out
         }
         hts_base_mod_state_free(state);  // Deallocate the base modification state object
 
-        // Calculate the base modification rate for the read
-        // double read_mod_rate = 0.0;
-        // if (read_length > 0) {
-        //     read_mod_rate = (double) read_mod_count / read_length;
-        // }
-
-        // Calculate the type-specific base modification rates for the read
-        // std::unordered_map<char, double> base_mod_rates;
-        // for (auto const &it : base_mod_counts) {
-        //     char canonical_base = it.first;
-        //     std::unordered_map<char, int> mod_counts = it.second;
-        //     double mod_rate = 0.0;
-        //     int total_base_count = base_primary_count[canonical_base];
-
-        //     // Calculate the modification rate for each modification type
-        //     for (auto const &it2 : mod_counts) {
-        //         char mod_type = it2.first;
-        //         int mod_count = it2.second;
-        //         double mod_rate = 0.0;
-        //         if (mod_count + total_base_count > 0) {
-        //             mod_rate = (double) mod_count / total_base_count;
-        //         }
-        //         base_mod_rates[mod_type] = mod_rate;
-        //     }
-        //     // for (auto const &it2 : mod_counts) {
-        //     //     total_mod_count += it2.second;
-        //     // }
-        //     // if (total_mod_count + total_base_count > 0) {
-        //     //     mod_rate = (double) total_mod_count / (total_mod_count + total_base_count);
-        //     // }
-        //     // base_mod_rates[canonical_base] = mod_rate;
-        // }
-        // for (auto const &it : base_mod_counts) {
-        //     char mod_type = it.first;
-        //     int mod_count = it.second;
-        //     double mod_rate = 0.0;
-        //     int total_base_count = base_primary_count[mod_type];
-        //     if (mod_count + unmod_count > 0) {
-        //         mod_rate = (double) mod_count / (mod_count + unmod_count);
-        //     }
-        //     // if (read_length > 0) {
-        //     //     mod_rate = (double) mod_count / read_length;
-        //     // }
-        //     base_mod_rates[mod_type] = mod_rate;
-        // }
-        // final_output.updateReadModRate(read_length, base_mod_rates);  // Update the output data
+        read_index++;  // Update the read index
     }
 
-    // Summary of base modification counts
-    printMessage("Base modification counts:");
-    printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary));
-    printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped));
-    printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary));
-    printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary));
-
     bam_destroy1(bam_record);
     bam_hdr_destroy(bam_header);
     sam_close(bam_file);
@@ -648,7 +573,6 @@ std::map<int, int> HTSReader::getQueryToRefMap(bam1_t *record)
                     query_to_ref_map[current_query_pos] = current_ref_pos + 1;  // Use 1-indexed positions
                     current_ref_pos++;
                     current_query_pos++;
-                    // query_to_ref_map[current_query_pos] = current_ref_pos + 1;  // Use 1-indexed positions
                 }
                 break;
             case BAM_CINS:
diff --git a/src/output_data.cpp b/src/output_data.cpp
index 2401847..283f458 100644
--- a/src/output_data.cpp
+++ b/src/output_data.cpp
@@ -305,20 +305,16 @@ void Output_BAM::updateReadModRate(int read_length, const std::unordered_map<cha
 
 std::vector<char> Output_BAM::getBaseModTypes()
 {
-    printMessage("[TEST] Getting base modification types.");
     std::vector<char> base_mod_types;
     if (this->base_mod_counts.empty()) {
         printError("No base modification counts found.");
         return base_mod_types;
     }
 
-    printMessage("[TEST2] Getting base modification types.");
     for (const auto& it : this->base_mod_counts) {
         base_mod_types.push_back(it.first);
     }
-    // for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) {
-    //     base_mod_types.push_back(it->first);
-    // }
+
     return base_mod_types;
 }
 
@@ -382,17 +378,17 @@ double Output_BAM::getNthReadLenPct(int read_index, char mod_type)
 
 double Output_BAM::getNthReadModProb(int read_index, char mod_type)
 {
-    double mod_prob = 0.0;
+    double mod_prob = -1.0;
     try {
         this->read_pct_len_vs_mod_prob.at(mod_type);
     } catch (const std::out_of_range& oor) {
-        std::cerr << "Error: Modification probability not found for type " << mod_type << std::endl;
+        return mod_prob;
     }
     try {
         mod_prob = this->read_pct_len_vs_mod_prob[mod_type].at(read_index).second;
     } catch (const std::out_of_range& oor) {
-        std::cerr << "Error: Modification probability not found for read index " << read_index << " and type " << mod_type << std::endl;
-        return 0.0;
+        // std::cerr << "Error: Modification probability not found for read index " << read_index << " and type " << mod_type << std::endl;
+        return -1.0;
     }
     return mod_prob;
 }
@@ -465,6 +461,11 @@ void Output_BAM::add(Output_BAM &output_data)
         this->seq_quality_info.base_quality_distribution[i] += output_data.seq_quality_info.base_quality_distribution[i];
     }
 
+    // Accumulate the other thread's per-read average base quality distribution
+    for (int i=0; i<MAX_READ_QUALITY; i++){
+        this->seq_quality_info.read_average_base_quality_distribution[i] += output_data.seq_quality_info.read_average_base_quality_distribution[i];
+    }
+
     this->num_matched_bases += output_data.num_matched_bases;
     this->num_mismatched_bases += output_data.num_mismatched_bases;
     this->num_ins_bases += output_data.num_ins_bases;
@@ -686,9 +687,7 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
     std::string read_name_str;
     std::getline( iss_header, read_name_str, ' ' );
     read_name = read_name_str.c_str();
-
-    // Access the sequence data
-    std::string sequence_data_str = fq[1];
+    std::string sequence_data_str = fq[1];  // Access the sequence data
 
     // Update the total number of bases
     int base_count = sequence_data_str.length();
@@ -714,7 +713,6 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
 
     // Update the base quality and GC content information
     int gc_count = 0;
-    // double read_mean_base_qual = 0;
     double cumulative_base_prob = 0;  // Read cumulative base quality probability
     char current_base;
     int base_quality_value;
@@ -751,7 +749,6 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
 
         // Convert the Phred quality value to a probability
         double base_quality_prob = pow(10, -base_quality_value / 10.0);
-        // read_mean_base_qual += (double)base_quality_value;
         cumulative_base_prob += base_quality_prob;
     }
 
@@ -761,11 +758,6 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
     // Convert the mean base quality probability to a Phred quality value
     double read_mean_base_qual = -10.0 * log10(cumulative_base_prob);
 
-    // printMessage("Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual));
-
-    // Calculate percent guanine & cytosine
-    // gc_content_pct = 100.0 *( (double)gc_count / (double)base_count );
-
     // Update the per-read GC content distribution
     double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count);
     int gc_content_int = static_cast<int>(std::round(gc_content_pct));
@@ -776,16 +768,9 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_
     }
 
     // Update the per-read base quality distribution
-    // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count);
-    // unsigned int read_mean_base_qual_int = static_cast<unsigned
-    // int>(std::round(read_mean_base_qual_pct));
     int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual));
 
-    // printMessage("Rounded Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual_int));
-
     try {
-        // seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int]
-        // += 1;
         seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1;
     } catch (const std::out_of_range& oor) {
         printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value");
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 472f935..da6d017 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -320,20 +320,20 @@ def read_gc_content_histogram(data, font_size, plot_filepaths):
     gc_content = np.array(data.read_gc_content_count)
 
     # Calculate the percentage of reads with a GC content of <30%
-    gc_content_below_30 = np.sum(gc_content[:30])
-    logging.info("[TEST] Percentage of reads with GC content <30%: {}".format(gc_content_below_30 / np.sum(gc_content)))
+    # gc_content_below_30 = np.sum(gc_content[:30])
+    # logging.info("[TEST] Percentage of reads with GC content <30%: {}".format(gc_content_below_30 / np.sum(gc_content)))
 
-    # Calculate the percentage of reads with a GC content of >70%
-    gc_content_above_70 = np.sum(gc_content[70:])
-    logging.info("[TEST] Percentage of reads with GC content >70%: {}".format(gc_content_above_70 / np.sum(gc_content)))
+    # # Calculate the percentage of reads with a GC content of >70%
+    # gc_content_above_70 = np.sum(gc_content[70:])
+    # logging.info("[TEST] Percentage of reads with GC content >70%: {}".format(gc_content_above_70 / np.sum(gc_content)))
 
-    # Calculate the percentage of reads with a GC content of <20%
-    gc_content_below_20 = np.sum(gc_content[:20])
-    logging.info("[TEST] Percentage of reads with GC content <20%: {}".format(gc_content_below_20 / np.sum(gc_content)))
+    # # Calculate the percentage of reads with a GC content of <20%
+    # gc_content_below_20 = np.sum(gc_content[:20])
+    # logging.info("[TEST] Percentage of reads with GC content <20%: {}".format(gc_content_below_20 / np.sum(gc_content)))
 
-    # Calculate the percentage of reads with a GC content of >60%
-    gc_content_above_60 = np.sum(gc_content[60:])
-    logging.info("[TEST] Percentage of reads with GC content >60%: {}".format(gc_content_above_60 / np.sum(gc_content)))
+    # # Calculate the percentage of reads with a GC content of >60%
+    # gc_content_above_60 = np.sum(gc_content[60:])
+    # logging.info("[TEST] Percentage of reads with GC content >60%: {}".format(gc_content_above_60 / np.sum(gc_content)))
 
     # Set the error flag if the GC content is below 20% for more than 10% of the
     # reads
@@ -381,6 +381,8 @@ def base_quality(data, font_size, plot_filepaths):
     """Plot the base quality distribution."""
     xd = np.arange(MAX_BASE_QUALITY)
     yd = np.array(data.base_quality_distribution)
+    xd = xd[:60]  # NOTE(review): plot truncated at Q60 — prefer a named constant
+    yd = yd[:60]
     fig = go.Figure()
 
     customdata = np.dstack((xd, yd))[0, :, :]
@@ -411,9 +413,10 @@ def read_avg_base_quality(data, font_size, plot_filepaths):
     """Plot the read average base quality distribution."""
     xd = np.arange(MAX_READ_QUALITY)
     yd = np.array(data.read_average_base_quality_distribution)
+    xd = xd[:60]  # NOTE(review): plot truncated at Q60 — prefer a named constant
+    yd = yd[:60]
     fig = go.Figure()
     fig.add_trace(go.Bar(x=xd, y=yd, marker_color='#36a5c7'))
-
     fig.update_xaxes(ticks="outside", dtick=10, title_text='Average Base Quality', title_standoff=0)
     fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0)
     fig.update_layout(font=dict(size=PLOT_FONT_SIZE))  # Set font size
@@ -524,19 +527,15 @@ def plot(output_data, para_dict, file_type):
     # Base quality histogram
     if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt':
         seq_quality_info = output_data.seq_quality_info
-
-        # Base quality histogram
         base_quality(seq_quality_info, font_size, plot_filepaths)
         
     # Read average base quality histogram
-    if file_type == 'FASTQ':
+    if file_type == 'FASTQ' or file_type == 'FAST5' or file_type == 'BAM':
         read_avg_base_quality(seq_quality_info, font_size, plot_filepaths)
 
+    # Plot the read alignments and base alignments if the file type is BAM
     if file_type == 'BAM':
-        # Plot read alignment QC
         plot_alignment_numbers(output_data, plot_filepaths)
-        
-        # Plot base alignment and error QC
         plot_errors(output_data, plot_filepaths)
         
     elif file_type == 'FAST5s':
@@ -659,7 +658,6 @@ def plot_pod5(pod5_output, para_dict, bam_output=None):
             xaxis=dict(range=[0, 100])
         )
         fig.update_traces(marker={'size': marker_size})
-        # fig.update_xaxes(title="Index")
 
         # Append the dynamic HTML object to the output structure
         dynamic_html = fig.to_html(full_html=False)
@@ -1048,9 +1046,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     table_error_flag = False
 
     # Print the types of modifications
-    logging.info("Getting base modification types")
     base_mod_types = output_data.getBaseModTypes()
-    logging.info("[TEST] Modification types: ")
     if base_mod_types:
         logging.info("Modification types: ")
         for mod_type in base_mod_types:
@@ -1058,18 +1054,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
         logging.info("Getting base modification statistics")
 
-        # Get the read length vs. base modification rate data for each
-        # # modification type
-        # logging.info("Getting mod data size")
-        # read_mod_data_size = output_data.getReadModDataSize()
-        # logging.info("Mod data size: {}".format(read_mod_data_size))
-
-        # # Choose a maximum of 10,000 reads to randomly sample for the plot
-        # max_reads = min(read_mod_data_size, 10000)        
-        # # read_indices = set(sample(range(read_mod_data_size), max_reads))
-        # read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False)
-        # read_length_mod_rates = {}
-
         # Get the read length (%) vs. base modification probability data for
         # each sampled read
         sample_count = 10000
@@ -1078,8 +1062,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         for mod_type in base_mod_types:
             for i in range(sample_count):
                 try:
-                    pct = output_data.getNthReadLenPct(i, mod_type)
                     prob = output_data.getNthReadModProb(i, mod_type)
+                    if prob == -1:  # Skip if no modifications for the read
+                        continue
+
+                    pct = output_data.getNthReadLenPct(i, mod_type)
                     read_len_pct.append(pct)
                     mod_prob.append(prob)
                 except Exception as e:
@@ -1097,73 +1084,28 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
                             'N': 'Amb. N', \
                             'v': 'pseU'}
 
-        # Create a plot of pct read length vs. base modification probability for
-        # each modification type, as well as a histogram of the average base
-        # modification probability for 100 bins of the read length
-
-        # Make a subplot of two columns for the read length vs. base
-        # modification probability and the histogram of the average base
-        # modification probability for each modification type
         fig = make_subplots(rows=len(base_mod_types), cols=2, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=[f"{mod_char_to_name[mod_type]} Modification Probability" for mod_type in base_mod_types])
 
         for i, mod_type in enumerate(base_mod_types):
             logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}")
 
-            # Add the trace for the read length vs. base modification
+            # Add the trace for the read length (%) vs. base modification
             # probability scatter plot
             fig.add_trace(go.Scatter
                 (x=read_len_pct, y=mod_prob, mode='markers', name=mod_char_to_name[mod_type], marker=dict(size=5), showlegend=False),
                 row=i + 1, col=1)
             
-            # Print the first 50 pairs sorted by read length for debugging
-            # read_len_pct, mod_prob = zip(*sorted(zip(read_len_pct, mod_prob)))
-            # if i == 0:
-            #     for j in range(50):
-            #         logging.info(f"Read length: {read_len_pct[j]}, Modification probability: {mod_prob[j]}")
-            
             # Create a histogram of the base modification probabilities
             base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20)
             fig.add_trace(base_mod_prob_hist, row=i + 1, col=2)
-            
-            # Add a bar plot of the average base modification probability for
-            # 100 bins of the read length
-            # bins = np.linspace(0, 100, 11)  # 10 bins (0-10%, 10-20%, ..., 90-100%)
-            # bin_centers = (bins[:-1] + bins[1:]) / 2  # Bin centers for plotting
-
-            # # Get the average probability per bin
-            # avg_prob_per_bin = np.zeros(10)
-            # bin_indices = np.digitize(read_len_pct, bins) - 1
-            # for j in range(10):  # Loop over bins
-            #     bin_mask = (bin_indices == j)
-            #     if np.any(bin_mask):
-            #         avg_prob_per_bin[j] = np.mean(mod_prob[bin_mask])
-            #         logging.info(f"Bin {j}: {avg_prob_per_bin[j]}")
-
-            # # Create the bar plot
-
-            # # Print the bins and read length percentages for the first 10 reads
-            # # for debugging
-            # if i == 0:
-            #     logging.info("Bins: {}".format(bins))
-            #     logging.info("Bin indices: {}".format(bin_indices[:10]))
-            #     logging.info("Read length percentages: {}".format(read_len_pct[:10]))
-
-            # # Create the bar plot
-            # fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2)
 
             # Update the plot style
             fig.update_xaxes(title="Read Length (%)", row=i + 1, col=1)
             fig.update_yaxes(title="Modification Probability", row=i + 1, col=1)
             fig.update_xaxes(title="Modification Probability", row=i + 1, col=2)
             fig.update_yaxes(title="Frequency", row=i + 1, col=2)
-            # fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2)
-            # fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2)
-
-            # Set the range of the y-axis to 0-1
             fig.update_yaxes(range=[0, 1], row=i + 1, col=1)
-            # fig.update_yaxes(range=[0, 1], row=i + 1, col=2)
 
-        # Update the plot layout
         fig.update_layout(title="Read Length vs. Base Modification Probability", font=dict(size=PLOT_FONT_SIZE))
             
         # Generate the HTML
@@ -1175,7 +1117,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     else:
         logging.warning("WARNING: No modification types found")
 
-    # Create the base modification statistics table'
+    # Create the base modification statistics table
     logging.info("Creating the base modification statistics table")
     table_str = "<table>\n<tbody>"
     row_str, row_flag = format_row("Total Unfiltered Predictions", [output_data.modified_prediction_count], 'int', None)
@@ -1208,7 +1150,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
 
     # Add the modification type data
     for mod_type in base_mod_types:
-        # mod_name = mod_char_to_name[mod_type]
         try:
             mod_name = mod_char_to_name[mod_type]
         except KeyError:
@@ -1234,13 +1175,13 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     # Finish the table
     table_str += "\n</tbody>\n</table>"
 
-    # Add the help text
-    table_str += """
-        <div class="help-icon">
-            💡
-            <div class="tooltip">{}</div>
-        </div>
-        """.format(help_text)
+    # # Add the help text
+    # table_str += """
+    #     <div class="help-icon">
+    #         💡
+    #         <div class="tooltip">{}</div>
+    #     </div>
+    #     """.format(help_text)
     
     # Add text below the table suggesting the user to use Modkit for more
     # detailed analysis on per-site modification rates
@@ -1322,9 +1263,6 @@ def plot_alignment_numbers(data, plot_filepaths):
     # Set the error flag if primary alignments equal 0
     error_flag = data.num_primary_alignment == 0
 
-    logging.info("[TEST] Number of reverse alignments: {}".format(data.reverse_alignment))
-    logging.info("[TEST] Number of forward alignments: {}".format(data.forward_alignment))
-
     # Create a horizontally aligned bar plot trace from the data using plotly
     trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment,
                       data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment,

From c257da611a4ae2cfcb4c6c42fcb504f808d644f9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 11:24:59 -0500
Subject: [PATCH 18/25] Reduce debug output

---
 src/cli.py        |  2 +-
 src/plot_utils.py | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/cli.py b/src/cli.py
index 80b23ba..bbee8cf 100644
--- a/src/cli.py
+++ b/src/cli.py
@@ -258,7 +258,7 @@ def bam_module(margs):
             # If base modifications were found, add the base modification plots
             # after the first table
             if bam_output.sample_modified_base_count > 0:
-                logging.info("Base modifications found. Adding base modification plots to the HTML report.")
+                # logging.info("Base modifications found. Adding base modification plots to the HTML report.")
                 qc_info_list.insert(1, "read_length_mod_rates")  # Read length modification rates
                 qc_info_list.insert(1, "base_mods")
 
diff --git a/src/plot_utils.py b/src/plot_utils.py
index da6d017..3e17554 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -1048,11 +1048,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
     # Print the types of modifications
     base_mod_types = output_data.getBaseModTypes()
     if base_mod_types:
-        logging.info("Modification types: ")
-        for mod_type in base_mod_types:
-            logging.info(mod_type)
+        # logging.info("Modification types: ")
+        # for mod_type in base_mod_types:
+        #     logging.info(mod_type)
 
-        logging.info("Getting base modification statistics")
+        # logging.info("Getting base modification statistics")
 
         # Get the read length (%) vs. base modification probability data for
         # each sampled read
@@ -1087,7 +1087,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         fig = make_subplots(rows=len(base_mod_types), cols=2, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=[f"{mod_char_to_name[mod_type]} Modification Probability" for mod_type in base_mod_types])
 
         for i, mod_type in enumerate(base_mod_types):
-            logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}")
+            # logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}")
 
             # Add the trace for the read length (%) vs. base modification
             # probability scatter plot
@@ -1112,7 +1112,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th
         if len(base_mod_types) > 0:
             plot_height = 500 * len(base_mod_types)
             plot_width = 700 * 2
-            logging.info("Saving the read length vs. modification rates plot")
+            logging.info("Generating the read length vs. modification rates plot")
             plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=plot_width)
     else:
         logging.warning("WARNING: No modification types found")

From 82cddc463042e3b0d95ea6b30ccb4afe3b84ca12 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 11:46:26 -0500
Subject: [PATCH 19/25] Debug compilation

---
 Makefile           |  3 +++
 conda/meta.yaml    |  8 ++++----
 src/hts_reader.cpp |  2 +-
 src/plot_utils.py  | 18 +++---------------
 4 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/Makefile b/Makefile
index 5b6392f..529dd03 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,9 @@ all: swig_build compile
 swig_build:
 	swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i
 
+# Create the lib directory if it doesn't exist
+	mkdir -p $(LIB_DIR)
+
 # Compile the C++ shared libraries into lib/
 compile:
 	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \
diff --git a/conda/meta.yaml b/conda/meta.yaml
index edf847d..9253229 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,14 +1,14 @@
 {% set version = "1.4.0" %}
-# {% set revision = "b06670513616fd6342233c1c77e6d0bcf138b3bc" %}
+{% set revision = "c257da611a4ae2cfcb4c6c42fcb504f808d644f9" %}
 
 package:
   name: longreadsum
   version: {{ version }}
 
 source:
-  path: ../
-  # git_url: https://github.com/WGLab/LongReadSum.git
-  # git_rev: {{ revision }}
+  git_url: https://github.com/WGLab/LongReadSum.git
+  git_rev: {{ revision }}
+  # path: ../
 
 channels:
   - conda-forge
diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp
index 1a7e53f..70a6410 100644
--- a/src/hts_reader.cpp
+++ b/src/hts_reader.cpp
@@ -184,7 +184,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
             // Set the atomic flag and print a message if the POD5 tags are
             // present
             if (!this->has_pod5_tags.test_and_set()) {
-                printMessage("POD5 tags found (ts, ns, mv)");
+                printMessage("POD5 basecall move table tags found (ts, ns, mv)");
             }
 
             // Get the ts and ns tags
diff --git a/src/plot_utils.py b/src/plot_utils.py
index 3e17554..c5f2bc3 100644
--- a/src/plot_utils.py
+++ b/src/plot_utils.py
@@ -437,27 +437,16 @@ def read_avg_base_quality(data, font_size, plot_filepaths):
 
 def plot_base_modifications(base_modifications):
     """Plot the base modifications per location."""
-    # Get the modification types
-    modification_types = list(base_modifications.keys())
 
-    # Create the figure
+    # Add a plot for each modification type
     fig = go.Figure()
-
-    # Add a trace for each modification type
+    modification_types = list(base_modifications.keys())
     for mod_type in modification_types:
-        # Get the modification data
         mod_data = base_modifications[mod_type]
-
-        # Create the trace
         trace = go.Scattergl(x=mod_data['positions'], y=mod_data['counts'], mode='markers', name=mod_type)
-
-        # Add the trace to the figure
         fig.add_trace(trace)
 
-    # Update the layout
     fig.update_layout(title='Base Modifications', xaxis_title='Position', yaxis_title='Counts', showlegend=True, font=dict(size=PLOT_FONT_SIZE))
-
-    # Generate the HTML
     html_obj = fig.to_html(full_html=False, default_height=500, default_width=700)
 
     return html_obj
@@ -545,10 +534,9 @@ def plot(output_data, para_dict, file_type):
 
 def plot_pod5(pod5_output, para_dict, bam_output=None):
     """Plot the ONT POD5 signal data for a random sample of reads."""
+    
     out_path = para_dict["output_folder"]
     plot_filepaths = getDefaultPlotFilenames()
-
-    # Create the summary table
     create_pod5_table(pod5_output, plot_filepaths)
 
     # Generate the signal plots

From 6d588bbb1ba19bd8d28a32eec798b1302437e144 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 12:13:40 -0500
Subject: [PATCH 20/25] Revert Makefile to hardcoded conda environment paths

---
 Makefile | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 529dd03..b2d1097 100644
--- a/Makefile
+++ b/Makefile
@@ -3,9 +3,11 @@ SRC_DIR := $(CURDIR)/src
 LIB_DIR := $(CURDIR)/lib
 
 # Set the library paths for the compiler
-CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
-LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
-INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
+# CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX)
+# LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib
+# INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include
+LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib
+INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include
 
 # All targets
 all: swig_build compile
@@ -19,9 +21,11 @@ swig_build:
 
 # Compile the C++ shared libraries into lib/
 compile:
-	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \
+	LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \
 	CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 setup.py build_ext --build-lib $(LIB_DIR)
 
 # Clean the build directory
 clean:
 	$(RM) -r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/
+
+# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \

From c6c34f1f7e2521d166a15378a27440d73d792f3f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 12:56:12 -0500
Subject: [PATCH 21/25] Add build debug output

---
 conda/build.sh  | 9 +++++++++
 environment.yml | 2 --
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/conda/build.sh b/conda/build.sh
index 95f8d11..61720bc 100644
--- a/conda/build.sh
+++ b/conda/build.sh
@@ -3,18 +3,27 @@
 # Add the library path to the LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PREFIX}/lib
 
+# Ensure the lib directory exists
+mkdir -p "${SRC_DIR}"/lib
+
 # Generate the SWIG files
+echo "Generating SWIG files..."
 swig -c++ -python -outdir "${SRC_DIR}"/lib -I"${SRC_DIR}"/include -I"${PREFIX}"/include -o "${SRC_DIR}"/src/lrst_wrap.cpp "${SRC_DIR}"/src/lrst.i
 
 # Generate the shared library
+echo "Building the shared library..."
 $PYTHON setup.py -I"${PREFIX}"/include -L"${PREFIX}"/lib install
 
 # Create the src directory
 mkdir -p "${PREFIX}"/src
 
 # Copy source files to the bin directory
+echo "Copying source files..."
 cp -r "${SRC_DIR}"/src/*.py "${PREFIX}"/bin
 
 # Copy the SWIG generated library to the lib directory
+echo "Copying SWIG generated library..."
 cp -r "${SRC_DIR}"/lib/*.py "${PREFIX}"/lib
 cp -r "${SRC_DIR}"/lib/*.so "${PREFIX}"/lib
+
+echo "Build complete."
diff --git a/environment.yml b/environment.yml
index a76645c..c1d3ef4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,7 +4,6 @@ channels:
   - bioconda
   - defaults
   - jannessp  # for pod5
-  - plotly  # for kaleido
 dependencies:
   - python=3.9
   - numpy
@@ -16,4 +15,3 @@ dependencies:
   - pytest
   - pod5
   - pyarrow
-  - python-kaleido

From da5cb8d06ba0e8e9b8551c37de8e5cf292e3d399 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 13:09:07 -0500
Subject: [PATCH 22/25] Add GitHub Actions verbose build output

---
 .github/workflows/build-test.yml | 2 +-
 environment.yml                  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml
index 9694194..6c61557 100644
--- a/.github/workflows/build-test.yml
+++ b/.github/workflows/build-test.yml
@@ -35,7 +35,7 @@ jobs:
 
     - name: Build LongReadSum
       shell: bash --login {0}  # --login enables PATH variable access
-      run: make
+      run: make -d
 
     - name: Run tests
       shell: bash --login {0}
diff --git a/environment.yml b/environment.yml
index c1d3ef4..2cc96c0 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,11 +1,11 @@
 name: longreadsum
 channels:
   - conda-forge
+  - jannessp  # for pod5
   - bioconda
   - defaults
-  - jannessp  # for pod5
 dependencies:
-  - python=3.9
+  - python
   - numpy
   - hdf5
   - ont_vbz_hdf_plugin
@@ -13,5 +13,5 @@ dependencies:
   - swig
   - plotly
   - pytest
-  - pod5
+  - jannessp::pod5
   - pyarrow

From 906c70098115715e0ff12717f4ed915f86e965f1 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 13:18:01 -0500
Subject: [PATCH 23/25] Update Makefile and pin environment dependencies

---
 Makefile        |  6 +-----
 environment.yml | 10 +++++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/Makefile b/Makefile
index b2d1097..1c23e60 100644
--- a/Makefile
+++ b/Makefile
@@ -14,10 +14,8 @@ all: swig_build compile
 
 # Generate the SWIG Python/C++ wrappers
 swig_build:
-	swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i
-
-# Create the lib directory if it doesn't exist
 	mkdir -p $(LIB_DIR)
+	swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i
 
 # Compile the C++ shared libraries into lib/
 compile:
@@ -27,5 +25,3 @@ compile:
 # Clean the build directory
 clean:
 	$(RM) -r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/
-
-# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \
diff --git a/environment.yml b/environment.yml
index 2cc96c0..91b434e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,15 +1,15 @@
 name: longreadsum
 channels:
   - conda-forge
-  - jannessp  # for pod5
-  - bioconda
   - defaults
+  - bioconda  # for htslib
+  - jannessp  # for pod5
 dependencies:
   - python
   - numpy
-  - hdf5
-  - ont_vbz_hdf_plugin
-  - htslib=1.20
+  - hdf5=1.10.6
+  - bioconda::ont_vbz_hdf_plugin
+  - bioconda::htslib=1.20
   - swig
   - plotly
   - pytest

From be0c09c09f1e470ef132b6b073d38e2f5523dc30 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 18:29:47 -0500
Subject: [PATCH 24/25] Fix conda build environment

---
 conda/meta.yaml | 7 +++----
 environment.yml | 8 ++++----
 setup.py        | 2 +-
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index 9253229..e1d7004 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -29,18 +29,17 @@ requirements:
   host:
     - python=3.9
     - swig
-    - hdf5
     - htslib=1.20
+    - ont_vbz_hdf_plugin  # Contains HDF5 as a dependency as well
     # - jannessp::pod5
     # - jannessp::lib-pod5
   run:
     - python=3.9
     - numpy
-    - hdf5
     - ont_vbz_hdf_plugin
-    - htslib=1.20
+    - bioconda::htslib=1.20
     - plotly
-    - janessp::pod5
+    - jannessp::pod5
     - pyarrow
     # - janessp::lib-pod5
 
diff --git a/environment.yml b/environment.yml
index 91b434e..b18d99e 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,14 +1,14 @@
 name: longreadsum
 channels:
   - conda-forge
-  - defaults
-  - bioconda  # for htslib
   - jannessp  # for pod5
+  - bioconda
+  - defaults
+
 dependencies:
   - python
   - numpy
-  - hdf5=1.10.6
-  - bioconda::ont_vbz_hdf_plugin
+  - ont_vbz_hdf_plugin
   - bioconda::htslib=1.20
   - swig
   - plotly
diff --git a/setup.py b/setup.py
index 3c05b69..f136db5 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 
 # Set up the module
 setup(name="longreadsum",
-      version='1.4.0',
+      version='1.5.0',
       author="WGLab",
       description="""A fast and flexible QC tool for long read sequencing data""",
       ext_modules=[lrst_mod],

From 3e091bb8f9d4cbf9fde79b6d48a21c8b041abea1 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 21 Jan 2025 18:54:27 -0500
Subject: [PATCH 25/25] Update build commit

---
 conda/meta.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conda/meta.yaml b/conda/meta.yaml
index e1d7004..ded0040 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,5 +1,5 @@
-{% set version = "1.4.0" %}
-{% set revision = "c257da611a4ae2cfcb4c6c42fcb504f808d644f9" %}
+{% set version = "1.5.0" %}
+{% set revision = "47f1310e02ee06f32b8e34417e207f245828a319" %}
 
 package:
   name: longreadsum