From be712e612c77ae63aab5a9fb46a40e64fd054fae Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Mon, 23 Dec 2024 15:58:15 -0500 Subject: [PATCH 01/25] Reduce seqtxt memory usage --- Makefile | 11 ++- include/seqtxt_module.h | 13 +-- include/utils.h | 2 + src/cli.py | 14 +-- src/plot_utils.py | 73 +++++++------- src/seqtxt_module.cpp | 212 ++++++++++++++++++++-------------------- src/utils.cpp | 17 +++- 7 files changed, 176 insertions(+), 166 deletions(-) diff --git a/Makefile b/Makefile index 7b1e50e..855290e 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,11 @@ SRC_DIR := $(CURDIR)/src LIB_DIR := $(CURDIR)/lib # Set the library paths for the compiler -LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib -INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include +#LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib +#INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include +CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX) +LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib +INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include # All targets all: swig_build compile @@ -15,5 +18,7 @@ swig_build: # Compile the C++ shared libraries into lib/ compile: - LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \ + LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \ CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 setup.py build_ext --build-lib $(LIB_DIR) + +# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \ diff --git a/include/seqtxt_module.h b/include/seqtxt_module.h index 1035598..d2f0ce5 100644 --- a/include/seqtxt_module.h +++ b/include/seqtxt_module.h @@ -37,7 +37,7 @@ class SeqTxt_Thread_data { Output_SeqTxt t_output_SeqTxt_; std::string current_line; // Current line being read from the file - size_t read_ss_record(std::ifstream* file_stream, std::map<std::string, int> header_columns); + 
size_t read_ss_record(std::ifstream& file_stream, std::map<std::string, int> header_columns); std::map<std::string, int> getHeaderColumns(); SeqTxt_Thread_data(Input_Para& ref_input_op, std::map<std::string, int> header_columns, int p_thread_id, int p_batch_size); @@ -60,20 +60,13 @@ class SeqTxt_Module{ static std::mutex myMutex_readSeqTxt; static std::mutex myMutex_output; static size_t batch_size_of_record; - Input_Para _input_parameters; - - std::ifstream *input_file_stream; // Stream for the input text file + std::ifstream input_file_stream; // Stream for the input text file std::vector<std::thread> m_threads; - - int has_error; // Methods - // Assign threads - static void SeqTxt_do_thread(std::ifstream* file_stream, Input_Para& ref_input_op, int thread_id, SeqTxt_Thread_data& ref_thread_data, Output_SeqTxt& ref_output); - - // Generate statistics + static void SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref_input_op, int thread_id, Output_SeqTxt& ref_output, std::map<std::string, int> header_columns, size_t batch_size_of_record); int generateStatistics( Output_SeqTxt& t_output_SeqTxt_info); SeqTxt_Module(Input_Para& _m_input); diff --git a/include/utils.h b/include/utils.h index 9637f1e..1828dbb 100644 --- a/include/utils.h +++ b/include/utils.h @@ -12,4 +12,6 @@ void printMessage(std::string message); // Print an error message to stderr in a thread-safe manner void printError(std::string message); +void printMemoryUsage(const std::string &functionName); + #endif // UTILS_H diff --git a/src/cli.py b/src/cli.py index d4c8739..29549f2 100644 --- a/src/cli.py +++ b/src/cli.py @@ -154,7 +154,7 @@ def fq_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fq_output, param_dict, 'FASTQ') fq_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "base_quality", + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", 
"read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False) fq_html_gen.generate_html() @@ -192,7 +192,7 @@ def fa_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fa_output, param_dict, 'FASTA') fa_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "base_counts"], "FASTA QC", + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts"], "FASTA QC", param_dict], plot_filepaths, static=True) fa_html_gen.generate_html() logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -245,7 +245,7 @@ def bam_module(margs): plot_filepaths = plot(bam_output, param_dict, 'BAM') # Set the list of QC information to display - qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality"] + qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"] # If base modifications were found, add the base modification plots # after the first table @@ -310,7 +310,7 @@ def rrms_module(margs): # Generate the HTML report bam_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "base_counts", "basic_info", + [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) bam_html_gen.generate_html() logging.info("Done. 
Output files are in %s", param_dict["output_folder"]) @@ -383,7 +383,7 @@ def fast5_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fast5_output, param_dict, 'FAST5') fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality", + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html() logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -429,7 +429,7 @@ def fast5_signal_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fast5_output, param_dict, 'FAST5s') fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality", + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html(signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -517,7 +517,7 @@ def pod5_module(margs): # plot_filepaths = plot(read_signal_dict, param_dict, 'POD5') webpage_title = "POD5 QC" fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "base_counts", "basic_info", "base_quality", + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html(signal_plots=True) logging.info("Done. 
Output files are in %s", param_dict["output_folder"]) diff --git a/src/plot_utils.py b/src/plot_utils.py index 669b602..ac03046 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -34,6 +34,8 @@ def getDefaultPlotFilenames(): "basic_info": {'title': "Basic Statistics", 'description': "Basic Statistics", 'summary': ""}, "read_length_hist": {'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""}, + + "gc_content_hist": {'title': "GC Content Histogram", 'description': "GC Content Histogram", 'summary': ""}, "base_quality": {'title': "Base Quality Histogram", 'description': "Base Quality Histogram"}, @@ -251,9 +253,6 @@ def read_lengths_histogram(data, font_size): hist, _ = np.histogram(read_lengths, bins=edges) # Create a figure with two subplots - # fig = make_subplots( - # rows=2, cols=1, - # subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.5) fig = make_subplots( rows=1, cols=2, subplot_titles=("Read Length Histogram", "Log Read Length Histogram"), vertical_spacing=0.0) @@ -276,13 +275,11 @@ def read_lengths_histogram(data, font_size): # Log histogram # Get the log10 histogram of read lengths read_lengths_log = np.log10(read_lengths, out=np.zeros_like(read_lengths), where=(read_lengths != 0)) - # log_hist, log_edges = np.histogram(read_lengths_log, bins=bin_count) log_edges = np.linspace(0, np.max(read_lengths_log), num=log_bin_count + 1) log_hist, _ = np.histogram(read_lengths_log, bins=log_edges) xd = log_edges log_bindata = np.dstack((np.power(10, log_edges)[:-1], np.power(10, log_edges)[1:], log_hist))[0, :, :] - # log_bin_centers = np.round((log_bindata[:, 0] + log_bindata[:, 1]) / 2, 0) yd = log_hist fig.add_trace(go.Bar(x=xd, y=yd, customdata=log_bindata, hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp<br>Counts:%{customdata[2]:.0f}<extra></extra>', @@ -297,17 +294,7 @@ def read_lengths_histogram(data, font_size): 
fig.update_annotations(font=dict(color="white")) # Set tick value range for the log scale - # Use the bin edge centers as the tick values - # tick_vals = (log_edges[:-1] + log_edges[1:]) / 2 - # tick_labels = ['{:,}'.format(int(10 ** x)) for x in tick_vals] tick_vals = log_edges - # tick_labels = ['{:,}'.format(int(10 ** x)) for x in tick_vals] - - # Format the tick labels to be in kilobases (kb) if the value is greater than - # 1000, and in bases (b) otherwise - # tick_labels = ['{:,}kb'.format(int(x / 1000)) for x in tick_vals] - # tick_labels = ['{:,}kb'.format(int(x) for x in log_bin_centers) if x > - # 1000 else '{:,}b'.format(int(x)) for x in log_bin_centers] tick_labels = [] for i in range(len(log_bindata)): # Format the tick labels to be in kilobases (kb) if the value is greater @@ -322,21 +309,7 @@ def read_lengths_histogram(data, font_size): tick_labels.append('{}-{}'.format(left_val_str, right_val_str)) fig.update_xaxes(ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=1, col=log_col, tickvals=tick_vals, ticktext=tick_labels, tickangle=45) - # fig.update_xaxes(range=[0, np.max(log_edges)], ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1) - # fig.update_xaxes(range=[0, np.max(log_edges)], ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1, tickvals=tick_vals) - # tick_vals = list(range(0, 5)) - # fig.update_xaxes( - # range=[0, np.max(log_edges)], - # tickmode='array', - # tickvals=tick_vals, - # ticktext=['{:,}'.format(10 ** x) for x in tick_vals], - # ticks="outside", title_text='Read Length (Log Scale)', title_standoff=0, row=2, col=1) - - # Set the tick value range for the linear scale - # tick_vals = (edges[:-1] + edges[1:]) / 2 - # tick_labels = ['{:,}'.format(int(x)) for x in tick_vals] tick_vals = edges - # tick_labels = ['{:,}'.format(int(x)) for x in tick_vals] # Format the tick labels to be the range of the bin centers tick_labels = [] @@ -352,26 
+325,39 @@ def read_lengths_histogram(data, font_size): tick_labels.append('{}-{}'.format(left_val_str, right_val_str)) - # tick_labels = ['{:,}kb'.format(int(x / 1000)) for x in tick_vals] - # tick_labels = ['{:,}kb'.format(int(x)) if x > 1000 else - # '{:,}b'.format(int(x)) for x in linear_bin_centers] linear_col=1 fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=linear_col, tickvals=tick_vals, ticktext=tick_labels, tickangle=45) - # fig.update_xaxes(ticks="outside", title_text='Read Length', title_standoff=0, row=1, col=1, range=[0, np.max(edges)], tickvals=tick_vals) fig.update_yaxes(ticks="outside", title_text='Counts', title_standoff=0) # Update the layout fig.update_layout(showlegend=False, autosize=True, font=dict(size=PLOT_FONT_SIZE)) - # Set font sizes - # fig.update_layout(showlegend=False, autosize=False) - # fig.update_layout(font=dict(size=font_size), autosize=True) fig.update_annotations(font_size=annotation_size) - # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) html_obj = fig.to_html(full_html=False, default_height=500, default_width=1200) return html_obj +def read_gc_content_histogram(data, font_size): + """Plot the per-read GC content histogram.""" + + # Get the GC content data + gc_content = np.array(data.read_gc_content_count) + + # Create a histogram of the GC content (0-100% with 1% bins) + gc_content_bins = np.linspace(0, 100, 101) + gc_hist, _ = np.histogram(gc_content, bins=gc_content_bins) + + # Create the figure + fig = go.Figure() + fig.add_trace(go.Bar(x=gc_content_bins, y=gc_hist, marker_color='#36a5c7')) + + # Update the layout + fig.update_xaxes(ticks="outside", dtick=10, title_text='GC Content (%)', title_standoff=0) + fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0) + fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Set font size + + return fig.to_html(full_html=False, default_height=500, default_width=700) + # Save the 
'Base quality' plot image. def base_quality(data, font_size): xd = np.arange(MAX_BASE_QUALITY) @@ -479,10 +465,17 @@ def plot(output_data, para_dict, file_type): plot_filepaths['read_length_bar']['dynamic'] = plot_read_length_stats(output_data, file_type) + # GC content histogram + if file_type != 'FAST5s' and file_type != 'SeqTxt': + if file_type == 'BAM': + plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.mapped_long_read_info, font_size) + elif file_type == 'SeqTxt': + plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size) + else: + plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.long_read_info, font_size) + + # Quality plots if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt': - # if file_type == 'SeqTxt': - # seq_quality_info = output_data.all_long_read_info.seq_quality_info - # else: seq_quality_info = output_data.seq_quality_info # Base quality histogram diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp index 9cd7e7f..2eb868a 100644 --- a/src/seqtxt_module.cpp +++ b/src/seqtxt_module.cpp @@ -4,12 +4,16 @@ Class for calling FAST5 statistics modules. */ +#include "seqtxt_module.h" + +/// @cond #include <iostream> #include <sstream> #include <string> #include <algorithm> +/// @endcond -#include "seqtxt_module.h" +#include "utils.h" size_t SeqTxt_Module::batch_size_of_record=3000; @@ -36,10 +40,11 @@ std::map<std::string, int> SeqTxt_Thread_data::getHeaderColumns() return _header_columns; } -size_t SeqTxt_Thread_data::read_ss_record(std::ifstream* file_stream, std::map<std::string, int> header_columns){ +size_t SeqTxt_Thread_data::read_ss_record(std::ifstream& file_stream, std::map<std::string, int> header_columns){ //std::cout << "Type 1." 
<< std::endl; thread_index = 0; // Index where this thread's data will be stored - while( std::getline( *file_stream, current_line )) { + while( std::getline( file_stream, current_line ) ) + { std::istringstream column_stream( current_line ); // Read each column value from the record line @@ -93,7 +98,7 @@ SeqTxt_Module::SeqTxt_Module(Input_Para& input_parameters){ has_error = 0; file_index = 0; - input_file_stream = NULL; + // input_file_stream = NULL; if (file_index >= _input_parameters.num_input_files){ std::cerr << "Input file list error." << std::endl; has_error |= 1; @@ -102,8 +107,9 @@ SeqTxt_Module::SeqTxt_Module(Input_Para& input_parameters){ // Open the first file in the list const char * first_filepath = _input_parameters.input_files[file_index].c_str(); - input_file_stream = new std::ifstream(first_filepath); - if (!(input_file_stream->is_open())){ + // input_file_stream = new std::ifstream(first_filepath); + input_file_stream.open(first_filepath); + if (!(input_file_stream.is_open())){ std::cerr << "Cannot open sequencing_summary.txt file="<< first_filepath <<std::endl; has_error |= 2; }else{ @@ -112,7 +118,7 @@ SeqTxt_Module::SeqTxt_Module(Input_Para& input_parameters){ // Ensure that we have the columns we need for statistics std::string column_line; - std::getline( *input_file_stream, column_line ); + std::getline( input_file_stream, column_line ); if (requiredHeadersFound(column_line)) { // // Print the column names @@ -172,129 +178,127 @@ bool SeqTxt_Module::requiredHeadersFound(std::string header_string) { } SeqTxt_Module::~SeqTxt_Module(){ - if (input_file_stream!=NULL){ - delete input_file_stream; - } - input_file_stream = NULL; + if (input_file_stream.is_open()){ + input_file_stream.close(); + } } int SeqTxt_Module::generateStatistics( Output_SeqTxt& t_output_SeqTxt_info){ auto relapse_start_time = std::chrono::high_resolution_clock::now(); - t_output_SeqTxt_info.all_long_read_info.long_read_info.resize(); - 
t_output_SeqTxt_info.passed_long_read_info.long_read_info.resize(); - t_output_SeqTxt_info.failed_long_read_info.long_read_info.resize(); - - if (has_error==0){ - m_threads.reserve(_input_parameters.threads+3); - - int _i_t=0; - SeqTxt_Thread_data** thread_data_vector = new SeqTxt_Thread_data*[_input_parameters.threads]; - try{ - for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){ - // std::cout<<"INFO: generate threads "<<_i_t<<std::endl<<std::flush; - thread_data_vector[_i_t] = new SeqTxt_Thread_data(_input_parameters, _header_columns, _i_t, SeqTxt_Module::batch_size_of_record); - // std::cout<<"INFO: Thread = "<< _i_t+1 <<std::endl<<std::flush; - m_threads.push_back(std::thread((SeqTxt_Module::SeqTxt_do_thread), input_file_stream, std::ref(_input_parameters), _i_t, std::ref(*(thread_data_vector[_i_t])), std::ref(t_output_SeqTxt_info) )); - } - - // std::cout<<"INFO: join threads"<<std::endl<<std::flush; - std::cout << "Joining " << _input_parameters.threads << " threads..." << std::endl; - for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){ - // std::cout<<"INFO: join threads "<<_i_t<<std::endl<<std::flush; - m_threads[_i_t].join(); - } - std::cout << "All threads joined." << std::endl; - - }catch(const std::runtime_error& re){ - std::cerr << "Runtime error: " << re.what() << std::endl; - }catch(const std::exception& ex){ - std::cerr << "Error occurred: " << ex.what() << std::endl; - }catch(...){ - std::cerr << "Unknown failure occurred. 
Possible memory corruption" << std::endl; - } - - for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){ - delete thread_data_vector[_i_t]; - } - delete [] thread_data_vector; - } + t_output_SeqTxt_info.all_long_read_info.long_read_info.resize(); + t_output_SeqTxt_info.passed_long_read_info.long_read_info.resize(); + t_output_SeqTxt_info.failed_long_read_info.long_read_info.resize(); + printMemoryUsage("Before generating statistics"); - t_output_SeqTxt_info.global_sum(); + if (has_error==0) { + m_threads.reserve(_input_parameters.threads+3); + + int _i_t=0; + printMessage("Generating statistics..."); + try { + for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){ + m_threads.push_back(std::thread((SeqTxt_Module::SeqTxt_do_thread), std::ref(input_file_stream), std::ref(_input_parameters), _i_t, std::ref(t_output_SeqTxt_info), _header_columns, SeqTxt_Module::batch_size_of_record )); + } + printMessage("Joining " + std::to_string(_input_parameters.threads) + " threads..."); + for (_i_t=0; _i_t<_input_parameters.threads; _i_t++){ + m_threads[_i_t].join(); + } + + }catch(const std::runtime_error& re){ + printError("Runtime error: " + std::string(re.what())); + }catch(const std::exception& ex){ + std::cerr << "Error occurred: " << ex.what() << std::endl; + printError("Error: " + std::string(ex.what())); + }catch(...){ + printError("Unknown error occurred in thread " + std::to_string(_i_t)); + } + } + t_output_SeqTxt_info.global_sum(); + printMemoryUsage("After generating statistics"); - auto relapse_end_time = std::chrono::high_resolution_clock::now(); - std::cout<<"Elapsed time (seconds): "<< std::chrono::duration_cast<std::chrono::seconds>(relapse_end_time - relapse_start_time).count() << std::endl; + auto relapse_end_time = std::chrono::high_resolution_clock::now(); + std::cout<<"Elapsed time (seconds): "<< std::chrono::duration_cast<std::chrono::seconds>(relapse_end_time - relapse_start_time).count() << std::endl; - std::cout<<"sequencing_summary.txt QC "<< 
(has_error==0?"generated":"failed") << std::endl; + std::cout<<"sequencing_summary.txt QC "<< (has_error==0?"generated":"failed") << std::endl; - return has_error; + return has_error; } -void SeqTxt_Module::SeqTxt_do_thread(std::ifstream* file_stream, Input_Para& ref_input_op, int thread_id, SeqTxt_Thread_data& ref_thread_data, Output_SeqTxt& ref_output ){ +void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref_input_op, int thread_id, Output_SeqTxt& ref_output, std::map<std::string, int> header_columns, size_t batch_size_of_record){ size_t read_ss_size, read_ss_i; - while (true){ - myMutex_readSeqTxt.lock(); - std::map<std::string, int> header_column_data = ref_thread_data.getHeaderColumns(); - read_ss_size = ref_thread_data.read_ss_record(file_stream, header_column_data); - - if (read_ss_size == 0 && !(file_index < ref_input_op.num_input_files) ){ - myMutex_readSeqTxt.unlock(); - break; - } - if ( read_ss_size < batch_size_of_record ){ - if ( file_index < ref_input_op.num_input_files ){ - std::cout<< "INFO: Open sequencing_summary.txt file="<< ref_input_op.input_files[file_index] <<std::endl; - file_stream->close(); - file_stream->clear(); - - file_stream->open( ref_input_op.input_files[file_index].c_str() ); - std::string firstline; - std::getline( *file_stream, firstline ); - file_index++; + int total_read_count = 0; + while (true) { + SeqTxt_Thread_data ref_thread_data(ref_input_op, header_columns, thread_id, batch_size_of_record); + { + std::lock_guard<std::mutex> lock(myMutex_readSeqTxt); + std::map<std::string, int> header_column_data = ref_thread_data.getHeaderColumns(); + read_ss_size = ref_thread_data.read_ss_record(file_stream, header_column_data); + + if (read_ss_size == 0 && !(file_index < ref_input_op.num_input_files) ){ + break; } + + if ( read_ss_size < batch_size_of_record ){ + if ( file_index < ref_input_op.num_input_files ){ + // std::cout<< "INFO: Open sequencing_summary.txt file="<< 
ref_input_op.input_files[file_index] <<std::endl; + file_stream.close(); + file_stream.clear(); + + file_stream.open( ref_input_op.input_files[file_index].c_str() ); + std::string firstline; + std::getline( file_stream, firstline ); + file_index++; + } + } + } + if (read_ss_size == 0 ) { + printMessage("No records read."); + continue; + } else { + total_read_count += read_ss_size; + printMessage("Thread " + std::to_string(thread_id) + " read " + std::to_string(read_ss_size) + " records (total " + std::to_string(total_read_count) + ")"); } - myMutex_readSeqTxt.unlock(); - if (read_ss_size == 0 ) { continue; } // Columns used for statistics: passes_filtering, sequence_length_template, mean_qscore_template - //ref_thread_data.t_output_SeqTxt_.reset(); ref_thread_data.t_output_SeqTxt_.all_long_read_info.long_read_info.resize(); ref_thread_data.t_output_SeqTxt_.passed_long_read_info.long_read_info.resize(); ref_thread_data.t_output_SeqTxt_.failed_long_read_info.long_read_info.resize(); for(read_ss_i=0; read_ss_i<read_ss_size; read_ss_i++){ - Basic_SeqTxt_Statistics* seqtxt_statistics = NULL; - bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering; - if ( passes_filtering_value == true) { - seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.passed_long_read_info); - } else { - seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.failed_long_read_info); - } - seqtxt_statistics->long_read_info.total_num_reads++; - size_t sequence_base_count = ref_thread_data.stored_records[read_ss_i].sequence_length_template; - seqtxt_statistics->long_read_info.total_num_bases += sequence_base_count; + // Basic_SeqTxt_Statistics* seqtxt_statistics = NULL; + // bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering; + // if ( passes_filtering_value == true) { + // seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.passed_long_read_info); + // } else { + // seqtxt_statistics = 
&(ref_thread_data.t_output_SeqTxt_.failed_long_read_info); + // } + bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering; + Basic_SeqTxt_Statistics& seqtxt_statistics = (passes_filtering_value == true) ? ref_thread_data.t_output_SeqTxt_.passed_long_read_info : ref_thread_data.t_output_SeqTxt_.failed_long_read_info; + + seqtxt_statistics.long_read_info.total_num_reads++; + size_t sequence_base_count = ref_thread_data.stored_records[read_ss_i].sequence_length_template; + seqtxt_statistics.long_read_info.total_num_bases += sequence_base_count; // Store the read length - seqtxt_statistics->long_read_info.read_lengths.push_back(sequence_base_count); + seqtxt_statistics.long_read_info.read_lengths.push_back(sequence_base_count); // Update the longest read length int64_t current_read_length = (int64_t) ref_thread_data.stored_records[read_ss_i].sequence_length_template; - if ( seqtxt_statistics->long_read_info.longest_read_length < current_read_length){ - seqtxt_statistics->long_read_info.longest_read_length = current_read_length; - } - seqtxt_statistics->long_read_info.read_length_count[ ref_thread_data.stored_records[read_ss_i].sequence_length_template<MAX_READ_LENGTH?ref_thread_data.stored_records[read_ss_i].sequence_length_template:(MAX_READ_LENGTH-1) ] += 1; - - seqtxt_statistics->seq_quality_info.read_quality_distribution[ int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ] += 1; - if ( seqtxt_statistics->seq_quality_info.min_read_quality == MoneDefault || - seqtxt_statistics->seq_quality_info.min_read_quality>int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ){ - seqtxt_statistics->seq_quality_info.min_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ); - } - if ( seqtxt_statistics->seq_quality_info.max_read_quality < int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template) ){ - seqtxt_statistics->seq_quality_info.max_read_quality = int( 
ref_thread_data.stored_records[read_ss_i].mean_qscore_template); - } - } + if ( seqtxt_statistics.long_read_info.longest_read_length < current_read_length){ + seqtxt_statistics.long_read_info.longest_read_length = current_read_length; + } + seqtxt_statistics.long_read_info.read_length_count[ ref_thread_data.stored_records[read_ss_i].sequence_length_template<MAX_READ_LENGTH?ref_thread_data.stored_records[read_ss_i].sequence_length_template:(MAX_READ_LENGTH-1) ] += 1; - myMutex_output.lock(); + seqtxt_statistics.seq_quality_info.read_quality_distribution[ int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ] += 1; + if ( seqtxt_statistics.seq_quality_info.min_read_quality == MoneDefault || + seqtxt_statistics.seq_quality_info.min_read_quality>int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ) ){ + seqtxt_statistics.seq_quality_info.min_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template ); + } + if ( seqtxt_statistics.seq_quality_info.max_read_quality < int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template) ){ + seqtxt_statistics.seq_quality_info.max_read_quality = int( ref_thread_data.stored_records[read_ss_i].mean_qscore_template); + } + } + std::lock_guard<std::mutex> lock(myMutex_output); ref_output.add( ref_thread_data.t_output_SeqTxt_ ); - myMutex_output.unlock(); } } diff --git a/src/utils.cpp b/src/utils.cpp index 4d27130..c2b1dc2 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -1,10 +1,12 @@ #include "utils.h" /// @cond -#include <stdio.h> -#include <string> #include <iostream> +#include <iomanip> +#include <string> #include <mutex> +#include <stdio.h> +#include <sys/resource.h> // getrusage /// @endcond @@ -24,3 +26,14 @@ void printError(std::string message) std::lock_guard<std::mutex> lock(print_mtx); std::cerr << message << std::endl; } + +void printMemoryUsage(const std::string& functionName) { + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); + + // Convert from 
KB to GB + double mem_usage_gb = (double)usage.ru_maxrss / 1024.0 / 1024.0; + std::lock_guard<std::mutex> lock(print_mtx); + std::cout << functionName << " memory usage: " + << std::fixed << std::setprecision(2) << mem_usage_gb << " GB" << std::endl; +} From 3a24515a4eaa05ea5894c3c5acd9985c52c798b1 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Mon, 23 Dec 2024 16:11:27 -0500 Subject: [PATCH 02/25] remove comments --- src/seqtxt_module.cpp | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/seqtxt_module.cpp b/src/seqtxt_module.cpp index 2eb868a..62d9929 100644 --- a/src/seqtxt_module.cpp +++ b/src/seqtxt_module.cpp @@ -241,7 +241,6 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref if ( read_ss_size < batch_size_of_record ){ if ( file_index < ref_input_op.num_input_files ){ - // std::cout<< "INFO: Open sequencing_summary.txt file="<< ref_input_op.input_files[file_index] <<std::endl; file_stream.close(); file_stream.clear(); @@ -253,11 +252,10 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref } } if (read_ss_size == 0 ) { - printMessage("No records read."); continue; } else { total_read_count += read_ss_size; - printMessage("Thread " + std::to_string(thread_id) + " read " + std::to_string(read_ss_size) + " records (total " + std::to_string(total_read_count) + ")"); + printMessage("Thread " + std::to_string(thread_id+1) + " read " + std::to_string(read_ss_size) + " records (total " + std::to_string(total_read_count) + ")"); } // Columns used for statistics: passes_filtering, sequence_length_template, mean_qscore_template @@ -265,13 +263,6 @@ void SeqTxt_Module::SeqTxt_do_thread(std::ifstream& file_stream, Input_Para& ref ref_thread_data.t_output_SeqTxt_.passed_long_read_info.long_read_info.resize(); ref_thread_data.t_output_SeqTxt_.failed_long_read_info.long_read_info.resize(); for(read_ss_i=0; read_ss_i<read_ss_size; read_ss_i++){ - // 
Basic_SeqTxt_Statistics* seqtxt_statistics = NULL; - // bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering; - // if ( passes_filtering_value == true) { - // seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.passed_long_read_info); - // } else { - // seqtxt_statistics = &(ref_thread_data.t_output_SeqTxt_.failed_long_read_info); - // } bool passes_filtering_value = ref_thread_data.stored_records[read_ss_i].passes_filtering; Basic_SeqTxt_Statistics& seqtxt_statistics = (passes_filtering_value == true) ? ref_thread_data.t_output_SeqTxt_.passed_long_read_info : ref_thread_data.t_output_SeqTxt_.failed_long_read_info; From bb4682ddb5f029f2e2f5dc0d5522c68036ed16bc Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Mon, 30 Dec 2024 16:09:53 -0500 Subject: [PATCH 03/25] Add per-read gc content distribution plot --- Makefile | 6 ++--- include/hts_reader.h | 4 ++-- lib/__init__.py | 0 src/bam_module.cpp | 3 +-- src/fasta_module.cpp | 16 +++++++++----- src/fastq_module.cpp | 52 ++++++++++++++++++++++++++++++-------------- src/hts_reader.cpp | 31 +++++++++++++------------- src/output_data.cpp | 44 +++++++++++++++++++++++++------------ src/plot_utils.py | 28 +++++++++++++++++++----- 9 files changed, 120 insertions(+), 64 deletions(-) delete mode 100644 lib/__init__.py diff --git a/Makefile b/Makefile index 855290e..5b6392f 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,6 @@ SRC_DIR := $(CURDIR)/src LIB_DIR := $(CURDIR)/lib # Set the library paths for the compiler -#LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib -#INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX) LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include @@ -21,4 +19,6 @@ compile: LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \ CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 
setup.py build_ext --build-lib $(LIB_DIR) -# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \ +# Clean the build directory +clean: + $(RM) -r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/ diff --git a/include/hts_reader.h b/include/hts_reader.h index 8790e88..79332c6 100644 --- a/include/hts_reader.h +++ b/include/hts_reader.h @@ -38,7 +38,7 @@ class HTSReader { bool reading_complete = false; // Update read and base counts - int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics* basic_qc, uint64_t *base_quality_distribution); + int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution); // Read the next batch of records from the BAM file int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold); @@ -49,7 +49,7 @@ class HTSReader { // Return the number of records in the BAM file using the BAM index int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold); - std::map<int, int> getQueryToRefMap(bam1_t *record); + std::map<int, int> getQueryToRefMap(bam1_t* record); // Add a modification to the base modification map void addModificationToQueryMap(std::map<int32_t, std::tuple<char, char, double, int>> &base_modifications, int32_t pos, char mod_type, char canonical_base, double likelihood, int strand); diff --git a/lib/__init__.py b/lib/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/bam_module.cpp b/src/bam_module.cpp index f3a0573..2d47cb4 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -243,9 +243,8 @@ void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unorder reader.readNextRecords(batch_size, record_output, bam_mutex, read_ids, base_mod_threshold); // Update the final output - output_mutex.lock(); + std::lock_guard<std::mutex> 
lock(output_mutex); final_output.add(record_output); - output_mutex.unlock(); } std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_file, bool accepted_reads) diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp index 666d369..93f3e90 100644 --- a/src/fasta_module.cpp +++ b/src/fasta_module.cpp @@ -6,6 +6,7 @@ FASTA_module.cpp: #include <stdlib.h> // #include <zlib.h> #include <ctype.h> +#include <cmath> // std::round #include <sys/types.h> #include <sys/stat.h> @@ -92,8 +93,11 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_ long_read_info.total_num_bases += base_count; long_read_info.total_n_cnt += n_count; - read_gc_cnt = 100.0 * gc_count / (double)base_count; - long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; + + // Update the per-read GC content distribution + double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count); + int gc_content_int = static_cast<int>(std::round(gc_content_pct)); + long_read_info.read_gc_content_count[gc_content_int] += 1; // Remove the newline character from the sequence data size_t pos = sequence_data_str.find_first_of("\r\n"); @@ -168,10 +172,10 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_ long_read_info.read_length_count[(int)base_count] += 1; } - long_read_info.total_num_bases += base_count; - long_read_info.total_n_cnt += n_count; - read_gc_cnt = 100.0 * gc_count / (double)base_count; - long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; + // Update the per-read GC content distribution + double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count); + int gc_content_int = static_cast<int>(std::round(gc_content_pct)); + long_read_info.read_gc_content_count[gc_content_int] += 1; // Remove the newline character from the sequence data size_t pos = sequence_data_str.find_first_of("\r\n"); diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp index 
c45a79d..e16dadd 100644 --- a/src/fastq_module.cpp +++ b/src/fastq_module.cpp @@ -1,15 +1,19 @@ +#include "fastq_module.h" + +#include <ctype.h> #include <stdio.h> #include <stdlib.h> -#include <ctype.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <iostream> +#include <algorithm> // std::sort +#include <cmath> // std::round + #include <fstream> -#include <algorithm> +#include <iostream> -#include "fastq_module.h" +#include <sys/stat.h> +#include <sys/types.h> +#include "utils.h" int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &output_data, FILE *read_details_fp) { @@ -83,14 +87,33 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out long_read_info.total_tu_cnt += 1; } base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset; - seq_quality_info.base_quality_distribution[base_quality_value] += 1; + try { + seq_quality_info.base_quality_distribution[base_quality_value] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value"); + } read_mean_base_qual += (double) base_quality_value; } - read_gc_cnt = 100.0 * read_gc_cnt / (double)read_len; - long_read_info.read_gc_content_count[(int)(read_gc_cnt + 0.5)] += 1; - read_mean_base_qual /= (double) read_len; - seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1; - fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, read_gc_cnt, read_mean_base_qual); + + // Update the per-read GC content distribution + double gc_content_pct = (100.0 * read_gc_cnt) / static_cast<double>(read_len); + int gc_content_int = static_cast<int>(std::round(gc_content_pct)); + try { + long_read_info.read_gc_content_count[gc_content_int] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Invalid GC content value " + std::to_string(gc_content_int)); + } + + // Update the 
per-read base quality distribution + double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len); + unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct)); + try { + seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); + } + + fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual); // Write to file } } input_file_stream.close(); @@ -140,10 +163,7 @@ int qc_fastq_files(Input_Para &_input_data, Output_FQ &output_data) output_data.long_read_info.NXX_read_length.resize(101, 0); // NXX_read_length[50] means N50 read length; NXX_read_length[95] means N95 read length; - //output_data.seq_quality_info.base_quality_distribution.resize(256, 0); - // base_quality_distribution[x] means number of bases that quality = x. 
- - output_data.seq_quality_info.read_average_base_quality_distribution.resize(256, 0); + output_data.seq_quality_info.read_average_base_quality_distribution.resize(MAX_BASE_QUALITY, 0); if (_input_data.user_defined_fastq_base_qual_offset > 0) { fastq_base_qual_offset = _input_data.user_defined_fastq_base_qual_offset; diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 31e9ed9..6b93974 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -35,16 +35,16 @@ HTSReader::~HTSReader(){ } // Update read and base counts -int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics *basic_qc, uint64_t *base_quality_distribution){ +int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution) { int exit_code = 0; // Update the total number of reads - basic_qc->total_num_reads++; + basic_qc.total_num_reads++; // Update read length statistics int read_length = (int) record->core.l_qseq; - basic_qc->total_num_bases += (uint64_t) read_length; // Update the total number of bases - basic_qc->read_lengths.push_back(read_length); + basic_qc.total_num_bases += (uint64_t) read_length; // Update the total number of bases + basic_qc.read_lengths.push_back(read_length); // Loop and count the number of each base uint8_t *seq = bam_get_seq(record); @@ -57,19 +57,19 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics *bas char base = seq_nt16_str[bam_seqi(seq, i)]; switch (base) { case 'A': - basic_qc->total_a_cnt++; + basic_qc.total_a_cnt++; break; case 'C': - basic_qc->total_c_cnt++; + basic_qc.total_c_cnt++; break; case 'G': - basic_qc->total_g_cnt++; + basic_qc.total_g_cnt++; break; case 'T': - basic_qc->total_tu_cnt++; + basic_qc.total_tu_cnt++; break; case 'N': - basic_qc->total_n_cnt++; + basic_qc.total_n_cnt++; std::cerr << "Warning: N base found in read " << bam_get_qname(record) << std::endl; break; default: @@ -195,14 +195,17 @@ int HTSReader::readNextRecords(int 
batch_size, Output_BAM & output_data, std::mu // Determine if this is an unmapped read if (record->core.flag & BAM_FUNMAP) { - Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info; + Basic_Seq_Statistics& basic_qc = output_data.unmapped_long_read_info; + // Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info; // Update read and base QC this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution); } else { // Set up the basic QC object - Basic_Seq_Statistics *basic_qc = &output_data.mapped_long_read_info; + // Basic_Seq_Statistics *basic_qc = + // &output_data.mapped_long_read_info; + Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info; // Calculate base alignment statistics on non-secondary alignments if (!(record->core.flag & BAM_FSECONDARY)) { @@ -323,10 +326,8 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution); // Calculate the percent GC content - int percent_gc = round((basic_qc->total_g_cnt + basic_qc->total_c_cnt) / (double) (basic_qc->total_a_cnt + basic_qc->total_c_cnt + basic_qc->total_g_cnt + basic_qc->total_tu_cnt) * 100); - - // Update the GC content histogram - basic_qc->read_gc_content_count.push_back(percent_gc); + int percent_gc = round((basic_qc.total_g_cnt + basic_qc.total_c_cnt) / (double) (basic_qc.total_a_cnt + basic_qc.total_c_cnt + basic_qc.total_g_cnt + basic_qc.total_tu_cnt) * 100); + basic_qc.read_gc_content_count[percent_gc]++; // Update the GC content distribution } else { std::cerr << "Error: Unknown alignment type" << std::endl; diff --git a/src/output_data.cpp b/src/output_data.cpp index cb89804..c2e58d0 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -3,6 +3,7 @@ #include <math.h> // sqrt #include <iostream> #include <sstream> +#include <cmath> // std::round #include "output_data.h" #include "utils.h" @@ -84,9 +85,9 @@ void 
Basic_Seq_Statistics::add(Basic_Seq_Statistics& basic_qc){ this->read_lengths.insert(this->read_lengths.end(), basic_qc.read_lengths.begin(), basic_qc.read_lengths.end()); } - // Add GC content if not empty - if (!basic_qc.read_gc_content_count.empty()) { - this->read_gc_content_count.insert(this->read_gc_content_count.end(), basic_qc.read_gc_content_count.begin(), basic_qc.read_gc_content_count.end()); + // Update the per-read GC content distribution + for (int i = 0; i < 101; i++) { + this->read_gc_content_count[i] += basic_qc.read_gc_content_count[i]; } } @@ -190,7 +191,6 @@ Basic_Seq_Quality_Statistics::Basic_Seq_Quality_Statistics(){ pos_quality_distribution.resize(MAX_READ_LENGTH, ZeroDefault); pos_quality_distribution_dev.resize(MAX_READ_LENGTH, ZeroDefault); pos_quality_distribution_count.resize(MAX_READ_LENGTH, ZeroDefault); - read_average_base_quality_distribution.resize(MAX_READ_QUALITY, ZeroDefault); read_quality_distribution.resize(MAX_READ_QUALITY, ZeroDefault); } @@ -544,7 +544,6 @@ void Output_FAST5::addReadBaseSignals(Base_Signals values){ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_fp) { const char * read_name; - double gc_content_pct; // Access the read name std::string header_str = fq[0]; @@ -601,21 +600,38 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ } // Get the base quality base_quality_value = (uint64_t)base_quality_values[i]; - seq_quality_info.base_quality_distribution[base_quality_value] += 1; + try { + seq_quality_info.base_quality_distribution[base_quality_value] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value"); + } read_mean_base_qual += (double)base_quality_value; } // Calculate percent guanine & cytosine - gc_content_pct = 100.0 *( (double)gc_count / (double)base_count ); + // gc_content_pct = 100.0 *( (double)gc_count / (double)base_count ); + + // 
Update the per-read GC content distribution + double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count); + int gc_content_int = static_cast<int>(std::round(gc_content_pct)); + try { + long_read_info.read_gc_content_count[gc_content_int] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Invalid GC content value " + std::to_string(gc_content_int)); + } + + // Update the per-read base quality distribution + double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count); + unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct)); + try { + seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); + } - // Look into this section - long_read_info.read_gc_content_count[(int)(gc_content_pct + 0.5)] += 1; - read_mean_base_qual /= (double) base_count; - seq_quality_info.read_average_base_quality_distribution[(uint)(read_mean_base_qual + 0.5)] += 1; - fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual); + fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name, base_count, gc_content_pct, read_mean_base_qual); // Write to file - // Update the total number of reads - long_read_info.total_num_reads += 1; + long_read_info.total_num_reads += 1; // Update read count } // Get the read count diff --git a/src/plot_utils.py b/src/plot_utils.py index ac03046..dd911ae 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -339,17 +339,33 @@ def read_lengths_histogram(data, font_size): def read_gc_content_histogram(data, font_size): """Plot the per-read GC content histogram.""" + bin_size = 1 - # Get the GC content data + # Bin the GC content if the bin size is greater than 1 gc_content = np.array(data.read_gc_content_count) - - # 
Create a histogram of the GC content (0-100% with 1% bins) - gc_content_bins = np.linspace(0, 100, 101) - gc_hist, _ = np.histogram(gc_content, bins=gc_content_bins) + if bin_size > 1: + gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)]) + + gc_content_bins = [i for i in range(0, 101, bin_size)] + + # Generate hover text for each bin + hover_text = [] + if bin_size > 1: + for i in range(len(gc_content_bins)): + hover_text.append('GC content: {}-{}%<br>Counts: {}'.format(gc_content_bins[i], gc_content_bins[i] + bin_size, gc_content[i])) + else: + for i in range(len(gc_content_bins)): + hover_text.append('GC content: {}%<br>Counts: {}'.format(gc_content_bins[i], gc_content[i])) + + # Set the X values to be the center of the bins + if bin_size > 1: + x_values = [gc_content_bins[i] + bin_size / 2 for i in range(len(gc_content_bins))] + else: + x_values = gc_content_bins # Create the figure fig = go.Figure() - fig.add_trace(go.Bar(x=gc_content_bins, y=gc_hist, marker_color='#36a5c7')) + fig.add_trace(go.Bar(x=x_values, y=gc_content, marker_color='#36a5c7', hovertext=hover_text, hoverinfo='text')) # Update the layout fig.update_xaxes(ticks="outside", dtick=10, title_text='GC Content (%)', title_standoff=0) From 9dce02531b952a2127264ca3e77f3e7b7d1b11f2 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Thu, 2 Jan 2025 16:58:42 -0500 Subject: [PATCH 04/25] Fix per-read gc content error --- include/hts_reader.h | 2 +- src/hts_reader.cpp | 61 ++++++++++++++++++++++---------------------- src/plot_utils.py | 6 +++++ 3 files changed, 38 insertions(+), 31 deletions(-) diff --git a/include/hts_reader.h b/include/hts_reader.h index 79332c6..00900b2 100644 --- a/include/hts_reader.h +++ b/include/hts_reader.h @@ -38,7 +38,7 @@ class HTSReader { bool reading_complete = false; // Update read and base counts - int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution); 
+ int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution, bool is_primary); // Read the next batch of records from the BAM file int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold); diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 6b93974..0cd1a10 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -35,18 +35,17 @@ HTSReader::~HTSReader(){ } // Update read and base counts -int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution) { - int exit_code = 0; - - // Update the total number of reads - basic_qc.total_num_reads++; +int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution, bool is_primary) { - // Update read length statistics + // Update read QC + basic_qc.total_num_reads++; // Update the total number of reads int read_length = (int) record->core.l_qseq; basic_qc.total_num_bases += (uint64_t) read_length; // Update the total number of bases basic_qc.read_lengths.push_back(read_length); - // Loop and count the number of each base + // Get base counts, quality, and GC content + double read_gc_count = 0.0; // For GC content calculation + double read_base_total = 0.0; // For GC content calculation uint8_t *seq = bam_get_seq(record); for (int i = 0; i < read_length; i++) { // Get the base quality and update the base quality histogram @@ -58,28 +57,42 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& bas switch (base) { case 'A': basic_qc.total_a_cnt++; + read_base_total++; break; case 'C': basic_qc.total_c_cnt++; + read_gc_count++; + read_base_total++; break; case 'G': basic_qc.total_g_cnt++; + read_gc_count++; + read_base_total++; break; case 'T': basic_qc.total_tu_cnt++; + read_base_total++; break; case 'N': 
basic_qc.total_n_cnt++; std::cerr << "Warning: N base found in read " << bam_get_qname(record) << std::endl; break; default: - std::cerr << "Error reading nucleotide: " << base << std::endl; - exit_code = 1; + printError("Invalid base: " + std::to_string(base)); break; } } - return exit_code; + // Calculate the read GC content percentage if a primary alignment + if (is_primary) { + double gc_content = read_gc_count / read_base_total; + int gc_content_percent = (int) round(gc_content * 100); + std::string query_name = bam_get_qname(record); + // printMessage("Read name: " + query_name + ", GC content: " + std::to_string(gc_content) + ", GC count: " + std::to_string(read_gc_count) + ", Total count: " + std::to_string(read_base_total)); + basic_qc.read_gc_content_count[gc_content_percent]++; + } + + return 0; } // Read the next batch of records from the BAM file and store QC in the output_data object @@ -193,21 +206,14 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu output_data.addReadMoveTable(query_name, seq_str, signal_index_vector, ts, ns); } - // Determine if this is an unmapped read + // Unmapped reads if (record->core.flag & BAM_FUNMAP) { Basic_Seq_Statistics& basic_qc = output_data.unmapped_long_read_info; - // Basic_Seq_Statistics *basic_qc = &output_data.unmapped_long_read_info; - - // Update read and base QC - this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution); + this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, false); } else { - // Set up the basic QC object - // Basic_Seq_Statistics *basic_qc = - // &output_data.mapped_long_read_info; - Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info; - // Calculate base alignment statistics on non-secondary alignments + Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info; if (!(record->core.flag & BAM_FSECONDARY)) { // Determine if this is a forward or reverse read @@ -261,7 +267,7 @@ int 
HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu output_data.num_mismatched_bases += num_mismatches; } - // Determine if this is a secondary alignment (not included in QC, only read count) + // Secondary alignment (not included in QC, only read count) if (record->core.flag & BAM_FSECONDARY) { output_data.num_secondary_alignment++; @@ -271,7 +277,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // Update the read's secondary alignments (count once per read) output_data.reads_with_secondary[query_name] = true; - // Determine if this is a supplementary alignment (not included in QC, only read count) + // Supplementary alignment (not included in QC, only read count) } else if (record->core.flag & BAM_FSUPPLEMENTARY) { output_data.num_supplementary_alignment++; @@ -281,7 +287,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // Update the read's supplementary alignments (count once per read) output_data.reads_with_supplementary[query_name] = true; - // Determine if this is a primary alignment + // Primary alignment } else if (!(record->core.flag & BAM_FSECONDARY || record->core.flag & BAM_FSUPPLEMENTARY)) { output_data.num_primary_alignment++; // Update the number of primary alignments @@ -323,15 +329,10 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu } // Update read and base QC - this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution); - - // Calculate the percent GC content - int percent_gc = round((basic_qc.total_g_cnt + basic_qc.total_c_cnt) / (double) (basic_qc.total_a_cnt + basic_qc.total_c_cnt + basic_qc.total_g_cnt + basic_qc.total_tu_cnt) * 100); - basic_qc.read_gc_content_count[percent_gc]++; // Update the GC content distribution + this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, true); } else { - std::cerr << "Error: Unknown alignment type" << std::endl; - std::cerr << "Flag: " 
<< record->core.flag << std::endl; + printError("Error: Unknown alignment type with flag " + std::to_string(record->core.flag)); } } diff --git a/src/plot_utils.py b/src/plot_utils.py index dd911ae..8adb1ee 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -346,6 +346,12 @@ def read_gc_content_histogram(data, font_size): if bin_size > 1: gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)]) + # # Print the GC content if count > 0 + # logging.info("[HIST] GC content values:") + # for i in range(len(gc_content)): + # if gc_content[i] > 0: + # logging.info("{}-{}%: {}".format(i * bin_size, i * bin_size + bin_size, gc_content[i])) + gc_content_bins = [i for i in range(0, 101, bin_size)] # Generate hover text for each bin From 19136048e063e684733b0c5c084cea3828ceb2d6 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sat, 4 Jan 2025 13:57:30 -0500 Subject: [PATCH 05/25] Work on read length vs base mod rate --- include/output_data.h | 193 +++++++++++++++++++++++------------------- src/fasta_module.cpp | 2 + src/hts_reader.cpp | 61 ++++++++----- src/output_data.cpp | 24 ++++++ 4 files changed, 170 insertions(+), 110 deletions(-) diff --git a/include/output_data.h b/include/output_data.h index 53d30bf..0651341 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -14,6 +14,7 @@ Define the output structures for each module. 
#include "input_parameters.h" #include "tin_stats.h" +#include "utils.h" #define MAX_READ_LENGTH 10485760 #define MAX_BASE_QUALITY 100 @@ -114,7 +115,7 @@ class Output_FQ : public Output_FA // Define the base modification data structure (modification type, canonical // base, likelihood, strand: 0 for forward, 1 for reverse, and CpG flag: T/F) -using Base_Modification = std::tuple<char, char, double, int, bool>; +// using Base_Modification = std::tuple<char, char, double, int, bool>; // Define the signal-level data structure for POD5 (ts, ns, move table vector) using POD5_Signal_Data = std::tuple<int32_t, int32_t, std::vector<int32_t>>; @@ -159,92 +160,110 @@ class Base_Move_Table // BAM output class Output_BAM : public Output_FQ { -public: - uint64_t num_primary_alignment = ZeroDefault; // the number of primary alignment/ - uint64_t num_secondary_alignment = ZeroDefault; // the number of secondary alignment - uint64_t num_reads_with_secondary_alignment = ZeroDefault; // the number of long reads with the secondary alignment: one read might have multiple seconard alignment - uint64_t num_supplementary_alignment = ZeroDefault; // the number of supplementary alignment - uint64_t num_reads_with_supplementary_alignment = ZeroDefault; // the number of long reads with secondary alignment; - uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment. - uint64_t forward_alignment = ZeroDefault; // Total number of forward alignments - uint64_t reverse_alignment = ZeroDefault; // Total number of reverse alignments - std::map<std::string, bool> reads_with_supplementary; // Map of reads with supplementary alignments - std::map<std::string, bool> reads_with_secondary; // Map of reads with secondary alignments - - // Similar to Output_FA: below are for mapped. 
- uint64_t num_matched_bases = ZeroDefault; // the number of matched bases with = - uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X - uint64_t num_ins_bases = ZeroDefault; // the number of inserted bases; - uint64_t num_del_bases = ZeroDefault; // the number of deleted bases; - uint64_t num_clip_bases = ZeroDefault; // the number of soft-clipped bases; - - // The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators - int num_columns = ZeroDefault; // the number of columns - double percent_identity = ZeroDefault; // Percent identity = (num columns - NM) / num columns - std::vector<int> accuracy_per_read; - - // Preprint revisions: Remove all counts with unique positions in the - // reference genome, and only report raw counts - uint64_t modified_prediction_count = ZeroDefault; // Total number of modified base predictions - uint64_t sample_modified_base_count = ZeroDefault; // Total number of modified bases passing the threshold - uint64_t sample_modified_base_count_forward = ZeroDefault; // Total number of modified bases passing the threshold on the forward strand - uint64_t sample_modified_base_count_reverse = ZeroDefault; // Total number of modified bases passing the threshold on the reverse strand - uint64_t sample_cpg_forward_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the forward strand (non-unique) - uint64_t sample_cpg_reverse_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique) - std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions; // chr -> vector of (position, strand) for modified bases passing the threshold - - // Signal data section - int read_count = ZeroDefault; - int base_count = ZeroDefault; - std::unordered_map<std::string, Base_Move_Table> read_move_table; - - // POD5 signal-level information is 
stored in a map of read names to a map of - // reference positions to a tuple of (ts, ns, move table vector) - std::unordered_map<std::string, POD5_Signal_Data> pod5_signal_data; - - // Dictionary of bam filepath to TIN data - std::unordered_map<std::string, TINStats> tin_data; - - Basic_Seq_Statistics mapped_long_read_info; - Basic_Seq_Statistics unmapped_long_read_info; - - Basic_Seq_Quality_Statistics mapped_seq_quality_info; - Basic_Seq_Quality_Statistics unmapped_seq_quality_info; - - // POD5 signal data functions - int getReadCount(); - void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end); - std::vector<int> getReadMoveTable(std::string read_id); - std::string getReadSequence(std::string read_id); - int getReadSequenceStart(std::string read_id); - int getReadSequenceEnd(std::string read_id); - - // Add a batch of records to the output - void add(Output_BAM &t_output_bam); - - // Add TIN data for a single BAM file - void addTINData(std::string &bam_file, TINStats &tin_data); - - // Get the TIN mean for a single BAM file - double getTINMean(std::string bam_file); - - // Get the TIN median for a single BAM file - double getTINMedian(std::string bam_file); - - // Get the TIN standard deviation for a single BAM file - double getTINStdDev(std::string bam_file); - - // Get the TIN count for a single BAM file - int getTINCount(std::string bam_file); - - // Calculate QC across all records - void global_sum(); - - // Save the output to a summary text file - void save_summary(std::string &output_file, Input_Para ¶ms, Output_BAM &output_data); - - Output_BAM(); - ~Output_BAM(); + public: + uint64_t num_primary_alignment = ZeroDefault; // the number of primary alignment/ + uint64_t num_secondary_alignment = ZeroDefault; // the number of secondary alignment + uint64_t num_reads_with_secondary_alignment = ZeroDefault; // the number of long reads with the secondary alignment: one read might have multiple 
secondary alignment + uint64_t num_supplementary_alignment = ZeroDefault; // the number of supplementary alignment + uint64_t num_reads_with_supplementary_alignment = ZeroDefault; // the number of long reads with supplementary alignment; + uint64_t num_reads_with_both_secondary_supplementary_alignment = ZeroDefault; // the number of long reads with both secondary and supplementary alignment. + uint64_t forward_alignment = ZeroDefault; // Total number of forward alignments + uint64_t reverse_alignment = ZeroDefault; // Total number of reverse alignments + std::map<std::string, bool> reads_with_supplementary; // Map of reads with supplementary alignments + std::map<std::string, bool> reads_with_secondary; // Map of reads with secondary alignments + + // Similar to Output_FA: below are for mapped. + uint64_t num_matched_bases = ZeroDefault; // the number of matched bases with = + uint64_t num_mismatched_bases = ZeroDefault; // the number of mismatched bases X + uint64_t num_ins_bases = ZeroDefault; // the number of inserted bases; + uint64_t num_del_bases = ZeroDefault; // the number of deleted bases; + uint64_t num_clip_bases = ZeroDefault; // the number of soft-clipped bases; + + // The number of columns can be calculated by summing over the lengths of M/I/D CIGAR operators + int num_columns = ZeroDefault; // the number of columns + double percent_identity = ZeroDefault; // Percent identity = (num columns - NM) / num columns + std::vector<int> accuracy_per_read; + + // Preprint revisions: Remove all counts with unique positions in the + // reference genome, and only report raw counts + uint64_t modified_prediction_count = ZeroDefault; // Total number of modified base predictions + uint64_t sample_modified_base_count = ZeroDefault; // Total number of modified bases passing the threshold + uint64_t sample_modified_base_count_forward = ZeroDefault; // Total number of modified bases passing the threshold on the forward strand + uint64_t sample_modified_base_count_reverse =
ZeroDefault; // Total number of modified bases passing the threshold on the reverse strand + uint64_t sample_cpg_forward_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the forward strand (non-unique) + uint64_t sample_cpg_reverse_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique) + std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions; // chr -> vector of (position, strand) for modified bases passing the threshold + + // Further revisions + // Structures for storing read length vs. base modification rate data + struct ReadModData + { + int read_length; + double mod_rate; + std::unordered_map<char, double> base_mod_rates; // Type-specific base modification rates + }; + std::vector<ReadModData> read_mod_data; // Read length vs. base modification rate + + // std::pair<std::vector<int>, std::vector<double>> read_length_mod_rate; // Read length vs. base modification rate + // std::unordered_map<char, std::pair<std::vector<int>, std::vector<double>>> read_length_mod_rate; // Read length vs. 
base modification rate for each base modification type + std::unordered_map<char, uint64_t> base_mod_counts; // Counts for each base modification type exceeding the threshold + std::unordered_map<char, uint64_t> base_mod_counts_forward; // Counts for each base modification type exceeding the threshold on the forward strand + std::unordered_map<char, uint64_t> base_mod_counts_reverse; // Counts for each base modification type exceeding the threshold on the reverse strand + + // Signal data section + int read_count = ZeroDefault; + int base_count = ZeroDefault; + std::unordered_map<std::string, Base_Move_Table> read_move_table; + + // POD5 signal-level information is stored in a map of read names to a map of + // reference positions to a tuple of (ts, ns, move table vector) + std::unordered_map<std::string, POD5_Signal_Data> pod5_signal_data; + + std::unordered_map<std::string, TINStats> tin_data; // TIN data for each BAM file + + Basic_Seq_Statistics mapped_long_read_info; + Basic_Seq_Statistics unmapped_long_read_info; + + Basic_Seq_Quality_Statistics mapped_seq_quality_info; + Basic_Seq_Quality_Statistics unmapped_seq_quality_info; + + // POD5 signal data functions + int getReadCount(); + void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end); + std::vector<int> getReadMoveTable(std::string read_id); + std::string getReadSequence(std::string read_id); + int getReadSequenceStart(std::string read_id); + int getReadSequenceEnd(std::string read_id); + + void updateBaseModCounts(char mod_type, int strand); // Update base modification counts for predictions exceeding the threshold + void updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates); // Update read length vs. 
base modification rate data + + // Add TIN data for a single BAM file + void addTINData(std::string &bam_file, TINStats &tin_data); + + // TIN mean for a single BAM file + double getTINMean(std::string bam_file); // Get the TIN mean for a single BAM file + + // TIN median for a single BAM file + double getTINMedian(std::string bam_file); + + // TIN standard deviation for a single BAM file + double getTINStdDev(std::string bam_file); + + // TIN count for a single BAM file + int getTINCount(std::string bam_file); + + // Add a batch of records to the output + void add(Output_BAM &t_output_bam); + + // Calculate QC across all records + void global_sum(); + + // Save the output to a summary text file + void save_summary(std::string &output_file, Input_Para ¶ms, Output_BAM &output_data); + + Output_BAM(); + ~Output_BAM(); }; diff --git a/src/fasta_module.cpp b/src/fasta_module.cpp index 93f3e90..8b48371 100644 --- a/src/fasta_module.cpp +++ b/src/fasta_module.cpp @@ -172,6 +172,8 @@ static int qc1fasta(const char *input_file, Output_FA &py_output_fa, FILE *read_ long_read_info.read_length_count[(int)base_count] += 1; } + long_read_info.total_num_bases += base_count; // Update the total number of bases + // Update the per-read GC content distribution double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count); int gc_content_int = static_cast<int>(std::round(gc_content_pct)); diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 0cd1a10..8056c19 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -355,8 +355,12 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f samFile* bam_file = sam_open(bam_filename.c_str(), "r"); bam_hdr_t* bam_header = sam_hdr_read(bam_file); bam1_t* bam_record = bam_init1(); - int64_t num_reads = 0; + + // Data structure for storing read length vs. 
base modification rate + std::vector<int> read_lengths; // Read lengths + std::vector<double> read_mod_rates; // Total base modification rate for each read length + std::vector<std::unordered_map<char, double>> read_base_mod_rates; // Type-specific base modification rates for each read length while (sam_read1(bam_file, bam_header, bam_record) >= 0) { num_reads++; @@ -366,14 +370,13 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f // Follow here to get base modification tags: // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274 + int read_length = bam_record->core.l_qseq; hts_base_mod_state *state = hts_base_mod_state_alloc(); - - // Preprint revisions: New data structure that does not require unique - // positions for each base modification - // chr -> vector of (position, strand) for C modified bases passing the threshold - std::vector<std::pair<int32_t, int>> c_modified_positions; + std::vector<std::pair<int32_t, int>> c_modified_positions; // C-modified positions for CpG analysis (chr->(position, strand)) + std::unordered_map<char, int> base_mod_counts; // Type-specific base modification counts for the read // Parse the base modification tags if a primary alignment + int read_mod_count = 0; int ret = bam_parse_basemod(bam_record, state); if (ret >= 0 && !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP)) { @@ -402,8 +405,12 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f std::vector<int> query_pos; while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) { for (int i = 0; i < n; i++) { - // Update the prediction count - final_output.modified_prediction_count++; + // Update the modified prediction counts + read_mod_count++; // Read-specific count + 
final_output.modified_prediction_count++; // Cumulative count + char mod_type = mods[i].modified_base; + base_mod_counts[mod_type]++; // Update the type-specific count + // Note: The modified base value can be a positive char (e.g. 'm', // 'h') (DNA Mods DB) or negative integer (ChEBI ID): @@ -419,22 +426,13 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f if (mods[i].qual != -1) { probability = mods[i].qual / 256.0; - // If the probability is greater than the threshold, - // update the count + // Update counts for predictions exceeding the threshold if (probability >= base_mod_threshold) { - final_output.sample_modified_base_count++; - - // Update the modified base count for the strand - if (strand == 0) { - final_output.sample_modified_base_count_forward++; - } else { - final_output.sample_modified_base_count_reverse++; - } + final_output.updateBaseModCounts(mod_type, strand); // Update the base modification counts - // Preprint revisions: Store the modified positions + // Store the modified positions for later CpG analysis char canonical_base_char = std::toupper(mods[i].canonical_base); - char mod_type = mods[i].modified_base; - if (canonical_base_char == 'C' && mod_type == 'm') { + if (canonical_base_char == 'C' && mod_type != 'C') { // Convert the query position to reference position if available if (alignments_present) { @@ -465,9 +463,26 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f } } } + hts_base_mod_state_free(state); // Deallocate the base modification state object + + // Calculate the base modification rate for the read + double read_mod_rate = 0.0; + if (read_length > 0) { + read_mod_rate = (double) read_mod_count / read_length; + } - // Deallocate the state object - hts_base_mod_state_free(state); + // Calculate the type-specific base modification rates for the read + std::unordered_map<char, double> base_mod_rates; + for (auto const &it : base_mod_counts) { + char mod_type = 
it.first; + int mod_count = it.second; + double mod_rate = 0.0; + if (read_length > 0) { + mod_rate = (double) mod_count / read_length; + } + base_mod_rates[mod_type] = mod_rate; + } + final_output.updateReadModRate(read_length, read_mod_rate, base_mod_rates); // Update the output data } } diff --git a/src/output_data.cpp b/src/output_data.cpp index c2e58d0..7405a7c 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -262,6 +262,30 @@ Output_BAM::Output_BAM(){ Output_BAM::~Output_BAM(){ } +void Output_BAM::updateBaseModCounts(char mod_type, int strand) +{ + // Update the sample modified base count for predictions exceeding the threshold + this->sample_modified_base_count++; + this->base_mod_counts[mod_type]++; // Update the type-specific modified base count + + // Update the modified base count for the strand + if (strand == 0) { + this->sample_modified_base_count_forward++; + this->base_mod_counts_forward[mod_type]++; // Update the type-specific modified base count + } else { + this->sample_modified_base_count_reverse++; + this->base_mod_counts_reverse[mod_type]++; // Update the type-specific modified base count + } +} + +void Output_BAM::updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates) { + ReadModData read_mod_data; + read_mod_data.read_length = read_length; + read_mod_data.mod_rate = read_mod_rate; + read_mod_data.base_mod_rates = base_mod_rates; + this->read_mod_data.push_back(read_mod_data); +} + int Output_BAM::getReadCount() { return this->read_move_table.size(); From 345528210a18309d690caeca9300e7a0f536a3ce Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sat, 4 Jan 2025 17:06:52 -0500 Subject: [PATCH 06/25] Add base mod plots --- README.md | 2 +- include/output_data.h | 27 ++++++---- src/cli.py | 6 ++- src/lrst.i | 40 +-------------- src/output_data.cpp | 55 ++++++++++++++++++++ src/plot_utils.py | 115 +++++++++++++++++++++++++++++++----------- 6 files changed, 165 
insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index e5b4bed..1ce0df6 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ MinION R9.4.1 from https://labs.epi2me.io/gm24385-5mc/) ## General usage ``` -longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --ref $REF_GENOME --modprob 0.8 +longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY --mod --modprob 0.8 --ref $REF_GENOME ``` # RRMS BAM diff --git a/include/output_data.h b/include/output_data.h index 0651341..06608d6 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -157,6 +157,14 @@ class Base_Move_Table }; +// Structures for storing read length vs. base modification rate data +struct ReadModData +{ + int read_length; + double mod_rate; + std::unordered_map<char, double> base_mod_rates; // Type-specific base modification rates +}; + // BAM output class Output_BAM : public Output_FQ { @@ -194,16 +202,6 @@ class Output_BAM : public Output_FQ uint64_t sample_cpg_reverse_count = ZeroDefault; // Total number of modified bases passing the threshold that are in CpG sites and in the reverse strand (non-unique) std::map<std::string, std::vector<std::pair<int32_t, int>>> sample_c_modified_positions; // chr -> vector of (position, strand) for modified bases passing the threshold - // Further revisions - // Structures for storing read length vs. base modification rate data - struct ReadModData - { - int read_length; - double mod_rate; - std::unordered_map<char, double> base_mod_rates; // Type-specific base modification rates - }; - std::vector<ReadModData> read_mod_data; // Read length vs. base modification rate - // std::pair<std::vector<int>, std::vector<double>> read_length_mod_rate; // Read length vs. base modification rate // std::unordered_map<char, std::pair<std::vector<int>, std::vector<double>>> read_length_mod_rate; // Read length vs. 
base modification rate for each base modification type std::unordered_map<char, uint64_t> base_mod_counts; // Counts for each base modification type exceeding the threshold @@ -227,6 +225,15 @@ class Output_BAM : public Output_FQ Basic_Seq_Quality_Statistics mapped_seq_quality_info; Basic_Seq_Quality_Statistics unmapped_seq_quality_info; + std::vector<ReadModData> read_mod_data; // Read length vs. base modification rate + std::vector<char> getBaseModTypes(); // Get the types of base modifications found + int getReadModDataSize(); // Get the number of read length vs. base modification rate data points + int getNthReadModLength(int read_index); // Get the read length for the nth read + double getNthReadModRate(int read_index); // Get the base modification rate for the nth read + double getNthReadModRate(int read_index, char mod_type); // Get the base modification rate for the nth read for a specific base modification type + uint64_t getModTypeCount(char mod_type); // Get the count of a specific base modification type + uint64_t getModTypeCount(char mod_type, int strand); // Get the count of a specific base modification type for a specific strand + // POD5 signal data functions int getReadCount(); void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table, int start, int end); diff --git a/src/cli.py b/src/cli.py index 29549f2..59b8a9c 100644 --- a/src/cli.py +++ b/src/cli.py @@ -97,10 +97,11 @@ def get_common_param(margs): # Set up logging to stdout logging.basicConfig(stream=sys.stdout, level=get_log_level(margs.log_level), - format="%(asctime)s [%(levelname)s] %(message)s") + format="%(asctime)s %(message)s") + # format="%(asctime)s [%(levelname)s] %(message)s") else: logging.basicConfig(level=get_log_level(margs.log_level), - format="%(asctime)s [%(levelname)s] %(message)s", + format="%(asctime)s %(message)s", handlers=[ logging.FileHandler(margs.log), logging.StreamHandler(sys.stdout) @@ -250,6 +251,7 @@ def 
bam_module(margs): # If base modifications were found, add the base modification plots # after the first table if bam_output.sample_modified_base_count > 0: + qc_info_list.insert(1, "read_length_mod_rates") # Read length modification rates qc_info_list.insert(1, "base_mods") # If gene BED file was provided, add the TIN plots diff --git a/src/lrst.i b/src/lrst.i index 9def020..fc93d0e 100644 --- a/src/lrst.i +++ b/src/lrst.i @@ -36,41 +36,6 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules $result = list; } -// Map std::map<int32_t, std::map<char, std::tuple<char, double>>> to Python -// dictionary -// %typemap(out) std::map<int32_t, std::map<char, std::tuple<char, double>>> { -// PyObject *dict = PyDict_New(); -// for (auto const &it : $1) { -// PyObject *inner_dict = PyDict_New(); -// for (auto const &inner_it : it.second) { -// PyObject *tuple = PyTuple_Pack(2, -// PyUnicode_FromStringAndSize(&std::get<0>(inner_it.second), 1), -// PyFloat_FromDouble(std::get<1>(inner_it.second))); -// PyDict_SetItem(inner_dict, -// PyUnicode_FromStringAndSize(&inner_it.first, 1), -// tuple); -// } -// PyDict_SetItem(dict, PyLong_FromLong(it.first), inner_dict); -// } -// $result = dict; -// } - -// Map std::map<int32_t, std::tuple<char, char, double, int, bool>> to Python -// dictionary -// %typemap(out) std::map<int32_t, std::tuple<char, char, double, int, bool>> { -// PyObject *dict = PyDict_New(); -// for (auto const &it : $1) { -// PyObject *tuple = PyTuple_Pack(5, -// PyUnicode_FromStringAndSize(&std::get<0>(it.second), 1), -// PyUnicode_FromStringAndSize(&std::get<1>(it.second), 1), -// PyFloat_FromDouble(std::get<2>(it.second)), -// PyLong_FromLong(std::get<3>(it.second)), -// PyBool_FromLong(std::get<4>(it.second))); -// PyDict_SetItem(dict, PyLong_FromLong(it.first), tuple); -// } -// $result = dict; -// } - // Map std::map<std::string, std::map<int32_t, std::tuple<char, char, double, // int, bool>>> to Python dictionary %typemap(out) 
std::map<std::string, std::map<int32_t, std::tuple<char, char, double, int, bool>>> { @@ -104,12 +69,11 @@ lrst.i: SWIG module defining the Python wrapper for our C++ modules %include <stdint.i> %include <std_vector.i> -// Define the conversion for uint64_t arrays -//%array_class(uint64_t, uint64Array); - %template(IntVector) std::vector<int>; %template(DoubleVector) std::vector<double>; %template(Int2DVector) std::vector<std::vector<int>>; +%template(StringVector) std::vector<std::string>; +%template(CharVector) std::vector<char>; // These are the header functions wrapped by our lrst module (Like an 'import') %include "input_parameters.h" // Contains InputPara for passing parameters to C++ diff --git a/src/output_data.cpp b/src/output_data.cpp index 7405a7c..c92a60d 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -286,6 +286,61 @@ void Output_BAM::updateReadModRate(int read_length, double read_mod_rate, std::u this->read_mod_data.push_back(read_mod_data); } +std::vector<char> Output_BAM::getBaseModTypes() +{ + std::vector<char> base_mod_types; + for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) { + base_mod_types.push_back(it->first); + } + return base_mod_types; +} + +int Output_BAM::getReadModDataSize() +{ + return this->read_mod_data.size(); +} + +int Output_BAM::getNthReadModLength(int read_index) +{ + return this->read_mod_data[read_index].read_length; +} + +double Output_BAM::getNthReadModRate(int read_index) +{ + return this->read_mod_data[read_index].mod_rate; +} + +double Output_BAM::getNthReadModRate(int read_index, char mod_type) +{ + double mod_rate = 0.0; + try { + this->read_mod_data.at(read_index); + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Read index " << read_index << " is out of range." 
<< std::endl; + } + try { + mod_rate = this->read_mod_data[read_index].base_mod_rates.at(mod_type); + } catch (const std::out_of_range& oor) { + // No modification rate found for the specified type in the read + mod_rate = 0.0; + } + return mod_rate; +} + +uint64_t Output_BAM::getModTypeCount(char mod_type) +{ + return this->base_mod_counts[mod_type]; +} + +uint64_t Output_BAM::getModTypeCount(char mod_type, int strand) +{ + if (strand == 0) { + return this->base_mod_counts_forward[mod_type]; + } else { + return this->base_mod_counts_reverse[mod_type]; + } +} + int Output_BAM::getReadCount() { return this->read_move_table.size(); diff --git a/src/plot_utils.py b/src/plot_utils.py index 8adb1ee..d82bffb 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -37,6 +37,8 @@ def getDefaultPlotFilenames(): "gc_content_hist": {'title': "GC Content Histogram", 'description': "GC Content Histogram", 'summary': ""}, + "read_length_mod_rates": {'title': "Read Length vs. Modification Rates", 'description': "Read Length vs. 
Modification Rates", 'summary': ""}, + "base_quality": {'title': "Base Quality Histogram", 'description': "Base Quality Histogram"}, "read_avg_base_quality": {'title': "Read Base Quality Histogram", 'description': "Read Base Quality Histogram"}, @@ -346,12 +348,6 @@ def read_gc_content_histogram(data, font_size): if bin_size > 1: gc_content = np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)]) - # # Print the GC content if count > 0 - # logging.info("[HIST] GC content values:") - # for i in range(len(gc_content)): - # if gc_content[i] > 0: - # logging.info("{}-{}%: {}".format(i * bin_size, i * bin_size + bin_size, gc_content[i])) - gc_content_bins = [i for i in range(0, 101, bin_size)] # Generate hover text for each bin @@ -450,17 +446,22 @@ def plot(output_data, para_dict, file_type): # Create the summary table create_summary_table(output_data, plot_filepaths, file_type) - # Create the modified base table if available + # Modified base table and plots if file_type == 'BAM' and para_dict["mod"] > 0: + # Modified base table base_modification_threshold = para_dict["modprob"] create_modified_base_table(output_data, plot_filepaths, base_modification_threshold) - - # Check if the modified base table is available - if 'base_mods' in plot_filepaths: - logging.info("SUCCESS: Modified base table created") - else: + if 'base_mods' not in plot_filepaths: logging.warning("WARNING: Modified base table not created") + # # Print the types of modifications + # base_mod_types = output_data.getBaseModTypes() + # logging.info("Modification types: ") + # for mod_type in base_mod_types: + # logging.info(mod_type) + + + # Create the TIN table if available if file_type == 'BAM' and para_dict["genebed"] != "": input_files = para_dict["input_files"] @@ -886,6 +887,67 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th plot_filepaths["base_mods"]['title'] = "Base Modifications" plot_filepaths["base_mods"]['description'] = "Base 
modification statistics" + # Print the types of modifications + base_mod_types = output_data.getBaseModTypes() + logging.info("Modification types: ") + for mod_type in base_mod_types: + logging.info(mod_type) + + # Get the read length vs. base modification rate data for each modification type + read_mod_data_size = output_data.getReadModDataSize() + read_length_mod_rates = {} + for i in range(read_mod_data_size): + for mod_type in base_mod_types: + if mod_type not in read_length_mod_rates: + read_length_mod_rates[mod_type] = [] + + read_length = output_data.getNthReadModLength(i) + mod_rate = output_data.getNthReadModRate(i, mod_type) + read_length_mod_rates[mod_type].append((read_length, mod_rate)) + + # Dictionary of modification character to full name + mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \ + 'g': '5hmU', 'e': '5fu', 'b': '5caU', \ + 'a': '6mA', 'o': '8oxoG', 'n': 'Xao', \ + 'C': 'Amb. C', 'A': 'Amb. A', 'T': 'Amb. T', 'G': 'Amb. G',\ + 'N': 'Amb. N'} + + + # Create a plot of read length vs. 
base modification rate for each + # modification type + for mod_type in base_mod_types: + + # Format the data + mod_data = read_length_mod_rates[mod_type] + x_vals = [data[0] for data in mod_data] + read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals] + mod_rates = [data[1] * 100 for data in mod_data] + + # Get the modification name + try: + mod_char_to_name[mod_type] + except KeyError: + logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) + mod_name = mod_type + + mod_name = mod_char_to_name[mod_type] + + # Create the figure + fig = go.Figure() + fig.add_trace(go.Scatter(x=x_vals, y=mod_rates, mode='markers', name=mod_name)) + + # Update the layout + fig.update_layout(xaxis_title='Read Length', + yaxis_title='Modification Rate (%)', + showlegend=True, + yaxis=dict(range=[0, 100]), + xaxis=dict(tickvals=x_vals, ticktext=read_lengths), + font=dict(size=PLOT_FONT_SIZE)) + + # Generate the HTML + html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj + # Create the base modification statistics table table_str = "<table>\n<tbody>" table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count) @@ -895,6 +957,18 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse) table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count) table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td 
style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count) + + # Add the modification type data + for mod_type in base_mod_types: + mod_name = mod_char_to_name[mod_type] + mod_count = output_data.getModTypeCount(mod_type) + mod_count_fwd = output_data.getModTypeCount(mod_type, 0) + mod_count_rev = output_data.getModTypeCount(mod_type, 1) + table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count) + table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd) + table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev) + + # Finish the table table_str += "\n</tbody>\n</table>" plot_filepaths["base_mods"]['detail'] = table_str @@ -929,23 +1003,6 @@ def create_tin_table(output_data, input_files, plot_filepaths): # Add the table to the plot filepaths plot_filepaths["tin"]['detail'] = table_str - # plot_filepaths["base_mods"] = {} - # plot_filepaths["base_mods"]['file'] = "" - # plot_filepaths["base_mods"]['title'] = "Base Modifications" - # plot_filepaths["base_mods"]['description'] = "Base modification statistics" - - # # Create the base modification statistics table - # table_str = "<table>\n<tbody>" - # table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count) - # table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold) - # table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count) - # table_str += "<tr><td>Total in the Forward Strand</td><td 
style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward) - # table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse) - # table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count) - # table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count) - # table_str += "\n</tbody>\n</table>" - # plot_filepaths["base_mods"]['detail'] = table_str - def create_pod5_table(output_dict, plot_filepaths): """Create a summary table for the ONT POD5 signal data.""" plot_filepaths["basic_st"] = {} From 5cfcf9f4b772200f206d40cbd51509069cb3507e Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sat, 4 Jan 2025 18:10:20 -0500 Subject: [PATCH 07/25] Work on flags --- src/generate_html.py | 24 ++++++++++++++++++++++-- src/plot_utils.py | 3 +-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/generate_html.py b/src/generate_html.py index dcc5be7..a2e061e 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -237,19 +237,39 @@ def generate_left(self): self.html_writer.write('<h2>Summary</h2>') self.html_writer.write('<ul>') + # Define ASCII/Unicode icons for different flags + flag_icons = { + "PASS": "✔", + "WARN": "⚠", + } + # "WARN": "⚠", + # "PASS": "✔", + # "FAIL": "❌", + # "INFO": "ℹ" + # Add links to the right sections key_index = 0 for plot_key in self.image_key_list: - self.html_writer.write('<li>') + # Determine the flag icon + # [TEST] Select a random flag for testing + flags = ["PASS", "WARN"] + flag = flags[key_index % 2] + + # flag = self.plot_filepaths[plot_key]['flag'] + flag_icon = flag_icons[flag] + self.html_writer.write('<li>') + 
self.html_writer.write(f'{flag_icon} ') self.html_writer.write( '<a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>') + # f'{flag_icon} <a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>') + key_index += 1 self.html_writer.write('</li>') # Add the input files section link self.html_writer.write('<li>') - self.html_writer.write('<a href="#lrst' + str(key_index) + '">Input File List</a>') + self.html_writer.write('• <a href="#lrst' + str(key_index) + '">Input File List</a>') key_index += 1 self.html_writer.write('</li>') diff --git a/src/plot_utils.py b/src/plot_utils.py index d82bffb..5accb86 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -915,6 +915,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Create a plot of read length vs. base modification rate for each # modification type + fig = go.Figure() for mod_type in base_mod_types: # Format the data @@ -932,8 +933,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th mod_name = mod_char_to_name[mod_type] - # Create the figure - fig = go.Figure() fig.add_trace(go.Scatter(x=x_vals, y=mod_rates, mode='markers', name=mod_name)) # Update the layout From 13b150f529790388db10948b995c10bb8f7e6ac5 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sun, 5 Jan 2025 18:34:39 -0500 Subject: [PATCH 08/25] Work on flags and mod plots --- src/cli.py | 52 +++--- src/generate_html.py | 34 ++-- src/hts_reader.cpp | 18 +- src/output_data.cpp | 4 +- src/plot_utils.py | 426 ++++++++++++++++++++++++++++--------------- 5 files changed, 338 insertions(+), 196 deletions(-) diff --git a/src/cli.py b/src/cli.py index 59b8a9c..85951a6 100644 --- a/src/cli.py +++ b/src/cli.py @@ -155,7 +155,7 @@ def fq_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fq_output, param_dict, 'FASTQ') fq_html_gen = generate_html.ST_HTML_Generator( - 
[["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", "read_avg_base_quality"], "FASTQ QC", param_dict], plot_filepaths, static=False) fq_html_gen.generate_html() @@ -246,11 +246,12 @@ def bam_module(margs): plot_filepaths = plot(bam_output, param_dict, 'BAM') # Set the list of QC information to display - qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"] + qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality"] # If base modifications were found, add the base modification plots # after the first table if bam_output.sample_modified_base_count > 0: + logging.info("Base modifications found. Adding base modification plots to the HTML report.") qc_info_list.insert(1, "read_length_mod_rates") # Read length modification rates qc_info_list.insert(1, "base_mods") @@ -313,7 +314,7 @@ def rrms_module(margs): # Generate the HTML report bam_html_gen = generate_html.ST_HTML_Generator( [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", - "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) + "base_quality", "read_avg_base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) bam_html_gen.generate_html() logging.info("Done. 
Output files are in %s", param_dict["output_folder"]) @@ -431,8 +432,7 @@ def fast5_signal_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fast5_output, param_dict, 'FAST5s') fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", - "read_avg_base_quality", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html(signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -440,25 +440,6 @@ def fast5_signal_module(margs): logging.error("QC did not generate.") -def set_file_parser_defaults(file_parser): - """Create a parser with default arguments for a specific filetype.""" - file_parser.add_argument("-i", "--input", type=argparse.FileType('r'), default=None, - help="Single input filepath") - file_parser.add_argument("-I", "--inputs", type=str, default=None, - help="Multiple comma-separated input filepaths") - file_parser.add_argument("-P", "--pattern", type=str, default=None, - help="Use pattern matching (*) to specify multiple input files. Enclose the pattern in double quotes.") - file_parser.add_argument("-g", "--log", type=str, default="log_output.log", - help="Log file") - file_parser.add_argument("-G", "--log-level", type=int, default=2, - help="Logging level. 1: DEBUG, 2: INFO, 3: WARNING, 4: ERROR, 5: CRITICAL. Default: 2.") - file_parser.add_argument("-o", "--outputfolder", type=str, default="output_" + prg_name, - help="The output folder.") - file_parser.add_argument("-t", "--threads", type=int, default=1, - help="The number of threads used. Default: 1.") - file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_", - help="The prefix for output filenames. 
Default: `QC_`.") - def pod5_module(margs): """POD5 file input module.""" # Get the filetype-specific parameters @@ -519,13 +500,32 @@ def pod5_module(margs): # plot_filepaths = plot(read_signal_dict, param_dict, 'POD5') webpage_title = "POD5 QC" fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", - "read_avg_base_quality", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False) + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html(signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) else: logging.error("QC did not generate.") + + +def set_file_parser_defaults(file_parser): + """Create a parser with default arguments for a specific filetype.""" + file_parser.add_argument("-i", "--input", type=argparse.FileType('r'), default=None, + help="Single input filepath") + file_parser.add_argument("-I", "--inputs", type=str, default=None, + help="Multiple comma-separated input filepaths") + file_parser.add_argument("-P", "--pattern", type=str, default=None, + help="Use pattern matching (*) to specify multiple input files. Enclose the pattern in double quotes.") + file_parser.add_argument("-g", "--log", type=str, default="log_output.log", + help="Log file") + file_parser.add_argument("-G", "--log-level", type=int, default=2, + help="Logging level. 1: DEBUG, 2: INFO, 3: WARNING, 4: ERROR, 5: CRITICAL. Default: 2.") + file_parser.add_argument("-o", "--outputfolder", type=str, default="output_" + prg_name, + help="The output folder.") + file_parser.add_argument("-t", "--threads", type=int, default=1, + help="The number of threads used. 
Default: 1.") + file_parser.add_argument("-Q", "--outprefix", type=str, default="QC_", + help="The prefix for output filenames. Default: `QC_`.") # Set up the argument parser diff --git a/src/generate_html.py b/src/generate_html.py index a2e061e..64f1641 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -237,39 +237,34 @@ def generate_left(self): self.html_writer.write('<h2>Summary</h2>') self.html_writer.write('<ul>') - # Define ASCII/Unicode icons for different flags - flag_icons = { - "PASS": "✔", - "WARN": "⚠", + # Define ASCII/Unicode icons for error flags + error_flag_icon = { + True: "⚠", + False: "✔", } - # "WARN": "⚠", - # "PASS": "✔", - # "FAIL": "❌", - # "INFO": "ℹ" # Add links to the right sections key_index = 0 for plot_key in self.image_key_list: # Determine the flag icon - # [TEST] Select a random flag for testing - flags = ["PASS", "WARN"] - flag = flags[key_index % 2] + try: + flag = self.plot_filepaths[plot_key]['error_flag'] + except KeyError: + flag = False - # flag = self.plot_filepaths[plot_key]['flag'] - flag_icon = flag_icons[flag] + flag_icon = error_flag_icon[flag] self.html_writer.write('<li>') self.html_writer.write(f'{flag_icon} ') self.html_writer.write( '<a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>') - # f'{flag_icon} <a href="#lrst' + str(key_index) + '">' + self.plot_filepaths[plot_key]['title'] + '</a>') key_index += 1 self.html_writer.write('</li>') # Add the input files section link - self.html_writer.write('<li>') - self.html_writer.write('• <a href="#lrst' + str(key_index) + '">Input File List</a>') + self.html_writer.write('<br><li>') + self.html_writer.write('<a href="#lrst' + str(key_index) + '">Input File List</a>') key_index += 1 self.html_writer.write('</li>') @@ -297,7 +292,12 @@ def generate_right(self): self.html_writer.write(dynamic_plot) except KeyError: - logging.error("Missing dynamic plot for %s", plot_key) + # See if an image is available + try: + 
image_path = self.plot_filepaths[plot_key]['file'] + self.html_writer.write(f'<img src="{image_path}" alt="{plot_key}">') + except KeyError: + logging.error("Missing plot for %s", plot_key) self.html_writer.write('</div>') diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 8056c19..8762f45 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -378,7 +378,8 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f // Parse the base modification tags if a primary alignment int read_mod_count = 0; int ret = bam_parse_basemod(bam_record, state); - if (ret >= 0 && !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP)) { + if (ret >= 0) { + bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); // Get the chromosome if alignments are present bool alignments_present = true; @@ -397,13 +398,20 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f // but it always yields 0...) int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0; + // Set strand to null (-1) if the read is not primary + if (!is_primary) { + strand = -1; + } + // Iterate over the state object to get the base modification tags // using bam_next_basemod hts_base_mod mods[10]; int n = 0; int32_t pos = 0; std::vector<int> query_pos; + bool first_mod_found = false; while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) { + for (int i = 0; i < n; i++) { // Update the modified prediction counts read_mod_count++; // Read-specific count @@ -411,7 +419,6 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f char mod_type = mods[i].modified_base; base_mod_counts[mod_type]++; // Update the type-specific count - // Note: The modified base value can be a positive char (e.g. 
'm', // 'h') (DNA Mods DB) or negative integer (ChEBI ID): // https://github.com/samtools/hts-specs/issues/741 @@ -430,9 +437,10 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f if (probability >= base_mod_threshold) { final_output.updateBaseModCounts(mod_type, strand); // Update the base modification counts - // Store the modified positions for later CpG analysis + // Store the modified positions for later CpG + // analysis if a C modification on a primary alignment char canonical_base_char = std::toupper(mods[i].canonical_base); - if (canonical_base_char == 'C' && mod_type != 'C') { + if (is_primary && canonical_base_char == 'C' && mod_type != 'C') { // Convert the query position to reference position if available if (alignments_present) { @@ -447,7 +455,7 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f } } - // Preprint revisions: Append the modified positions to the output data + // Append the modified positions to the output data if (c_modified_positions.size() > 0) { // Set the atomic flag and print a message if base // modification tags are present in the file diff --git a/src/output_data.cpp b/src/output_data.cpp index c92a60d..02cd5a8 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -268,11 +268,11 @@ void Output_BAM::updateBaseModCounts(char mod_type, int strand) this->sample_modified_base_count++; this->base_mod_counts[mod_type]++; // Update the type-specific modified base count - // Update the modified base count for the strand + // Update the modified base count for the strand from primary alignments if (strand == 0) { this->sample_modified_base_count_forward++; this->base_mod_counts_forward[mod_type]++; // Update the type-specific modified base count - } else { + } else if (strand == 1) { this->sample_modified_base_count_reverse++; this->base_mod_counts_reverse[mod_type]++; // Update the type-specific modified base count } diff --git a/src/plot_utils.py 
b/src/plot_utils.py index 5accb86..209cfbd 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -422,7 +422,7 @@ def plot_base_modifications(base_modifications): mod_data = base_modifications[mod_type] # Create the trace - trace = go.Scatter(x=mod_data['positions'], y=mod_data['counts'], mode='markers', name=mod_type) + trace = go.Scattergl(x=mod_data['positions'], y=mod_data['counts'], mode='markers', name=mod_type) # Add the trace to the figure fig.add_trace(trace) @@ -448,20 +448,17 @@ def plot(output_data, para_dict, file_type): # Modified base table and plots if file_type == 'BAM' and para_dict["mod"] > 0: - # Modified base table + # Output file for the read length vs. modification rates plot + output_folder = para_dict["output_folder"] + read_length_hist_file = os.path.join(output_folder, 'read_length_hist.png') + plot_filepaths['read_length_mod_rates']['file'] = read_length_hist_file + + # Generate the modified base table and read length vs. modification rates plot base_modification_threshold = para_dict["modprob"] create_modified_base_table(output_data, plot_filepaths, base_modification_threshold) if 'base_mods' not in plot_filepaths: logging.warning("WARNING: Modified base table not created") - # # Print the types of modifications - # base_mod_types = output_data.getBaseModTypes() - # logging.info("Modification types: ") - # for mod_type in base_mod_types: - # logging.info(mod_type) - - - # Create the TIN table if available if file_type == 'BAM' and para_dict["genebed"] != "": input_files = para_dict["input_files"] @@ -616,7 +613,7 @@ def plot_pod5(pod5_output, para_dict, bam_output=None): # Plot the signal data x = np.arange(signal_length) - fig.add_trace(go.Scatter( + fig.add_trace(go.Scattergl( x=x, y=nth_read_data, mode='markers', marker=dict(color='LightSkyBlue', @@ -624,7 +621,7 @@ def plot_pod5(pod5_output, para_dict, bam_output=None): line=dict(color='MediumPurple', width=2)), opacity=0.5)) - # Update the plot style + # Update the plot style 
(using 0-100 to improve performance) fig.update_layout( title=nth_read_name, yaxis_title="Signal", @@ -708,7 +705,7 @@ def plot_signal(output_data, para_dict): # Plot x = np.arange(start_index, end_index, 1) - fig.add_trace(go.Scatter( + fig.add_trace(go.Scattergl( x=x, y=base_signals, mode='markers', marker=dict(color='LightSkyBlue', @@ -763,6 +760,29 @@ def plot_signal(output_data, para_dict): return output_html_plots +def format_cell(value, type_str='int', error_flag=False): + """Format the cell value for the summary table.""" + style = "background-color: #F88379;" if error_flag else "" + if type_str == 'int': + return "<td style=\"text-align:right;{}\">{:,d}</td>".format(style, value) + elif type_str == 'float': + return "<td style=\"text-align:right;{}\">{:.1f}</td>".format(style, value) + else: + logging.error("ERROR: Invalid type for formatting cell value") + +def format_row(row_name, values, type_str='int', col_ignore=None): + """Format the row for the summary table. Skip flagging null values in specific columns.""" + cell_str = [] + row_flag = False + for i, value in enumerate(values): + # Set the error flag if the value is 0 except for unmapped reads + error_flag = value == 0 and i != col_ignore + row_flag = row_flag or error_flag # Flag for the entire row + cell_str.append(format_cell(value, type_str, error_flag)) + + return "<tr><td>{}</td>{}</tr>".format(row_name, "".join(cell_str)), row_flag + + def create_summary_table(output_data, plot_filepaths, file_type): """Create the summary table for the basic statistics.""" plot_filepaths["basic_st"] = {} @@ -777,73 +797,135 @@ def create_summary_table(output_data, plot_filepaths, file_type): file_type_label = 'Basecall Summary' plot_filepaths["basic_st"]['description'] = "{} Basic Statistics".format(file_type_label) + table_error_flag = False if file_type == 'BAM': + # Add alignment statistics to the summary table table_str = 
"<table>\n<thead>\n<tr><th>Measurement</th><th>Mapped</th><th>Unmapped</th><th>All</th></tr>\n" \ "</thead> " table_str += "\n<tbody>" - int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:," \ - "d}</td><td style=\"text-align:right\">{:,d}</td></tr> " - double_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:.1f}</td><td " \ - "style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr> " - table_str += int_str_for_format.format("#Total Reads", output_data.mapped_long_read_info.total_num_reads, - output_data.unmapped_long_read_info.total_num_reads, - output_data.long_read_info.total_num_reads) - table_str += int_str_for_format.format("#Total Bases", - output_data.mapped_long_read_info.total_num_bases, - output_data.unmapped_long_read_info.total_num_bases, - output_data.long_read_info.total_num_bases) - table_str += int_str_for_format.format("Longest Read Length", - output_data.mapped_long_read_info.longest_read_length, - output_data.unmapped_long_read_info.longest_read_length, - output_data.long_read_info.longest_read_length) - table_str += int_str_for_format.format("N50", - output_data.mapped_long_read_info.n50_read_length, - output_data.unmapped_long_read_info.n50_read_length, - output_data.long_read_info.n50_read_length) - table_str += double_str_for_format.format("GC Content(%)", - output_data.mapped_long_read_info.gc_cnt * 100, - output_data.unmapped_long_read_info.gc_cnt * 100, - output_data.long_read_info.gc_cnt * 100) - table_str += double_str_for_format.format("Mean Read Length", - output_data.mapped_long_read_info.mean_read_length, - output_data.unmapped_long_read_info.mean_read_length, - output_data.long_read_info.mean_read_length) - table_str += int_str_for_format.format("Median Read Length", - output_data.mapped_long_read_info.median_read_length, - output_data.unmapped_long_read_info.median_read_length, - output_data.long_read_info.median_read_length) 
+ + # Total reads + row_str, row_flag = format_row("Total Reads", \ + [output_data.mapped_long_read_info.total_num_reads, \ + output_data.unmapped_long_read_info.total_num_reads, \ + output_data.long_read_info.total_num_reads], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Total bases + row_str, row_flag = format_row("Total Bases", \ + [output_data.mapped_long_read_info.total_num_bases, \ + output_data.unmapped_long_read_info.total_num_bases, \ + output_data.long_read_info.total_num_bases], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Longest read length + row_str, row_flag = format_row("Longest Read Length", \ + [output_data.mapped_long_read_info.longest_read_length, \ + output_data.unmapped_long_read_info.longest_read_length, \ + output_data.long_read_info.longest_read_length], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # N50 + row_str, row_flag = format_row("N50", \ + [output_data.mapped_long_read_info.n50_read_length, \ + output_data.unmapped_long_read_info.n50_read_length, \ + output_data.long_read_info.n50_read_length], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # GC content + row_str, row_flag = format_row("GC Content(%)", \ + [output_data.mapped_long_read_info.gc_cnt * 100, \ + output_data.unmapped_long_read_info.gc_cnt * 100, \ + output_data.long_read_info.gc_cnt * 100], \ + 'float', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Mean read length + row_str, row_flag = format_row("Mean Read Length", \ + [output_data.mapped_long_read_info.mean_read_length, \ + output_data.unmapped_long_read_info.mean_read_length, \ + output_data.long_read_info.mean_read_length], \ + 'float', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Median read length + row_str, row_flag = format_row("Median Read Length", \ + 
[output_data.mapped_long_read_info.median_read_length, \ + output_data.unmapped_long_read_info.median_read_length, \ + output_data.long_read_info.median_read_length], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag elif file_type == 'SeqTxt': table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Passed</th><th>Failed</th><th>All</th></tr>\n</thead>" table_str += "\n<tbody>" - int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:,d}</td></tr>" - double_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>" - table_str += int_str_for_format.format("#Total Reads", - output_data.passed_long_read_info.long_read_info.total_num_reads, - output_data.failed_long_read_info.long_read_info.total_num_reads, - output_data.all_long_read_info.long_read_info.total_num_reads) - table_str += int_str_for_format.format("#Total Bases", - output_data.passed_long_read_info.long_read_info.total_num_bases, - output_data.failed_long_read_info.long_read_info.total_num_bases, - output_data.all_long_read_info.long_read_info.total_num_bases) - table_str += int_str_for_format.format("Longest Read Length", - output_data.passed_long_read_info.long_read_info.longest_read_length, - output_data.failed_long_read_info.long_read_info.longest_read_length, - output_data.all_long_read_info.long_read_info.longest_read_length) - table_str += int_str_for_format.format("N50", - output_data.passed_long_read_info.long_read_info.n50_read_length, - output_data.failed_long_read_info.long_read_info.n50_read_length, - output_data.all_long_read_info.long_read_info.n50_read_length) - table_str += double_str_for_format.format("Mean Read Length", - output_data.passed_long_read_info.long_read_info.mean_read_length, - 
output_data.failed_long_read_info.long_read_info.mean_read_length, - output_data.all_long_read_info.long_read_info.mean_read_length) - table_str += int_str_for_format.format("Median Read Length", - output_data.passed_long_read_info.long_read_info.median_read_length, - output_data.failed_long_read_info.long_read_info.median_read_length, - output_data.all_long_read_info.long_read_info.median_read_length) + + # Total reads + row_str, row_flag = format_row("Total Reads", \ + [output_data.passed_long_read_info.long_read_info.total_num_reads, \ + output_data.failed_long_read_info.long_read_info.total_num_reads, \ + output_data.all_long_read_info.long_read_info.total_num_reads], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Total bases + row_str, row_flag = format_row("Total Bases", \ + [output_data.passed_long_read_info.long_read_info.total_num_bases, \ + output_data.failed_long_read_info.long_read_info.total_num_bases, \ + output_data.all_long_read_info.long_read_info.total_num_bases], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Longest read length + row_str, row_flag = format_row("Longest Read Length", \ + [output_data.passed_long_read_info.long_read_info.longest_read_length, \ + output_data.failed_long_read_info.long_read_info.longest_read_length, \ + output_data.all_long_read_info.long_read_info.longest_read_length], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # N50 + row_str, row_flag = format_row("N50", \ + [output_data.passed_long_read_info.long_read_info.n50_read_length, \ + output_data.failed_long_read_info.long_read_info.n50_read_length, \ + output_data.all_long_read_info.long_read_info.n50_read_length], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Mean read length + row_str, row_flag = format_row("Mean Read Length", \ + 
[output_data.passed_long_read_info.long_read_info.mean_read_length, \ + output_data.failed_long_read_info.long_read_info.mean_read_length, \ + output_data.all_long_read_info.long_read_info.mean_read_length], \ + 'float', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Median read length + row_str, row_flag = format_row("Median Read Length", \ + [output_data.passed_long_read_info.long_read_info.median_read_length, \ + output_data.failed_long_read_info.long_read_info.median_read_length, \ + output_data.all_long_read_info.long_read_info.median_read_length], \ + 'int', 1) + table_str += row_str + table_error_flag = table_error_flag or row_flag elif file_type == 'FAST5s': # Get values @@ -853,32 +935,58 @@ def create_summary_table(output_data, plot_filepaths, file_type): # Set up the HTML table table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>" table_str += "\n<tbody>" - int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>" - table_str += int_str_for_format.format("#Total Reads", read_count) - table_str += int_str_for_format.format("#Total Bases", total_base_count) + + # Total reads + row_str, row_flag = format_row("Total Reads", [read_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Total bases + row_str, row_flag = format_row("Total Bases", [total_base_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag else: table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>" table_str += "\n<tbody>" - int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>" - double_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:.1f}</td></tr>" - table_str += int_str_for_format.format("#Total Reads", - output_data.long_read_info.total_num_reads) - table_str += int_str_for_format.format("#Total Bases", - 
output_data.long_read_info.total_num_bases) - table_str += int_str_for_format.format("Longest Read Length", - output_data.long_read_info.longest_read_length) - table_str += int_str_for_format.format("N50", - output_data.long_read_info.n50_read_length) - table_str += double_str_for_format.format("GC Content(%)", - output_data.long_read_info.gc_cnt * 100) - table_str += double_str_for_format.format("Mean Read Length", - output_data.long_read_info.mean_read_length) - table_str += int_str_for_format.format("Median Read Length", - output_data.long_read_info.median_read_length) + # Total reads + row_str, row_flag = format_row("Total Reads", [output_data.long_read_info.total_num_reads], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Total bases + row_str, row_flag = format_row("Total Bases", [output_data.long_read_info.total_num_bases], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Longest read length + row_str, row_flag = format_row("Longest Read Length", [output_data.long_read_info.longest_read_length], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # N50 + row_str, row_flag = format_row("N50", [output_data.long_read_info.n50_read_length], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # GC content + row_str, row_flag = format_row("GC Content(%)", [output_data.long_read_info.gc_cnt * 100], 'float', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Mean read length + row_str, row_flag = format_row("Mean Read Length", [output_data.long_read_info.mean_read_length], 'float', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # Median read length + row_str, row_flag = format_row("Median Read Length", [output_data.long_read_info.median_read_length], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag 
table_str += "\n</tbody>\n</table>" plot_filepaths["basic_st"]['detail'] = table_str + plot_filepaths["basic_st"]['error_flag'] = table_error_flag def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold): """Create a summary table for the base modifications.""" @@ -888,64 +996,84 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th plot_filepaths["base_mods"]['description'] = "Base modification statistics" # Print the types of modifications + logging.info("Getting base modification types") base_mod_types = output_data.getBaseModTypes() - logging.info("Modification types: ") - for mod_type in base_mod_types: - logging.info(mod_type) - - # Get the read length vs. base modification rate data for each modification type - read_mod_data_size = output_data.getReadModDataSize() - read_length_mod_rates = {} - for i in range(read_mod_data_size): + if base_mod_types: + logging.info("Modification types: ") for mod_type in base_mod_types: - if mod_type not in read_length_mod_rates: - read_length_mod_rates[mod_type] = [] - - read_length = output_data.getNthReadModLength(i) - mod_rate = output_data.getNthReadModRate(i, mod_type) - read_length_mod_rates[mod_type].append((read_length, mod_rate)) - - # Dictionary of modification character to full name - mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \ - 'g': '5hmU', 'e': '5fu', 'b': '5caU', \ - 'a': '6mA', 'o': '8oxoG', 'n': 'Xao', \ - 'C': 'Amb. C', 'A': 'Amb. A', 'T': 'Amb. T', 'G': 'Amb. G',\ - 'N': 'Amb. N'} - - - # Create a plot of read length vs. 
base modification rate for each - # modification type - fig = go.Figure() - for mod_type in base_mod_types: - - # Format the data - mod_data = read_length_mod_rates[mod_type] - x_vals = [data[0] for data in mod_data] - read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals] - mod_rates = [data[1] * 100 for data in mod_data] - - # Get the modification name - try: - mod_char_to_name[mod_type] - except KeyError: - logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) - mod_name = mod_type - - mod_name = mod_char_to_name[mod_type] - - fig.add_trace(go.Scatter(x=x_vals, y=mod_rates, mode='markers', name=mod_name)) - - # Update the layout - fig.update_layout(xaxis_title='Read Length', - yaxis_title='Modification Rate (%)', - showlegend=True, - yaxis=dict(range=[0, 100]), - xaxis=dict(tickvals=x_vals, ticktext=read_lengths), - font=dict(size=PLOT_FONT_SIZE)) - - # Generate the HTML - html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) - plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj + logging.info(mod_type) + + # Get the read length vs. base modification rate data for each modification type + read_mod_data_size = output_data.getReadModDataSize() + logging.info("[TEST] read_mod_data_size: {}".format(read_mod_data_size)) + read_length_mod_rates = {} + for i in range(read_mod_data_size): + for mod_type in base_mod_types: + if mod_type not in read_length_mod_rates: + read_length_mod_rates[mod_type] = [] + + read_length = output_data.getNthReadModLength(i) + mod_rate = output_data.getNthReadModRate(i, mod_type) + read_length_mod_rates[mod_type].append((read_length, mod_rate)) + + # Dictionary of modification character to full name + mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \ + 'g': '5hmU', 'e': '5fu', 'b': '5caU', \ + 'a': '6mA', 'o': '8oxoG', 'n': 'Xao', \ + 'C': 'Amb. 
C', 'A': 'Amb. A', 'T': 'Amb. T', 'G': 'Amb. G',\ + 'N': 'Amb. N', \ + 'v': 'pseU'} + + + # Create a plot of read length vs. base modification rate for each + # modification type + # Make subplots vertically for each modification type + fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1) + min_x = float('inf') + max_x = 0 + # for mod_type in base_mod_types: + for i, mod_type in enumerate(base_mod_types): + + # Format the data + mod_data = read_length_mod_rates[mod_type] + x_vals = [data[0] for data in mod_data] + read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals] + mod_rates = [data[1] * 100 for data in mod_data] + + # Update the min and max x values + min_x = min(min_x, min(x_vals)) + max_x = max(max_x, max(x_vals)) + + # Get the modification name + try: + mod_name = mod_char_to_name[mod_type] + except KeyError: + logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) + mod_name = mod_type + + fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name), row=i + 1, col=1) + + # Update the layout + max_x_range = min(max_x, 10000) # To improve the plot performance + fig.update_layout(title='Read Length vs. {} Modification Rate'.format(mod_name), + xaxis_title='Read Length', + yaxis_title='Modification Rate (%)', + showlegend=False, + yaxis=dict(range=[0, 100]), + xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]), + font=dict(size=PLOT_FONT_SIZE)) + + logging.info("Plotting read length vs. 
{} modification rate".format(mod_name)) + + # Save the plot image + fig_file = plot_filepaths["read_length_mod_rates"]['file'] + fig.write_image(fig_file) + + # Generate the HTML + # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj + else: + logging.warning("WARNING: No modification types found") # Create the base modification statistics table table_str = "<table>\n<tbody>" @@ -959,7 +1087,13 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Add the modification type data for mod_type in base_mod_types: - mod_name = mod_char_to_name[mod_type] + # mod_name = mod_char_to_name[mod_type] + try: + mod_name = mod_char_to_name[mod_type] + except KeyError: + logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) + mod_name = mod_type + mod_count = output_data.getModTypeCount(mod_type) mod_count_fwd = output_data.getModTypeCount(mod_type, 0) mod_count_rev = output_data.getModTypeCount(mod_type, 1) From 957a8d833eed15b25c18570edad7a77801878add Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sun, 5 Jan 2025 19:52:11 -0500 Subject: [PATCH 09/25] work on flags --- environment.yml | 4 +- src/plot_utils.py | 164 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 136 insertions(+), 32 deletions(-) diff --git a/environment.yml b/environment.yml index 6a9e2af..a76645c 100644 --- a/environment.yml +++ b/environment.yml @@ -4,6 +4,7 @@ channels: - bioconda - defaults - jannessp # for pod5 + - plotly # for kaleido dependencies: - python=3.9 - numpy @@ -14,4 +15,5 @@ dependencies: - plotly - pytest - pod5 - - pyarrow \ No newline at end of file + - pyarrow + - python-kaleido diff --git a/src/plot_utils.py b/src/plot_utils.py index 209cfbd..db96017 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -376,8 +376,9 @@ def read_gc_content_histogram(data, font_size): return 
fig.to_html(full_html=False, default_height=500, default_width=700) -# Save the 'Base quality' plot image. -def base_quality(data, font_size): + +def base_quality(data, font_size, plot_filepaths): + """Plot the base quality distribution.""" xd = np.arange(MAX_BASE_QUALITY) yd = np.array(data.base_quality_distribution) fig = go.Figure() @@ -392,9 +393,19 @@ def base_quality(data, font_size): fig.update_yaxes(ticks="outside", title_text='Number of bases', title_standoff=0) fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Set font size - return fig.to_html(full_html=False, default_height=500, default_width=700) + # return fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths['base_quality']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) + + # Set the error flag if the base quality is below 20 for more than 10% of + # the bases + error_flag = False + if np.sum(yd[:20]) / np.sum(yd) > 0.1: + error_flag = True + + plot_filepaths['base_quality']['error_flag'] = error_flag -def read_avg_base_quality(data, font_size): + +def read_avg_base_quality(data, font_size, plot_filepaths): """Plot the read average base quality distribution.""" xd = np.arange(MAX_READ_QUALITY) yd = np.array(data.read_average_base_quality_distribution) @@ -405,7 +416,16 @@ def read_avg_base_quality(data, font_size): fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0) fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Set font size - return fig.to_html(full_html=False, default_height=500, default_width=700) + # return fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths['read_avg_base_quality']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) + + # Set the error flag if the average base quality is below 20 for more than + # 10% of the reads + error_flag = False + if np.sum(yd[:20]) / np.sum(yd) > 0.1: + error_flag = True + + 
plot_filepaths['read_avg_base_quality']['error_flag'] = error_flag def plot_base_modifications(base_modifications): @@ -499,16 +519,24 @@ def plot(output_data, para_dict, file_type): seq_quality_info = output_data.seq_quality_info # Base quality histogram - plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size) + base_quality(seq_quality_info, font_size, plot_filepaths) + # plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size) # Read quality histogram - read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size) - plot_filepaths['read_avg_base_quality']['dynamic'] = read_quality_dynamic + # read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size) + # plot_filepaths['read_avg_base_quality']['dynamic'] = + # read_quality_dynamic + read_avg_base_quality(seq_quality_info, font_size, plot_filepaths) if file_type == 'BAM': # Plot read alignment QC - plot_filepaths['read_alignments_bar']['dynamic'] = plot_alignment_numbers(output_data) - plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data) + plot_alignment_numbers(output_data, plot_filepaths) + # plot_filepaths['read_alignments_bar']['dynamic'] = + # plot_alignment_numbers(output_data) + + # Plot base alignment and error QC + plot_errors(output_data, plot_filepaths) + # plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data) elif file_type == 'FAST5s': plot_filepaths['ont_signal']['dynamic'] = plot_signal(output_data, para_dict) @@ -988,12 +1016,14 @@ def create_summary_table(output_data, plot_filepaths, file_type): plot_filepaths["basic_st"]['detail'] = table_str plot_filepaths["basic_st"]['error_flag'] = table_error_flag + def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold): """Create a summary table for the base modifications.""" plot_filepaths["base_mods"] = {} plot_filepaths["base_mods"]['file'] = "" plot_filepaths["base_mods"]['title'] = "Base 
Modifications" plot_filepaths["base_mods"]['description'] = "Base modification statistics" + table_error_flag = False # Print the types of modifications logging.info("Getting base modification types") @@ -1005,7 +1035,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Get the read length vs. base modification rate data for each modification type read_mod_data_size = output_data.getReadModDataSize() - logging.info("[TEST] read_mod_data_size: {}".format(read_mod_data_size)) read_length_mod_rates = {} for i in range(read_mod_data_size): for mod_type in base_mod_types: @@ -1077,13 +1106,41 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Create the base modification statistics table table_str = "<table>\n<tbody>" - table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count) - table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold) - table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count) - table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward) - table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse) - table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count) - table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count) + row_str, row_flag = format_row("Total Predictions", [output_data.modified_prediction_count], 'int', 
None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Probability Threshold", [base_modification_threshold], 'float', 0) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total Modified Bases in the Sample", [output_data.sample_modified_base_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total in the Forward Strand", [output_data.sample_modified_base_count_forward], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total in the Reverse Strand", [output_data.sample_modified_base_count_reverse], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + # table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count) + # table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold) + # table_str += "<tr><td>Total Modified Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count) + # table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward) + # table_str += "<tr><td>Total in the Reverse Strand</td><td 
style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse) + # table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count) + # table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count) # Add the modification type data for mod_type in base_mod_types: @@ -1097,13 +1154,26 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th mod_count = output_data.getModTypeCount(mod_type) mod_count_fwd = output_data.getModTypeCount(mod_type, 0) mod_count_rev = output_data.getModTypeCount(mod_type, 1) - table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count) - table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd) - table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev) + + row_str, row_flag = format_row("Total {} Sites in the Sample".format(mod_name), [mod_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total {} Sites in the Sample (Forward Strand)".format(mod_name), [mod_count_fwd], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + + row_str, row_flag = format_row("Total {} Sites in the Sample (Reverse Strand)".format(mod_name), [mod_count_rev], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag + # table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count) + # table_str += "<tr><td>Total {} Sites 
in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd) + # table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev) # Finish the table table_str += "\n</tbody>\n</table>" plot_filepaths["base_mods"]['detail'] = table_str + plot_filepaths["base_mods"]['error_flag'] = table_error_flag def create_tin_table(output_data, input_files, plot_filepaths): """Create a summary table for the RNA-Seq TIN values.""" @@ -1118,12 +1188,15 @@ def create_tin_table(output_data, input_files, plot_filepaths): table_str += "\n<tbody>" # Loop through each BAM file + error_flag = False for bam_file in input_files: # Format the filepath as filename only bam_filename = os.path.basename(bam_file) # Get the file data tin_count = output_data.getTINCount(bam_file) + error_flag = error_flag or tin_count == 0 + tin_mean = output_data.getTINMean(bam_file) tin_median = output_data.getTINMedian(bam_file) tin_std = output_data.getTINStdDev(bam_file) @@ -1135,6 +1208,8 @@ def create_tin_table(output_data, input_files, plot_filepaths): # Add the table to the plot filepaths plot_filepaths["tin"]['detail'] = table_str + plot_filepaths["tin"]['error_flag'] = error_flag + def create_pod5_table(output_dict, plot_filepaths): """Create a summary table for the ONT POD5 signal data.""" @@ -1143,26 +1218,41 @@ def create_pod5_table(output_dict, plot_filepaths): plot_filepaths["basic_st"]['title'] = "Summary Table" file_type_label = "POD5" plot_filepaths["basic_st"]['description'] = f"{file_type_label} Basic Statistics" + table_error_flag = False # Get values - read_count = len(output_dict.keys()) # Set up the HTML table table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>" table_str += "\n<tbody>" - int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>" - table_str += 
int_str_for_format.format("#Total Reads", read_count) + # int_str_for_format = "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td></tr>" + # table_str += int_str_for_format.format("Total Reads", read_count) + read_count = len(output_dict.keys()) + row_str, row_flag = format_row("Total Reads", [read_count], 'int', None) + table_str += row_str + table_error_flag = table_error_flag or row_flag table_str += "\n</tbody>\n</table>" plot_filepaths["basic_st"]['detail'] = table_str + plot_filepaths["basic_st"]['error_flag'] = table_error_flag -def plot_alignment_numbers(data): +def plot_alignment_numbers(data, plot_filepaths): category = ['Primary Alignments', 'Supplementary Alignments', 'Secondary Alignments', 'Reads with Supplementary Alignments', 'Reads with Secondary Alignments', 'Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments'] category = [wrap(x) for x in category] + # Identify null values + error_flag = False + for value in [data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, + data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, + data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment, + data.reverse_alignment]: + if value == 0: + error_flag = True + break + # Create a horizontally aligned bar plot trace from the data using plotly trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, @@ -1179,13 +1269,18 @@ def plot_alignment_numbers(data): fig = go.Figure(data=[trace], layout=layout) # Generate the HTML object for the plot - html_obj = fig.to_html(full_html=False, default_height=500, default_width=1000) + # html_obj = fig.to_html(full_html=False, default_height=500, + # default_width=1000) - return html_obj + # return html_obj, error_flag + # Update the HTML data for the 
plot + plot_filepaths['read_alignments_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1000) + plot_filepaths['read_alignments_bar']['error_flag'] = error_flag -# Plot base alignment statistics -def plot_errors(output_data): + +def plot_errors(output_data, plot_filepaths): + """Plot the error statistics for the alignment data.""" category = \ ['Matched Bases', 'Mismatched Bases', 'Inserted Bases', 'Deleted Bases', 'Clipped Bases\n(Primary Alignments)'] category = [wrap(x) for x in category] @@ -1204,7 +1299,14 @@ def plot_errors(output_data): fig = go.Figure(data=[trace], layout=layout) # Generate the HTML object for the plot - html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + # html_obj = fig.to_html(full_html=False, default_height=500, + # default_width=700) + plot_filepaths['base_alignments_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) - return html_obj + # Set the error flag if mismatch or clipped bases > matched bases + error_flag = output_data.num_mismatched_bases > output_data.num_matched_bases or \ + output_data.num_clip_bases > output_data.num_matched_bases + plot_filepaths['base_alignments_bar']['error_flag'] = error_flag + + # return html_obj From 8b2a913d233a9273e17fe554e3b040cafebccee8 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Mon, 6 Jan 2025 15:45:49 -0500 Subject: [PATCH 10/25] Work on flags --- src/plot_utils.py | 196 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 145 insertions(+), 51 deletions(-) diff --git a/src/plot_utils.py b/src/plot_utils.py index db96017..c6dd718 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -72,11 +72,12 @@ def wrap(label): return new_label # Plot the read alignment numbers -def plot_read_length_stats(output_data, file_type): +def plot_read_length_stats(output_data, file_type, plot_filepaths): # Define the three categories category = ['N50', 'Mean', 'Median'] 
all_traces = [] + error_flag = False if file_type == 'BAM': # Create a bar trace for each type of read length statistic @@ -89,6 +90,10 @@ def plot_read_length_stats(output_data, file_type): trace = go.Bar(x=category, y=values, name=plot_title) all_traces.append(trace) + # Set the error flag if any of the values are zero (except for unmapped reads) + if i != 2 and (values[0] == 0 or values[1] == 0 or values[2] == 0): + error_flag = True + elif file_type == 'SeqTxt': # Create a bar trace for each type of read length statistic bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] @@ -100,6 +105,10 @@ def plot_read_length_stats(output_data, file_type): trace = go.Bar(x=category, y=values, name=plot_title) all_traces.append(trace) + # Set the error flag if any of the values are zero (except for failed reads) + if i != 2 and (values[0] == 0 or values[1] == 0 or values[2] == 0): + error_flag = True + else: # Get the data for all reads key_list = ['n50_read_length', 'mean_read_length', 'median_read_length'] @@ -111,6 +120,11 @@ def plot_read_length_stats(output_data, file_type): trace = go.Bar(x=category, y=values, name=bar_title) all_traces.append(trace) + # Set the error flag if any of the values are zero + if values[0] == 0 or values[1] == 0 or values[2] == 0: + error_flag = True + + # Create the layout layout = go.Layout(title='', xaxis=dict(title='Statistics'), yaxis=dict(title='Length (bp)'), barmode='group', font=dict(size=PLOT_FONT_SIZE)) @@ -118,16 +132,19 @@ def plot_read_length_stats(output_data, file_type): fig = go.Figure(data=all_traces, layout=layout) # Generate the HTML - html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths['read_length_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) + + # Set the error flag + plot_filepaths['read_length_bar']['error_flag'] = error_flag - return html_obj # 
Plot the base counts -def plot_base_counts(output_data, filetype): - # Define the five categories - category = ['A', 'C', 'G', 'T/U', 'N'] +def plot_base_counts(output_data, filetype, plot_filepaths): - # Create a bar trace for each type of data + # Create a bar trace for each base + error_flag = False + category = ['A', 'C', 'G', 'T/U', 'N'] all_traces = [] if filetype == 'BAM': bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads'] @@ -139,6 +156,15 @@ def plot_base_counts(output_data, filetype): trace = go.Bar(x=category, y=values, name=plot_title) all_traces.append(trace) + # Set the error flag if the N count is greater than 10% or the A, C, + # G, or T/U counts are zero + if data.total_num_bases == 0: + error_flag = True + elif data.total_n_cnt / data.total_num_bases > 0.1: + error_flag = True + elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0: + error_flag = True + elif filetype == 'SeqTxt': bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info] @@ -149,6 +175,15 @@ def plot_base_counts(output_data, filetype): trace = go.Bar(x=category, y=values, name=plot_title) all_traces.append(trace) + # Set the error flag if the N count is greater than 10% or the A, C, + # G, or T/U counts are zero + if data.total_num_bases == 0: + error_flag = True + elif data.total_n_cnt / data.total_num_bases > 0.1: + error_flag = True + elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0: + error_flag = True + else: plot_title = 'All Reads' data = output_data.long_read_info @@ -156,19 +191,30 @@ def plot_base_counts(output_data, filetype): trace = go.Bar(x=category, y=values, name=plot_title) all_traces.append(trace) - # Create the layout - layout = go.Layout(title='', xaxis=dict(title='Base'), 
yaxis=dict(title='Counts'), barmode='group', font=dict(size=PLOT_FONT_SIZE)) + # Set the error flag if the N count is greater than 10% or the A, C, + # G, or T/U counts are zero + if data.total_num_bases == 0: + error_flag = True + elif data.total_n_cnt / data.total_num_bases > 0.1: + error_flag = True + elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0: + error_flag = True # Create the figure and add the traces + layout = go.Layout(title='', xaxis=dict(title='Base'), yaxis=dict(title='Counts'), barmode='group', font=dict(size=PLOT_FONT_SIZE)) fig = go.Figure(data=all_traces, layout=layout) # Generate the HTML - html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) - return html_obj + # return html_obj + + # Generate the HTML + plot_filepaths['base_counts']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths['base_counts']['error_flag'] = error_flag # Plot basic information about the reads in bar chart format -def plot_basic_info(output_data, file_type): +def plot_basic_info(output_data, file_type, plot_filepaths): html_obj = '' if file_type == 'BAM': @@ -181,6 +227,7 @@ def plot_basic_info(output_data, file_type): # Add traces for each category key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length', 'gc_cnt'] + error_flag = False for i in range(4): # Get the data for this category key_name = key_list[i] @@ -188,6 +235,10 @@ def plot_basic_info(output_data, file_type): # Add the traces for each type of data data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)] + # Set the error flag if any of the values are zero + if data[0] == 0 or data[1] == 0 or data[2] == 0: + error_flag = True + # Create the trace trace = go.Bar(x=data, y=bar_titles, orientation='h') @@ -199,7 +250,11 @@ def 
plot_basic_info(output_data, file_type): fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE)) # Generate the HTML - html_obj = fig.to_html(full_html=False, default_height=800, default_width=1200) + # html_obj = fig.to_html(full_html=False, default_height=800, + # default_width=1200) + plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=800, default_width=1200) + plot_filepaths['basic_info']['error_flag'] = error_flag + elif file_type == 'SeqTxt': @@ -212,6 +267,7 @@ def plot_basic_info(output_data, file_type): # Add traces for each category key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length'] + error_flag = False for i in range(3): # Get the data for this category key_name = key_list[i] @@ -219,6 +275,10 @@ def plot_basic_info(output_data, file_type): # Add the traces for each type of data data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)] + # Set the error flag if any of the values are zero + if data[0] == 0 or data[1] == 0 or data[2] == 0: + error_flag = True + # Create the trace trace = go.Bar(x=data, y=bar_titles, orientation='h') @@ -229,13 +289,16 @@ def plot_basic_info(output_data, file_type): fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE)) # Generate the HTML - html_obj = fig.to_html(full_html=False, default_height=500, default_width=1600) + # html_obj = fig.to_html(full_html=False, default_height=500, + # default_width=1600) + plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1600) + plot_filepaths['basic_info']['error_flag'] = error_flag return html_obj # Plot the read length histograms -def read_lengths_histogram(data, font_size): +def read_lengths_histogram(data, font_size, plot_filepaths): linear_bin_count = 10 log_bin_count = 10 @@ -262,7 +325,6 @@ def read_lengths_histogram(data, font_size): log_col=2 linear_bindata = np.dstack((edges[:-1], 
edges[1:], hist))[0, :, :] - # linear_bin_centers = np.round((linear_bindata[:, 0] + linear_bindata[:, 1]) / 2, 0) fig.add_trace(go.Bar(x=edges, y=hist, customdata=linear_bindata, hovertemplate='Length: %{customdata[0]:.0f}-%{customdata[1]:.0f}bp<br>Counts:%{customdata[2]:.0f}<extra></extra>', marker_color='#36a5c7'), row=1, col=linear_col) @@ -274,8 +336,7 @@ def read_lengths_histogram(data, font_size): fig.add_vline(n50, line_width=1, line_dash="dash", annotation_text='N50', annotation_bgcolor="green", annotation_textangle=90, row=1, col=linear_col) - # Log histogram - # Get the log10 histogram of read lengths + # Log scale histogram read_lengths_log = np.log10(read_lengths, out=np.zeros_like(read_lengths), where=(read_lengths != 0)) log_edges = np.linspace(0, np.max(read_lengths_log), num=log_bin_count + 1) log_hist, _ = np.histogram(read_lengths_log, bins=log_edges) @@ -333,18 +394,26 @@ def read_lengths_histogram(data, font_size): # Update the layout fig.update_layout(showlegend=False, autosize=True, font=dict(size=PLOT_FONT_SIZE)) - fig.update_annotations(font_size=annotation_size) - html_obj = fig.to_html(full_html=False, default_height=500, default_width=1200) + + # Generate the HTML + # html_obj = fig.to_html(full_html=False, default_height=500, + # default_width=1200) + plot_filepaths['read_length_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1200) - return html_obj -def read_gc_content_histogram(data, font_size): +def read_gc_content_histogram(data, font_size, plot_filepaths): """Plot the per-read GC content histogram.""" bin_size = 1 + gc_content = np.array(data.read_gc_content_count) + + # Set the error flag if the GC content is below 20% for more than 10% of the + # reads + error_flag = False + if np.sum(gc_content[:20]) / np.sum(gc_content) > 0.1: + error_flag = True # Bin the GC content if the bin size is greater than 1 - gc_content = np.array(data.read_gc_content_count) if bin_size > 1: gc_content = 
np.array([np.sum(gc_content[i:i + bin_size]) for i in range(0, 101, bin_size)]) @@ -374,7 +443,9 @@ def read_gc_content_histogram(data, font_size): fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0) fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Set font size - return fig.to_html(full_html=False, default_height=500, default_width=700) + # return fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths['gc_content_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths['gc_content_hist']['error_flag'] = error_flag def base_quality(data, font_size, plot_filepaths): @@ -456,22 +527,18 @@ def plot_base_modifications(base_modifications): return html_obj -# Main plot function def plot(output_data, para_dict, file_type): + """Generate the plots for the output data.""" plot_filepaths = getDefaultPlotFilenames() - - # Get the font size for plotly plots - font_size = 14 - - # Create the summary table - create_summary_table(output_data, plot_filepaths, file_type) + font_size = 14 # Font size for the plots + create_summary_table(output_data, plot_filepaths, file_type) # Create the summary table # Modified base table and plots if file_type == 'BAM' and para_dict["mod"] > 0: # Output file for the read length vs. modification rates plot output_folder = para_dict["output_folder"] - read_length_hist_file = os.path.join(output_folder, 'read_length_hist.png') - plot_filepaths['read_length_mod_rates']['file'] = read_length_hist_file + read_length_mod_rate_file = os.path.join(output_folder, 'read_length_hist.png') + plot_filepaths['read_length_mod_rates']['file'] = read_length_mod_rate_file # Generate the modified base table and read length vs. 
modification rates plot base_modification_threshold = para_dict["modprob"] @@ -491,8 +558,14 @@ def plot(output_data, para_dict, file_type): logging.warning("WARNING: TIN table not created") # Generate plots - plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data, file_type) - plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data, file_type) + # plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data, + # file_type) + plot_base_counts(output_data, file_type, plot_filepaths) + + # Plot basic information + # plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data, + # file_type) + plot_basic_info(output_data, file_type, plot_filepaths) # Read length histogram if file_type == 'SeqTxt': @@ -501,18 +574,30 @@ def plot(output_data, para_dict, file_type): long_read_data = output_data.long_read_info if file_type != 'FAST5s': - plot_filepaths['read_length_hist']['dynamic'] = read_lengths_histogram(long_read_data, font_size) + # plot_filepaths['read_length_hist']['dynamic'] = + # read_lengths_histogram(long_read_data, font_size) + read_lengths_histogram(long_read_data, font_size, plot_filepaths) - plot_filepaths['read_length_bar']['dynamic'] = plot_read_length_stats(output_data, file_type) + # plot_filepaths['read_length_bar']['dynamic'] = + # plot_read_length_stats(output_data, file_type) + plot_read_length_stats(output_data, file_type, plot_filepaths) # GC content histogram if file_type != 'FAST5s' and file_type != 'SeqTxt': if file_type == 'BAM': - plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.mapped_long_read_info, font_size) + # plot_filepaths['gc_content_hist']['dynamic'] = + # read_gc_content_histogram(output_data.mapped_long_read_info, + # font_size) + read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths) elif file_type == 'SeqTxt': - plot_filepaths['gc_content_hist']['dynamic'] = 
read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size) + # plot_filepaths['gc_content_hist']['dynamic'] = + # read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, + # font_size) + read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths) else: - plot_filepaths['gc_content_hist']['dynamic'] = read_gc_content_histogram(output_data.long_read_info, font_size) + # plot_filepaths['gc_content_hist']['dynamic'] = + # read_gc_content_histogram(output_data.long_read_info, font_size) + read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths) # Quality plots if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt': @@ -681,6 +766,8 @@ def plot_signal(output_data, para_dict): # Get read and base counts read_count = output_data.getReadCount() + if read_count == 0: + raise ValueError("No reads found in the dataset") # Randomly sample a small set of reads if it is a large dataset read_sample_size = min(read_count_max, read_count) @@ -1033,7 +1120,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th for mod_type in base_mod_types: logging.info(mod_type) - # Get the read length vs. base modification rate data for each modification type + logging.info("Getting base modification statistics") + + # Get the read length vs. base modification rate data for each + # modification type + logging.info("Getting mod data size") read_mod_data_size = output_data.getReadModDataSize() read_length_mod_rates = {} for i in range(read_mod_data_size): @@ -1041,8 +1132,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th if mod_type not in read_length_mod_rates: read_length_mod_rates[mod_type] = [] + logging.info("Getting read length for read {}".format(i)) read_length = output_data.getNthReadModLength(i) + logging.info("Getting read length vs. 
{} modification rate".format(mod_type)) mod_rate = output_data.getNthReadModRate(i, mod_type) + logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate)) read_length_mod_rates[mod_type].append((read_length, mod_rate)) # Dictionary of modification character to full name @@ -1092,12 +1186,12 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]), font=dict(size=PLOT_FONT_SIZE)) - logging.info("Plotting read length vs. {} modification rate".format(mod_name)) - # Save the plot image - fig_file = plot_filepaths["read_length_mod_rates"]['file'] - fig.write_image(fig_file) - + if len(base_mod_types) > 0: + fig_file = plot_filepaths["read_length_mod_rates"]['file'] + logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file)) + fig.write_image(fig_file, format='png', width=700, height=500) + # Generate the HTML # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj @@ -1195,14 +1289,16 @@ def create_tin_table(output_data, input_files, plot_filepaths): # Get the file data tin_count = output_data.getTINCount(bam_file) - error_flag = error_flag or tin_count == 0 - tin_mean = output_data.getTINMean(bam_file) tin_median = output_data.getTINMedian(bam_file) tin_std = output_data.getTINStdDev(bam_file) # Add the data to the table - table_str += "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>".format(bam_filename, tin_count, tin_mean, tin_median, tin_std) + row_str, row_flag = format_row(bam_filename, [tin_count, tin_mean, tin_median, tin_std], 'float', None) + table_str += row_str + error_flag = error_flag or row_flag + + # table_str += "<tr><td>{}</td><td 
style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>".format(bam_filename, tin_count, tin_mean, tin_median, tin_std) table_str += "\n</tbody>\n</table>" @@ -1220,8 +1316,6 @@ def create_pod5_table(output_dict, plot_filepaths): plot_filepaths["basic_st"]['description'] = f"{file_type_label} Basic Statistics" table_error_flag = False - # Get values - # Set up the HTML table table_str = "<table>\n<thead>\n<tr><th>Measurement</th><th>Statistics</th></tr>\n</thead>" table_str += "\n<tbody>" From 1456cd7e7cc752d61b16d5224dd5bcc2e2d25e07 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Mon, 6 Jan 2025 18:34:19 -0500 Subject: [PATCH 11/25] subsample read length vs mod rate plot --- src/cli.py | 8 +-- src/generate_html.py | 8 ++- src/plot_utils.py | 127 +++++++++++++++++++++++++++---------------- 3 files changed, 90 insertions(+), 53 deletions(-) diff --git a/src/cli.py b/src/cli.py index 85951a6..cbe2a7d 100644 --- a/src/cli.py +++ b/src/cli.py @@ -246,7 +246,7 @@ def bam_module(margs): plot_filepaths = plot(bam_output, param_dict, 'BAM') # Set the list of QC information to display - qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", "read_avg_base_quality"] + qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"] # If base modifications were found, add the base modification plots # after the first table @@ -314,7 +314,7 @@ def rrms_module(margs): # Generate the HTML report bam_html_gen = generate_html.ST_HTML_Generator( [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", - "base_quality", 
"read_avg_base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) + "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) bam_html_gen.generate_html() logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -386,8 +386,8 @@ def fast5_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fast5_output, param_dict, 'FAST5') fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality", - "read_avg_base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False) + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"], + "FAST5 QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html() logging.info("Done. Output files are in %s", param_dict["output_folder"]) diff --git a/src/generate_html.py b/src/generate_html.py index 64f1641..4cc3cf0 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -304,10 +304,12 @@ def generate_right(self): key_index += 1 self.html_writer.write('<div class="module">') - self.html_writer.write('<h2 id="lrst' + str(key_index) + '">File count = ' + str( + self.html_writer.write('<h2 id="lrst' + str(key_index) + '">File Count = ' + str( len(self.input_para["input_files"])) + '</h2><p>') - for _af in self.input_para["input_files"]: - self.html_writer.write("<br/>" + _af) + # for _af in self.input_para["input_files"]: + # self.html_writer.write("<br/>" + _af) + # Write the input files in format "1.\tfile1\n2.\tfile2\n..." 
+ self.html_writer.write("<br/>" + "<br/>".join([f"{i+1}.\t{af}" for i, af in enumerate(self.input_para["input_files"])])) self.html_writer.write('</p></div>') key_index += 1 diff --git a/src/plot_utils.py b/src/plot_utils.py index c6dd718..2f25c7d 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -139,8 +139,8 @@ def plot_read_length_stats(output_data, file_type, plot_filepaths): plot_filepaths['read_length_bar']['error_flag'] = error_flag -# Plot the base counts def plot_base_counts(output_data, filetype, plot_filepaths): + """Plot overall base counts for the reads.""" # Create a bar trace for each base error_flag = False @@ -157,13 +157,14 @@ def plot_base_counts(output_data, filetype, plot_filepaths): all_traces.append(trace) # Set the error flag if the N count is greater than 10% or the A, C, - # G, or T/U counts are zero - if data.total_num_bases == 0: - error_flag = True - elif data.total_n_cnt / data.total_num_bases > 0.1: - error_flag = True - elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0: - error_flag = True + # G, or T/U counts are zero (except for unmapped reads) + if i != 2: + if data.total_num_bases == 0: + error_flag = True + elif data.total_n_cnt / data.total_num_bases > 0.1: + error_flag = True + elif data.total_a_cnt == 0 or data.total_c_cnt == 0 or data.total_g_cnt == 0 or data.total_tu_cnt == 0: + error_flag = True elif filetype == 'SeqTxt': bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] @@ -213,8 +214,8 @@ def plot_base_counts(output_data, filetype, plot_filepaths): plot_filepaths['base_counts']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) plot_filepaths['base_counts']['error_flag'] = error_flag -# Plot basic information about the reads in bar chart format def plot_basic_info(output_data, file_type, plot_filepaths): + """Plot basic information about the reads in bar chart format.""" html_obj = '' if file_type == 'BAM': @@ -235,8 
+236,9 @@ def plot_basic_info(output_data, file_type, plot_filepaths): # Add the traces for each type of data data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)] - # Set the error flag if any of the values are zero - if data[0] == 0 or data[1] == 0 or data[2] == 0: + # Set the error flag if any of the values are zero (except for unmapped reads) + # if data[0] == 0 or data[1] == 0 or data[2] == 0: + if data[0] == 0 or data[1] == 0: error_flag = True # Create the trace @@ -410,7 +412,9 @@ def read_gc_content_histogram(data, font_size, plot_filepaths): # Set the error flag if the GC content is below 20% for more than 10% of the # reads error_flag = False - if np.sum(gc_content[:20]) / np.sum(gc_content) > 0.1: + if np.sum(gc_content) == 0: + error_flag = True + elif np.sum(gc_content[:20]) / np.sum(gc_content) > 0.1: error_flag = True # Bin the GC content if the bin size is greater than 1 @@ -470,7 +474,9 @@ def base_quality(data, font_size, plot_filepaths): # Set the error flag if the base quality is below 20 for more than 10% of # the bases error_flag = False - if np.sum(yd[:20]) / np.sum(yd) > 0.1: + if np.sum(yd) == 0: + error_flag = True + elif np.sum(yd[:20]) / np.sum(yd) > 0.1: error_flag = True plot_filepaths['base_quality']['error_flag'] = error_flag @@ -493,7 +499,9 @@ def read_avg_base_quality(data, font_size, plot_filepaths): # Set the error flag if the average base quality is below 20 for more than # 10% of the reads error_flag = False - if np.sum(yd[:20]) / np.sum(yd) > 0.1: + if np.sum(yd) == 0: + error_flag = True + elif np.sum(yd[:20]) / np.sum(yd) > 0.1: error_flag = True plot_filepaths['read_avg_base_quality']['error_flag'] = error_flag @@ -599,15 +607,17 @@ def plot(output_data, para_dict, file_type): # read_gc_content_histogram(output_data.long_read_info, font_size) read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths) - # Quality plots + # Base quality 
histogram if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt': seq_quality_info = output_data.seq_quality_info # Base quality histogram base_quality(seq_quality_info, font_size, plot_filepaths) - # plot_filepaths['base_quality']['dynamic'] = base_quality(seq_quality_info, font_size) - - # Read quality histogram + # plot_filepaths['base_quality']['dynamic'] = + # base_quality(seq_quality_info, font_size) + + # Read average base quality histogram + if file_type == 'FASTQ': # read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size) # plot_filepaths['read_avg_base_quality']['dynamic'] = # read_quality_dynamic @@ -1126,17 +1136,31 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # modification type logging.info("Getting mod data size") read_mod_data_size = output_data.getReadModDataSize() + logging.info("Mod data size: {}".format(read_mod_data_size)) + + # Choose a maximum of 10,000 reads to randomly sample for the plot + max_reads = min(read_mod_data_size, 10000) + # read_indices = set(sample(range(read_mod_data_size), max_reads)) + read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False) read_length_mod_rates = {} - for i in range(read_mod_data_size): + + # Get the read length vs. base modification rate data for each + # modification type in the sampled reads + # for i in range(read_mod_data_size): + # if i not in read_indices: + # continue + for i in read_indices: for mod_type in base_mod_types: if mod_type not in read_length_mod_rates: read_length_mod_rates[mod_type] = [] - logging.info("Getting read length for read {}".format(i)) - read_length = output_data.getNthReadModLength(i) - logging.info("Getting read length vs. 
{} modification rate".format(mod_type)) - mod_rate = output_data.getNthReadModRate(i, mod_type) - logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate)) + # logging.info("Getting read length for read {}".format(i)) + # read_length = output_data.getNthReadModLength(i) + read_length = output_data.getNthReadModLength(int(i)) + # logging.info("Getting read length vs. {} modification rate".format(mod_type)) + # mod_rate = output_data.getNthReadModRate(i, mod_type) + mod_rate = output_data.getNthReadModRate(int(i), mod_type) + # logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate)) read_length_mod_rates[mod_type].append((read_length, mod_rate)) # Dictionary of modification character to full name @@ -1147,7 +1171,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th 'N': 'Amb. N', \ 'v': 'pseU'} - # Create a plot of read length vs. base modification rate for each # modification type # Make subplots vertically for each modification type @@ -1159,13 +1182,19 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Format the data mod_data = read_length_mod_rates[mod_type] - x_vals = [data[0] for data in mod_data] - read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals] mod_rates = [data[1] * 100 for data in mod_data] + x_vals = [data[0] for data in mod_data] + + # Generate evenly-spaced x values and labels (10 ticks across the + # range) with the read lengths being a multiple of 1000 + x_tick_values = np.linspace(0, max(x_vals), num=10) + read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values] + + # read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val 
/ 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals] # Update the min and max x values - min_x = min(min_x, min(x_vals)) - max_x = max(max_x, max(x_vals)) + # min_x = min(min_x, *x_vals) + # max_x = max(max_x, *x_vals) # Get the modification name try: @@ -1177,23 +1206,36 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name), row=i + 1, col=1) # Update the layout - max_x_range = min(max_x, 10000) # To improve the plot performance fig.update_layout(title='Read Length vs. {} Modification Rate'.format(mod_name), xaxis_title='Read Length', yaxis_title='Modification Rate (%)', showlegend=False, yaxis=dict(range=[0, 100]), - xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]), + xaxis=dict(tickvals=x_tick_values, ticktext=read_lengths, range=[0, max(x_vals)]), font=dict(size=PLOT_FONT_SIZE)) + # Get the X tick values generated by Plotly and format the read lengths + # x_tick_values = fig.layout.x + # if x_tick_values: + # read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values] + + # # Update the X tick labels + # fig.update_xaxes(tickvals=x_tick_values, ticktext=read_lengths, row=i + 1, col=1) + + # xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]), + # Save the plot image - if len(base_mod_types) > 0: - fig_file = plot_filepaths["read_length_mod_rates"]['file'] - logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file)) - fig.write_image(fig_file, format='png', width=700, height=500) + # if len(base_mod_types) > 0: + # fig_file = plot_filepaths["read_length_mod_rates"]['file'] + # logging.info("Saving the read length vs. 
modification rates plot to: {}".format(fig_file)) + # fig.write_image(fig_file, format='png', width=700, height=500) # Generate the HTML - # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) + # html_obj = fig.to_html(full_html=False, default_height=500, + # default_width=700) + if len(base_mod_types) > 0: + logging.info("Saving the read length vs. modification rates plot") + plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj else: logging.warning("WARNING: No modification types found") @@ -1337,16 +1379,9 @@ def plot_alignment_numbers(data, plot_filepaths): 'Reads with Secondary and Supplementary Alignments', 'Forward Alignments', 'Reverse Alignments'] category = [wrap(x) for x in category] - # Identify null values - error_flag = False - for value in [data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, - data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, - data.num_reads_with_both_secondary_supplementary_alignment, data.forward_alignment, - data.reverse_alignment]: - if value == 0: - error_flag = True - break - + # Set the error flag if primary alignments equal 0 + error_flag = data.num_primary_alignment == 0 + # Create a horizontally aligned bar plot trace from the data using plotly trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, From 330ba933b4fcb3604a045aaa828e7cf6a13e6834 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sun, 12 Jan 2025 15:43:46 -0500 Subject: [PATCH 12/25] Update base mod rate and remove basic statistics bar graphs --- include/output_data.h | 4 +- src/bam_module.cpp | 10 +- src/cli.py | 39 +++++-- src/hts_reader.cpp | 114 
+++++++++++++++---- src/output_data.cpp | 34 ++++-- src/plot_utils.py | 252 ++++++++++++------------------------------ 6 files changed, 228 insertions(+), 225 deletions(-) diff --git a/include/output_data.h b/include/output_data.h index 06608d6..fa66cdb 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -161,7 +161,6 @@ class Base_Move_Table struct ReadModData { int read_length; - double mod_rate; std::unordered_map<char, double> base_mod_rates; // Type-specific base modification rates }; @@ -229,7 +228,6 @@ class Output_BAM : public Output_FQ std::vector<char> getBaseModTypes(); // Get the types of base modifications found int getReadModDataSize(); // Get the number of read length vs. base modification rate data points int getNthReadModLength(int read_index); // Get the read length for the nth read - double getNthReadModRate(int read_index); // Get the base modification rate for the nth read double getNthReadModRate(int read_index, char mod_type); // Get the base modification rate for the nth read for a specific base modification type uint64_t getModTypeCount(char mod_type); // Get the count of a specific base modification type uint64_t getModTypeCount(char mod_type, int strand); // Get the count of a specific base modification type for a specific strand @@ -243,7 +241,7 @@ class Output_BAM : public Output_FQ int getReadSequenceEnd(std::string read_id); void updateBaseModCounts(char mod_type, int strand); // Update base modification counts for predictions exceeding the threshold - void updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates); // Update read length vs. base modification rate data + void updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates); // Update read length vs. 
base modification rate data // Add TIN data for a single BAM file void addTINData(std::string &bam_file, TINStats &tin_data); diff --git a/src/bam_module.cpp b/src/bam_module.cpp index 2d47cb4..058f831 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -146,6 +146,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ } // Calculate statistics in batches + printMemoryUsage("Before batch processing"); while (reader.hasNextRecord()){ std::cout << "Generating " << thread_count << " thread(s)..." << std::endl; std::vector<std::thread> thread_vector; @@ -169,6 +170,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ if (t.joinable()){ t.join(); } + printMemoryUsage("After thread " + std::to_string(thread_index)); thread_index++; } std::cout << "All threads joined." << std::endl; @@ -245,6 +247,7 @@ void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unorder // Update the final output std::lock_guard<std::mutex> lock(output_mutex); final_output.add(record_output); + printMemoryUsage("After record processing"); } std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_file, bool accepted_reads) @@ -262,7 +265,10 @@ std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_fi std::stringstream ss(header); std::string field; // std::cout << "RRMS CSV header:" << std::endl; - while (std::getline(ss, field, ',')){ + + // Split the header fields + char delimiter = ','; + while (std::getline(ss, field, delimiter)){ header_fields.push_back(field); // std::cout << field << std::endl; } @@ -297,7 +303,7 @@ std::unordered_set<std::string> BAM_Module::readRRMSFile(std::string rrms_csv_fi std::vector<std::string> fields; std::string field; std::stringstream ss(line); - while (std::getline(ss, field, ',')){ + while (std::getline(ss, field, delimiter)){ fields.push_back(field); } diff --git a/src/cli.py b/src/cli.py index cbe2a7d..ebf10d6 100644 --- 
a/src/cli.py +++ b/src/cli.py @@ -222,9 +222,16 @@ def bam_module(margs): param_dict["ref"] = input_para.ref_genome = ref_genome # Set the base modification flag, and filtering threshold - param_dict["mod"] = input_para.mod_analysis = margs.mod + # param_dict["mod"] = input_para.mod_analysis = margs.mod + if margs.mod: + param_dict["mod"] = input_para.mod_analysis = True + else: + param_dict["mod"] = input_para.mod_analysis = False + mod_prob = margs.modprob - param_dict["modprob"] = input_para.base_mod_threshold = mod_prob + param_dict["modprob"] = mod_prob + input_para.base_mod_threshold = mod_prob + logging.info("Base modification threshold is set to " + str(input_para.base_mod_threshold)) # Set the gene BED file for RNA-seq transcript analysis input_para.gene_bed = margs.genebed if margs.genebed != "" or margs.genebed is not None else "" @@ -246,7 +253,7 @@ def bam_module(margs): plot_filepaths = plot(bam_output, param_dict, 'BAM') # Set the list of QC information to display - qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"] + qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"] # If base modifications were found, add the base modification plots # after the first table @@ -301,6 +308,7 @@ def rrms_module(margs): # Set the output prefix param_dict["out_prefix"] = output_prefix + "rrms_" + ("accepted" if filter_type else "rejected") + param_dict["mod"] = input_para.mod_analysis = False # Disable base modification analysis for RRMS (use BAM module for this) # Run the QC module logging.info("Running QC for " + ("accepted" if filter_type else "rejected") + " reads...") @@ -311,10 +319,19 @@ def rrms_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(bam_output, param_dict, 'BAM') + # Set the list of QC 
information to display + qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"] + + # If base modifications were found, add the base modification + # plots + if bam_output.sample_modified_base_count > 0: + logging.info("Base modifications found. Adding base modification plots to the HTML report.") + qc_info_list.insert(1, "read_length_mod_rates") + qc_info_list.insert(1, "base_mods") + # Generate the HTML report bam_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", - "base_quality"], "BAM QC", param_dict], plot_filepaths, static=False) + [qc_info_list, "BAM QC", param_dict], plot_filepaths, static=False) bam_html_gen.generate_html() logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -350,7 +367,7 @@ def seqtxt_module(margs): report_title = "Basecall Summary QC" seqtxt_html_gen = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "basic_info"], + [["basic_st", "read_length_bar", "read_length_hist"], report_title, param_dict], plot_filepaths, static=False) seqtxt_html_gen.generate_html() @@ -386,7 +403,7 @@ def fast5_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fast5_output, param_dict, 'FAST5') fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "base_quality"], + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"], "FAST5 QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html() logging.info("Done. 
Output files are in %s", param_dict["output_folder"]) @@ -432,7 +449,7 @@ def fast5_signal_module(margs): logging.info("Generating HTML report...") plot_filepaths = plot(fast5_output, param_dict, 'FAST5s') fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], "FAST5 QC", param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html(signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -500,7 +517,7 @@ def pod5_module(margs): # plot_filepaths = plot(read_signal_dict, param_dict, 'POD5') webpage_title = "POD5 QC" fast5_html_obj = generate_html.ST_HTML_Generator( - [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "basic_info", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False) + [["basic_st", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "ont_signal"], webpage_title, param_dict], plot_filepaths, static=False) fast5_html_obj.generate_html(signal_plots=True) logging.info("Done. Output files are in %s", param_dict["output_folder"]) @@ -637,8 +654,8 @@ def set_file_parser_defaults(file_parser): bam_parser.add_argument("--genebed", type=str, default="", help="Gene BED12 file required for calculating TIN scores from RNA-seq BAM files. Default: None.") -bam_parser.add_argument("--modprob", type=float, default=0.8, - help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. Default: 0.8.") +bam_parser.add_argument("--modprob", type=float, default=0.5, + help="Base modification filtering threshold. Above/below this value, the base is considered modified/unmodified. 
Default: 0.5.") bam_parser.add_argument("--ref", type=str, default="", help="The reference genome FASTA file to use for identifying CpG sites.") diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 8762f45..585254d 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -104,10 +104,10 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu bool read_ids_present = false; if (read_ids.size() > 0){ read_ids_present = true; - printMessage("Filtering reads by read ID"); + // printMessage("Filtering reads by read ID"); - printMessage("Number of read IDs: " + std::to_string(read_ids.size())); - printMessage("First read ID: " + *read_ids.begin()); + // printMessage("Number of read IDs: " + std::to_string(read_ids.size())); + // printMessage("First read ID: " + *read_ids.begin()); // Check if the first read ID has any newlines, carriage returns, tabs, // or spaces if (read_ids.begin()->find_first_of("\n\r\t ") != std::string::npos) { @@ -361,6 +361,14 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f std::vector<int> read_lengths; // Read lengths std::vector<double> read_mod_rates; // Total base modification rate for each read length std::vector<std::unordered_map<char, double>> read_base_mod_rates; // Type-specific base modification rates for each read length + + // Keep track of number of modified bases on the primary alignment vs other + // alignments (secondary, supplementary, unmapped) + int num_modified_bases_primary = 0; + int num_modified_bases_unmapped = 0; + int num_modified_bases_secondary = 0; + int num_modified_bases_supplementary = 0; + while (sam_read1(bam_file, bam_header, bam_record) >= 0) { num_reads++; @@ -373,13 +381,32 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f int read_length = bam_record->core.l_qseq; hts_base_mod_state *state = hts_base_mod_state_alloc(); std::vector<std::pair<int32_t, int>> c_modified_positions; // C-modified positions 
for CpG analysis (chr->(position, strand)) - std::unordered_map<char, int> base_mod_counts; // Type-specific base modification counts for the read + // std::unordered_map<char, int> base_mod_counts; // Type-specific + // base modification counts for the alignment + std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts; // Type-specific base modification counts (canonical base -> modified base -> count) + std::unordered_map<char, int> base_primary_count; // Total base counts for the alignment // Parse the base modification tags if a primary alignment int read_mod_count = 0; int ret = bam_parse_basemod(bam_record, state); + bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); + + // Update the number of reads with base modifications for the + // primary alignment vs other alignments if (ret >= 0) { - bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); + if (is_primary) { + num_modified_bases_primary++; + } else if (bam_record->core.flag & BAM_FUNMAP) { + num_modified_bases_unmapped++; + } else if (bam_record->core.flag & BAM_FSECONDARY) { + num_modified_bases_secondary++; + } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) { + num_modified_bases_supplementary++; + } + } + + if (ret >= 0 && is_primary) { + // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); // Get the chromosome if alignments are present bool alignments_present = true; @@ -398,9 +425,11 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f // but it always yields 0...) int strand = (bam_record->core.flag & BAM_FREVERSE) ? 
1 : 0; - // Set strand to null (-1) if the read is not primary - if (!is_primary) { - strand = -1; + // Get the number of each type of base for the read + uint8_t *seq = bam_get_seq(bam_record); + for (int i = 0; i < read_length; i++) { + char base = seq_nt16_str[bam_seqi(seq, i)]; + base_primary_count[std::toupper(base)]++; } // Iterate over the state object to get the base modification tags @@ -416,8 +445,9 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f // Update the modified prediction counts read_mod_count++; // Read-specific count final_output.modified_prediction_count++; // Cumulative count + char canonical_base_char = std::toupper(mods[i].canonical_base); char mod_type = mods[i].modified_base; - base_mod_counts[mod_type]++; // Update the type-specific count + // base_mod_counts[mod_type]++; // Update the type-specific count // Note: The modified base value can be a positive char (e.g. 'm', // 'h') (DNA Mods DB) or negative integer (ChEBI ID): @@ -436,11 +466,13 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f // Update counts for predictions exceeding the threshold if (probability >= base_mod_threshold) { final_output.updateBaseModCounts(mod_type, strand); // Update the base modification counts + // base_mod_counts[mod_type]++; // Update the + // type-specific count + base_mod_counts[canonical_base_char][mod_type]++; // Update the type-specific count // Store the modified positions for later CpG // analysis if a C modification on a primary alignment - char canonical_base_char = std::toupper(mods[i].canonical_base); - if (is_primary && canonical_base_char == 'C' && mod_type != 'C') { + if (canonical_base_char == 'C' && mod_type != 'C') { // Convert the query position to reference position if available if (alignments_present) { @@ -451,6 +483,9 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f } } } + // } else { + // base_primary_count[mod_type]++; // 
Update the type-specific unmodified count + // } } } } @@ -474,26 +509,63 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f hts_base_mod_state_free(state); // Deallocate the base modification state object // Calculate the base modification rate for the read - double read_mod_rate = 0.0; - if (read_length > 0) { - read_mod_rate = (double) read_mod_count / read_length; - } + // double read_mod_rate = 0.0; + // if (read_length > 0) { + // read_mod_rate = (double) read_mod_count / read_length; + // } // Calculate the type-specific base modification rates for the read std::unordered_map<char, double> base_mod_rates; for (auto const &it : base_mod_counts) { - char mod_type = it.first; - int mod_count = it.second; + char canonical_base = it.first; + std::unordered_map<char, int> mod_counts = it.second; double mod_rate = 0.0; - if (read_length > 0) { - mod_rate = (double) mod_count / read_length; + int total_base_count = base_primary_count[canonical_base]; + + // Calculate the modification rate for each modification type + for (auto const &it2 : mod_counts) { + char mod_type = it2.first; + int mod_count = it2.second; + double mod_rate = 0.0; + if (mod_count + total_base_count > 0) { + mod_rate = (double) mod_count / total_base_count; + } + base_mod_rates[mod_type] = mod_rate; } - base_mod_rates[mod_type] = mod_rate; + // for (auto const &it2 : mod_counts) { + // total_mod_count += it2.second; + // } + // if (total_mod_count + total_base_count > 0) { + // mod_rate = (double) total_mod_count / (total_mod_count + total_base_count); + // } + // base_mod_rates[canonical_base] = mod_rate; } - final_output.updateReadModRate(read_length, read_mod_rate, base_mod_rates); // Update the output data + // for (auto const &it : base_mod_counts) { + // char mod_type = it.first; + // int mod_count = it.second; + // double mod_rate = 0.0; + // int total_base_count = base_primary_count[mod_type]; + // if (mod_count + unmod_count > 0) { + // mod_rate = 
(double) mod_count / (mod_count + unmod_count); + // } + // // if (read_length > 0) { + // // mod_rate = (double) mod_count / read_length; + // // } + // base_mod_rates[mod_type] = mod_rate; + // } + final_output.updateReadModRate(read_length, base_mod_rates); // Update the output data } } + // Summary of base modification counts + if (mod_analysis) { + printMessage("Base modification counts:"); + printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary)); + printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped)); + printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary)); + printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary)); + } + bam_destroy1(bam_record); bam_hdr_destroy(bam_header); sam_close(bam_file); diff --git a/src/output_data.cpp b/src/output_data.cpp index 02cd5a8..148fbeb 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -257,6 +257,18 @@ void Basic_Seq_Quality_Statistics::global_sum(){ // BAM output constructor Output_BAM::Output_BAM(){ + this->num_primary_alignment = 0; + this->num_secondary_alignment = 0; + this->num_supplementary_alignment = 0; + this->num_clip_bases = 0; + this->sample_modified_base_count = 0; + this->sample_modified_base_count_forward = 0; + this->sample_modified_base_count_reverse = 0; + this->forward_alignment = 0; + this->reverse_alignment = 0; + this->base_mod_counts = std::unordered_map<char, uint64_t>(); + this->base_mod_counts_forward = std::unordered_map<char, uint64_t>(); + this->base_mod_counts_reverse = std::unordered_map<char, uint64_t>(); } Output_BAM::~Output_BAM(){ @@ -278,20 +290,29 @@ void Output_BAM::updateBaseModCounts(char mod_type, int strand) } } -void Output_BAM::updateReadModRate(int read_length, double read_mod_rate, std::unordered_map<char, double> base_mod_rates) { +void Output_BAM::updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates) { 
ReadModData read_mod_data; read_mod_data.read_length = read_length; - read_mod_data.mod_rate = read_mod_rate; read_mod_data.base_mod_rates = base_mod_rates; this->read_mod_data.push_back(read_mod_data); } std::vector<char> Output_BAM::getBaseModTypes() { + printMessage("[TEST] Getting base modification types."); std::vector<char> base_mod_types; - for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) { - base_mod_types.push_back(it->first); + if (this->base_mod_counts.empty()) { + printError("No base modification counts found."); + return base_mod_types; } + + printMessage("[TEST2] Getting base modification types."); + for (const auto& it : this->base_mod_counts) { + base_mod_types.push_back(it.first); + } + // for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) { + // base_mod_types.push_back(it->first); + // } return base_mod_types; } @@ -305,11 +326,6 @@ int Output_BAM::getNthReadModLength(int read_index) return this->read_mod_data[read_index].read_length; } -double Output_BAM::getNthReadModRate(int read_index) -{ - return this->read_mod_data[read_index].mod_rate; -} - double Output_BAM::getNthReadModRate(int read_index, char mod_type) { double mod_rate = 0.0; diff --git a/src/plot_utils.py b/src/plot_utils.py index 2f25c7d..a47b958 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -20,8 +20,9 @@ MAX_READ_QUALITY = 100 PLOT_FONT_SIZE = 16 -# Return a dictionary of default plot filenames + def getDefaultPlotFilenames(): + """Create a default HTML plot data structure.""" plot_filenames = { # for fq/fa "read_length_distr": {'title': "Read Length", 'description': "Read Length Distribution"}, # for bam "read_alignments_bar": {'title': "Read Alignments", @@ -31,8 +32,6 @@ def getDefaultPlotFilenames(): "read_length_bar": {'title': "Read Length Statistics", 'description': "Read Length Statistics"}, "base_counts": {'title': "Base Counts", 'description': "Base Counts", 'summary': ""}, - 
"basic_info": {'title': "Basic Statistics", - 'description': "Basic Statistics", 'summary': ""}, "read_length_hist": {'title': "Read Length Histogram", 'description': "Read Length Histogram", 'summary': ""}, "gc_content_hist": {'title': "GC Content Histogram", 'description': "GC Content Histogram", 'summary': ""}, @@ -49,8 +48,9 @@ def getDefaultPlotFilenames(): return plot_filenames -# Wrap the text in the table + def wrap(label): + """Wrap the label text.""" # First split the string into a list of words words = label.split(' ') @@ -71,8 +71,9 @@ def wrap(label): return new_label -# Plot the read alignment numbers + def plot_read_length_stats(output_data, file_type, plot_filepaths): + """Plot the read length statistics.""" # Define the three categories category = ['N50', 'Mean', 'Median'] @@ -205,99 +206,10 @@ def plot_base_counts(output_data, filetype, plot_filepaths): layout = go.Layout(title='', xaxis=dict(title='Base'), yaxis=dict(title='Counts'), barmode='group', font=dict(size=PLOT_FONT_SIZE)) fig = go.Figure(data=all_traces, layout=layout) - # Generate the HTML - # html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) - - # return html_obj - # Generate the HTML plot_filepaths['base_counts']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) plot_filepaths['base_counts']['error_flag'] = error_flag -def plot_basic_info(output_data, file_type, plot_filepaths): - """Plot basic information about the reads in bar chart format.""" - html_obj = '' - if file_type == 'BAM': - - # Create a bar trace for each type of data - bar_titles = ['All Reads', 'Mapped Reads', 'Unmapped Reads'] - data_objects = [output_data.long_read_info, output_data.mapped_long_read_info, output_data.unmapped_long_read_info] - - # Create subplots for each category - fig = make_subplots(rows=2, cols=2, subplot_titles=("Number of Reads", "Number of Bases", "Longest Read", "GC Content"), horizontal_spacing=0.3, vertical_spacing=0.2) - - # Add 
traces for each category - key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length', 'gc_cnt'] - error_flag = False - for i in range(4): - # Get the data for this category - key_name = key_list[i] - - # Add the traces for each type of data - data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], key_name)] - - # Set the error flag if any of the values are zero (except for unmapped reads) - # if data[0] == 0 or data[1] == 0 or data[2] == 0: - if data[0] == 0 or data[1] == 0: - error_flag = True - - # Create the trace - trace = go.Bar(x=data, y=bar_titles, orientation='h') - - # Add the trace to the figure - fig.add_trace(trace, row=(i // 2) + 1, col=(i % 2) + 1) - fig.update_layout(showlegend=False) - - # Update the layout - fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE)) - - # Generate the HTML - # html_obj = fig.to_html(full_html=False, default_height=800, - # default_width=1200) - plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=800, default_width=1200) - plot_filepaths['basic_info']['error_flag'] = error_flag - - - elif file_type == 'SeqTxt': - - # Create a bar trace for each type of data - bar_titles = ['All Reads', 'Passed Reads', 'Failed Reads'] - data_objects = [output_data.all_long_read_info.long_read_info, output_data.passed_long_read_info.long_read_info, output_data.failed_long_read_info.long_read_info] - - # Create subplots for each category - fig = make_subplots(rows=1, cols=3, subplot_titles=("Number of Reads", "Number of Bases", "Longest Read"), horizontal_spacing=0.1) - - # Add traces for each category - key_list = ['total_num_reads', 'total_num_bases', 'longest_read_length'] - error_flag = False - for i in range(3): - # Get the data for this category - key_name = key_list[i] - - # Add the traces for each type of data - data = [getattr(data_objects[0], key_name), getattr(data_objects[1], key_name), getattr(data_objects[2], 
key_name)] - - # Set the error flag if any of the values are zero - if data[0] == 0 or data[1] == 0 or data[2] == 0: - error_flag = True - - # Create the trace - trace = go.Bar(x=data, y=bar_titles, orientation='h') - - # Add the trace to the figure - fig.add_trace(trace, row=1, col=i + 1) - - # Update the layout - fig.update_layout(showlegend=False, font=dict(size=PLOT_FONT_SIZE)) - - # Generate the HTML - # html_obj = fig.to_html(full_html=False, default_height=500, - # default_width=1600) - plot_filepaths['basic_info']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1600) - plot_filepaths['basic_info']['error_flag'] = error_flag - - return html_obj - # Plot the read length histograms def read_lengths_histogram(data, font_size, plot_filepaths): @@ -399,8 +311,6 @@ def read_lengths_histogram(data, font_size, plot_filepaths): fig.update_annotations(font_size=annotation_size) # Generate the HTML - # html_obj = fig.to_html(full_html=False, default_height=500, - # default_width=1200) plot_filepaths['read_length_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1200) @@ -542,7 +452,12 @@ def plot(output_data, para_dict, file_type): create_summary_table(output_data, plot_filepaths, file_type) # Create the summary table # Modified base table and plots - if file_type == 'BAM' and para_dict["mod"] > 0: + try: + para_dict["mod"] + except KeyError: + para_dict["mod"] = False + + if file_type == 'BAM' and para_dict["mod"]: # Output file for the read length vs. 
modification rates plot output_folder = para_dict["output_folder"] read_length_mod_rate_file = os.path.join(output_folder, 'read_length_hist.png') @@ -555,6 +470,11 @@ def plot(output_data, para_dict, file_type): logging.warning("WARNING: Modified base table not created") # Create the TIN table if available + try: + para_dict["genebed"] + except KeyError: + para_dict["genebed"] = "" + if file_type == 'BAM' and para_dict["genebed"] != "": input_files = para_dict["input_files"] create_tin_table(output_data, input_files, plot_filepaths) @@ -565,16 +485,8 @@ def plot(output_data, para_dict, file_type): else: logging.warning("WARNING: TIN table not created") - # Generate plots - # plot_filepaths['base_counts']['dynamic'] = plot_base_counts(output_data, - # file_type) plot_base_counts(output_data, file_type, plot_filepaths) - # Plot basic information - # plot_filepaths['basic_info']['dynamic'] = plot_basic_info(output_data, - # file_type) - plot_basic_info(output_data, file_type, plot_filepaths) - # Read length histogram if file_type == 'SeqTxt': long_read_data = output_data.all_long_read_info.long_read_info @@ -582,29 +494,15 @@ def plot(output_data, para_dict, file_type): long_read_data = output_data.long_read_info if file_type != 'FAST5s': - # plot_filepaths['read_length_hist']['dynamic'] = - # read_lengths_histogram(long_read_data, font_size) read_lengths_histogram(long_read_data, font_size, plot_filepaths) - - # plot_filepaths['read_length_bar']['dynamic'] = - # plot_read_length_stats(output_data, file_type) plot_read_length_stats(output_data, file_type, plot_filepaths) # GC content histogram if file_type != 'FAST5s' and file_type != 'SeqTxt': - if file_type == 'BAM': - # plot_filepaths['gc_content_hist']['dynamic'] = - # read_gc_content_histogram(output_data.mapped_long_read_info, - # font_size) read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths) elif file_type == 'SeqTxt': - # plot_filepaths['gc_content_hist']['dynamic'] = - # 
read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, - # font_size) read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths) else: - # plot_filepaths['gc_content_hist']['dynamic'] = - # read_gc_content_histogram(output_data.long_read_info, font_size) read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths) # Base quality histogram @@ -613,25 +511,17 @@ def plot(output_data, para_dict, file_type): # Base quality histogram base_quality(seq_quality_info, font_size, plot_filepaths) - # plot_filepaths['base_quality']['dynamic'] = - # base_quality(seq_quality_info, font_size) # Read average base quality histogram if file_type == 'FASTQ': - # read_quality_dynamic = read_avg_base_quality(seq_quality_info, font_size) - # plot_filepaths['read_avg_base_quality']['dynamic'] = - # read_quality_dynamic read_avg_base_quality(seq_quality_info, font_size, plot_filepaths) if file_type == 'BAM': # Plot read alignment QC plot_alignment_numbers(output_data, plot_filepaths) - # plot_filepaths['read_alignments_bar']['dynamic'] = - # plot_alignment_numbers(output_data) # Plot base alignment and error QC plot_errors(output_data, plot_filepaths) - # plot_filepaths['base_alignments_bar']['dynamic'] = plot_errors(output_data) elif file_type == 'FAST5s': plot_filepaths['ont_signal']['dynamic'] = plot_signal(output_data, para_dict) @@ -1114,6 +1004,12 @@ def create_summary_table(output_data, plot_filepaths, file_type): plot_filepaths["basic_st"]['error_flag'] = table_error_flag +def get_axis_name(row, axis_type='x'): + """Get the axis name for the plot.""" + axis_number = row + 1 + return f"{axis_type}axis{axis_number}" if axis_number > 1 else f"{axis_type}axis" + + def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold): """Create a summary table for the base modifications.""" plot_filepaths["base_mods"] = {} @@ -1125,6 +1021,7 @@ def 
create_modified_base_table(output_data, plot_filepaths, base_modification_th # Print the types of modifications logging.info("Getting base modification types") base_mod_types = output_data.getBaseModTypes() + logging.info("[TEST] Modification types: ") if base_mod_types: logging.info("Modification types: ") for mod_type in base_mod_types: @@ -1146,9 +1043,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Get the read length vs. base modification rate data for each # modification type in the sampled reads - # for i in range(read_mod_data_size): - # if i not in read_indices: - # continue for i in read_indices: for mod_type in base_mod_types: if mod_type not in read_length_mod_rates: @@ -1174,10 +1068,20 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Create a plot of read length vs. base modification rate for each # modification type # Make subplots vertically for each modification type - fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1) + subplot_titles = [] + for mod_type in base_mod_types: + try: + mod_name = mod_char_to_name[mod_type] + except KeyError: + logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) + mod_name = mod_type + + subplot_titles.append('Read Length vs. 
{} Modification Rate'.format(mod_name)) + + + fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=subplot_titles) min_x = float('inf') max_x = 0 - # for mod_type in base_mod_types: for i, mod_type in enumerate(base_mod_types): # Format the data @@ -1185,17 +1089,24 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th mod_rates = [data[1] * 100 for data in mod_data] x_vals = [data[0] for data in mod_data] + # Remove outlier read lengths using the IQR method + if len(x_vals) > 1: + x_vals_np = np.array(x_vals) + q1 = np.percentile(x_vals_np, 25) + q3 = np.percentile(x_vals_np, 75) + iqr = q3 - q1 + lower_bound = q1 - 1.5 * iqr + upper_bound = q3 + 1.5 * iqr + + # Filter the data to remove outliers + filtered_data = [(x, y) for x, y in zip(x_vals, mod_rates) if lower_bound <= x <= upper_bound] + x_vals, mod_rates = zip(*filtered_data) + # Generate evenly-spaced x values and labels (10 ticks across the # range) with the read lengths being a multiple of 1000 x_tick_values = np.linspace(0, max(x_vals), num=10) read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values] - # read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_vals] - - # Update the min and max x values - # min_x = min(min_x, *x_vals) - # max_x = max(max_x, *x_vals) - # Get the modification name try: mod_name = mod_char_to_name[mod_type] @@ -1203,26 +1114,19 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) mod_name = mod_type - fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name), row=i + 1, col=1) + fig.add_trace(go.Scattergl(x=x_vals, 
y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1) # Update the layout - fig.update_layout(title='Read Length vs. {} Modification Rate'.format(mod_name), - xaxis_title='Read Length', - yaxis_title='Modification Rate (%)', - showlegend=False, - yaxis=dict(range=[0, 100]), - xaxis=dict(tickvals=x_tick_values, ticktext=read_lengths, range=[0, max(x_vals)]), - font=dict(size=PLOT_FONT_SIZE)) + x_axis_name = get_axis_name(i) + y_axis_name = get_axis_name(i, 'y') + logging.info("Index: {}, Y index: {}".format(i, y_axis_name)) - # Get the X tick values generated by Plotly and format the read lengths - # x_tick_values = fig.layout.x - # if x_tick_values: - # read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values] - - # # Update the X tick labels - # fig.update_xaxes(tickvals=x_tick_values, ticktext=read_lengths, row=i + 1, col=1) + # Auto range the axes + fig.update_layout( + **{f"{x_axis_name}_title": 'Read Length (bp)', + f"{y_axis_name}_title": 'Modification Rate (%)'}) - # xaxis=dict(tickvals=x_vals, ticktext=read_lengths, range=[0, max_x_range]), + fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Save the plot image # if len(base_mod_types) > 0: @@ -1234,15 +1138,16 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # html_obj = fig.to_html(full_html=False, default_height=500, # default_width=700) if len(base_mod_types) > 0: + plot_height = 500 * len(base_mod_types) logging.info("Saving the read length vs. 
modification rates plot") - plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) + plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=700) # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj else: logging.warning("WARNING: No modification types found") # Create the base modification statistics table table_str = "<table>\n<tbody>" - row_str, row_flag = format_row("Total Predictions", [output_data.modified_prediction_count], 'int', None) + row_str, row_flag = format_row("Total Unfiltered Predictions", [output_data.modified_prediction_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag @@ -1262,22 +1167,14 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None) + row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total modified CpG Sites in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None) + row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - # table_str += "<tr><td>Total Predictions</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.modified_prediction_count) - # table_str += "<tr><td>Probability Threshold</td><td style=\"text-align:right\">{:.2f}</td></tr>".format(base_modification_threshold) - # table_str += "<tr><td>Total Modified 
Bases in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count) - # table_str += "<tr><td>Total in the Forward Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_forward) - # table_str += "<tr><td>Total in the Reverse Strand</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_modified_base_count_reverse) - # table_str += "<tr><td>Total modified CpG Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_forward_count) - # table_str += "<tr><td>Total modified CpG Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(output_data.sample_cpg_reverse_count) - # Add the modification type data for mod_type in base_mod_types: # mod_name = mod_char_to_name[mod_type] @@ -1291,20 +1188,17 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th mod_count_fwd = output_data.getModTypeCount(mod_type, 0) mod_count_rev = output_data.getModTypeCount(mod_type, 1) - row_str, row_flag = format_row("Total {} Sites in the Sample".format(mod_name), [mod_count], 'int', None) + row_str, row_flag = format_row("Total {} Counts in the Sample".format(mod_name), [mod_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total {} Sites in the Sample (Forward Strand)".format(mod_name), [mod_count_fwd], 'int', None) + row_str, row_flag = format_row("Total {} Counts in the Sample (Forward Strand)".format(mod_name), [mod_count_fwd], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total {} Sites in the Sample (Reverse Strand)".format(mod_name), [mod_count_rev], 'int', None) + row_str, row_flag = format_row("Total {} Counts in the Sample (Reverse Strand)".format(mod_name), [mod_count_rev], 'int', None) 
table_str += row_str table_error_flag = table_error_flag or row_flag - # table_str += "<tr><td>Total {} Sites in the Sample</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count) - # table_str += "<tr><td>Total {} Sites in the Sample (Forward Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_fwd) - # table_str += "<tr><td>Total {} Sites in the Sample (Reverse Strand)</td><td style=\"text-align:right\">{:,d}</td></tr>".format(mod_name, mod_count_rev) # Finish the table table_str += "\n</tbody>\n</table>" @@ -1320,7 +1214,7 @@ def create_tin_table(output_data, input_files, plot_filepaths): # Create a table with the first column showing the BAM filepath, and the # following columns showing TIN count, mean, median, and standard deviation - table_str = "<table>\n<thead>\n<tr><th>BAM File</th><th>Count</th><th>Mean</th><th>Median</th><th>StdDev</th></tr>\n</thead>" + table_str = "<table>\n<thead>\n<tr><th>BAM File</th><th>Median TIN Score</th><th>Number of Transcripts</th></tr>\n</thead>" table_str += "\n<tbody>" # Loop through each BAM file @@ -1330,18 +1224,18 @@ def create_tin_table(output_data, input_files, plot_filepaths): bam_filename = os.path.basename(bam_file) # Get the file data - tin_count = output_data.getTINCount(bam_file) - tin_mean = output_data.getTINMean(bam_file) + # tin_count = output_data.getTINCount(bam_file) + # tin_mean = output_data.getTINMean(bam_file) tin_median = output_data.getTINMedian(bam_file) - tin_std = output_data.getTINStdDev(bam_file) + # tin_std = output_data.getTINStdDev(bam_file) # Add the data to the table - row_str, row_flag = format_row(bam_filename, [tin_count, tin_mean, tin_median, tin_std], 'float', None) + # row_str, row_flag = format_row(bam_filename, [tin_count, tin_mean, + # tin_median, tin_std], 'float', None) + row_str, row_flag = format_row(bam_filename, [tin_median, output_data.getTINCount(bam_file)], 'float', None) table_str += row_str error_flag = 
error_flag or row_flag - # table_str += "<tr><td>{}</td><td style=\"text-align:right\">{:,d}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td><td style=\"text-align:right\">{:.1f}</td></tr>".format(bam_filename, tin_count, tin_mean, tin_median, tin_std) - table_str += "\n</tbody>\n</table>" # Add the table to the plot filepaths From bf61e23c7546fbeb5e546879d66a39abcad3eb58 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Sun, 12 Jan 2025 19:04:37 -0500 Subject: [PATCH 13/25] Work on help icon and fix forward vs reverse counts --- src/generate_html.py | 45 +++++++++++++++-- src/hts_reader.cpp | 4 +- src/plot_utils.py | 112 +++++++++++++++++++++++++++++-------------- 3 files changed, 121 insertions(+), 40 deletions(-) diff --git a/src/generate_html.py b/src/generate_html.py index 4cc3cf0..dce7237 100644 --- a/src/generate_html.py +++ b/src/generate_html.py @@ -218,6 +218,48 @@ def generate_header(self): li { margin: 10px 0; } +.help-icon { + position: relative; + display: inline-block; + cursor: pointer; + color: #555; + font-size: 18px; /* Adjust size of the icon */ + margin-top: 10px; /* Adjust spacing if needed */ +} + +.help-icon:hover .tooltip { + visibility: visible; + opacity: 1; +} + +.tooltip { + visibility: hidden; + width: 200px; + background-color: #333; + color: #fff; + text-align: left; + border-radius: 4px; + padding: 8px; + font-size: 14px; + position: absolute; + top: 50%; /* Position the tooltip */ + left: 120%; /* Position the tooltip */ + transform: translateY(-50%); + opacity: 0; + transition: opacity 0.3s; + z-index: 1; +} + +.tooltip::after { + content: ''; + position: absolute; + top: 50%; /* Position the arrow in the middle of the tooltip */ + left: 0; /* Position the arrow on the left edge of the tooltip */ + transform: translateY(-50%); + border-width: 5px; + border-style: solid; + border-color: #333 transparent transparent transparent; +} </style>''') 
self.html_writer.write("</head>") @@ -306,9 +348,6 @@ def generate_right(self): self.html_writer.write('<div class="module">') self.html_writer.write('<h2 id="lrst' + str(key_index) + '">File Count = ' + str( len(self.input_para["input_files"])) + '</h2><p>') - # for _af in self.input_para["input_files"]: - # self.html_writer.write("<br/>" + _af) - # Write the input files in format "1.\tfile1\n2.\tfile2\n..." self.html_writer.write("<br/>" + "<br/>".join([f"{i+1}.\t{af}" for i, af in enumerate(self.input_para["input_files"])])) self.html_writer.write('</p></div>') key_index += 1 diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 585254d..6c4db22 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -218,9 +218,9 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // Determine if this is a forward or reverse read if (record->core.flag & BAM_FREVERSE) { - output_data.forward_alignment++; - } else { output_data.reverse_alignment++; + } else { + output_data.forward_alignment++; } // Loop through the cigar string and count the number of insertions, deletions, and matches diff --git a/src/plot_utils.py b/src/plot_utils.py index a47b958..f9b8239 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -211,8 +211,8 @@ def plot_base_counts(output_data, filetype, plot_filepaths): plot_filepaths['base_counts']['error_flag'] = error_flag -# Plot the read length histograms def read_lengths_histogram(data, font_size, plot_filepaths): + """Plot the read length histogram.""" linear_bin_count = 10 log_bin_count = 10 @@ -319,6 +319,22 @@ def read_gc_content_histogram(data, font_size, plot_filepaths): bin_size = 1 gc_content = np.array(data.read_gc_content_count) + # Calculate the percentage of reads with a GC content of <30% + gc_content_below_30 = np.sum(gc_content[:30]) + logging.info("[TEST] Percentage of reads with GC content <30%: {}".format(gc_content_below_30 / np.sum(gc_content))) + + # Calculate the percentage of reads with a 
GC content of >70% + gc_content_above_70 = np.sum(gc_content[70:]) + logging.info("[TEST] Percentage of reads with GC content >70%: {}".format(gc_content_above_70 / np.sum(gc_content))) + + # Calculate the percentage of reads with a GC content of <20% + gc_content_below_20 = np.sum(gc_content[:20]) + logging.info("[TEST] Percentage of reads with GC content <20%: {}".format(gc_content_below_20 / np.sum(gc_content))) + + # Calculate the percentage of reads with a GC content of >60% + gc_content_above_60 = np.sum(gc_content[60:]) + logging.info("[TEST] Percentage of reads with GC content >60%: {}".format(gc_content_above_60 / np.sum(gc_content))) + # Set the error flag if the GC content is below 20% for more than 10% of the # reads error_flag = False @@ -357,7 +373,6 @@ def read_gc_content_histogram(data, font_size, plot_filepaths): fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0) fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Set font size - # return fig.to_html(full_html=False, default_height=500, default_width=700) plot_filepaths['gc_content_hist']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=700) plot_filepaths['gc_content_hist']['error_flag'] = error_flag @@ -447,6 +462,7 @@ def plot_base_modifications(base_modifications): def plot(output_data, para_dict, file_type): """Generate the plots for the output data.""" + logging.info("Generating plots for file type: {}".format(file_type)) plot_filepaths = getDefaultPlotFilenames() font_size = 14 # Font size for the plots create_summary_table(output_data, plot_filepaths, file_type) # Create the summary table @@ -498,12 +514,12 @@ def plot(output_data, para_dict, file_type): plot_read_length_stats(output_data, file_type, plot_filepaths) # GC content histogram - if file_type != 'FAST5s' and file_type != 'SeqTxt': - read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths) - elif file_type == 'SeqTxt': - 
read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths) - else: - read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths) + if file_type == 'BAM': + read_gc_content_histogram(output_data.mapped_long_read_info, font_size, plot_filepaths) + elif file_type == 'SeqTxt': + read_gc_content_histogram(output_data.passed_long_read_info.long_read_info, font_size, plot_filepaths) + elif file_type == 'FASTQ' or file_type == 'FASTA': + read_gc_content_histogram(output_data.long_read_info, font_size, plot_filepaths) # Base quality histogram if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt': @@ -1000,6 +1016,12 @@ def create_summary_table(output_data, plot_filepaths, file_type): table_error_flag = table_error_flag or row_flag table_str += "\n</tbody>\n</table>" + # table_str += """ + # <div class="help-icon"> + # 💡 + # <div class="tooltip">This is your help text explaining the feature!</div> + # </div> + # """ plot_filepaths["basic_st"]['detail'] = table_str plot_filepaths["basic_st"]['error_flag'] = table_error_flag @@ -1012,6 +1034,13 @@ def get_axis_name(row, axis_type='x'): def create_modified_base_table(output_data, plot_filepaths, base_modification_threshold): """Create a summary table for the base modifications.""" + help_text = "Total unfiltered predictions are all predictions prior to applying the base modification probability threshold.\n" \ + "This threshold is set by the user (default: 0.5) and is used to filter out low-confidence base modifications.\n" \ + "Total modification counts are the number of base modifications that pass the threshold.\n" \ + "These counts are also separated by forward and reverse strand predictions.\n" \ + "CpG modification counts are the total CpG modifications that pass the threshold.\n" \ + "These are total counts and not site-specific counts." 
\ + plot_filepaths["base_mods"] = {} plot_filepaths["base_mods"]['file'] = "" plot_filepaths["base_mods"]['title'] = "Base Modifications" @@ -1101,11 +1130,13 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Filter the data to remove outliers filtered_data = [(x, y) for x, y in zip(x_vals, mod_rates) if lower_bound <= x <= upper_bound] x_vals, mod_rates = zip(*filtered_data) + + # Normalize the read lengths to the maximum read length (0-100) + x_vals = [100 * x / max(x_vals) for x in x_vals] - # Generate evenly-spaced x values and labels (10 ticks across the - # range) with the read lengths being a multiple of 1000 - x_tick_values = np.linspace(0, max(x_vals), num=10) - read_lengths = ['{:,}Mb'.format(int(val / 1000000)) if val > 1000000 else '{:,}kb'.format(int(val / 1000)) if val > 1000 else '{:,}bp'.format(int(val)) for val in x_tick_values] + # Use 0-100 for the x-axis ticks and labels + x_tick_values = np.arange(0, 101, 10) + x_tick_labels = ['{:,}%'.format(int(val)) for val in x_tick_values] # Get the modification name try: @@ -1114,7 +1145,10 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) mod_name = mod_type - fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1) + # fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1) + + # Create a heatmap plot + fig.add_trace(go.Histogram2dContour(x=x_vals, y=mod_rates, colorscale='Viridis', showlegend=False), row=i + 1, col=1) # Update the layout x_axis_name = get_axis_name(i) @@ -1123,25 +1157,21 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Auto range the axes fig.update_layout( - **{f"{x_axis_name}_title": 'Read Length (bp)', - f"{y_axis_name}_title": 'Modification Rate (%)'}) + **{f"{x_axis_name}_title": 
'Normalized Read Length (%)', + f"{y_axis_name}_title": 'Modification Rate (%)'}, + **{f"{x_axis_name}_tickmode": 'array', + f"{x_axis_name}_tickvals": x_tick_values, + f"{x_axis_name}_ticktext": x_tick_labels}, + **{f"{y_axis_name}_range": [0, 100]} + ) fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) - - # Save the plot image - # if len(base_mod_types) > 0: - # fig_file = plot_filepaths["read_length_mod_rates"]['file'] - # logging.info("Saving the read length vs. modification rates plot to: {}".format(fig_file)) - # fig.write_image(fig_file, format='png', width=700, height=500) # Generate the HTML - # html_obj = fig.to_html(full_html=False, default_height=500, - # default_width=700) if len(base_mod_types) > 0: plot_height = 500 * len(base_mod_types) logging.info("Saving the read length vs. modification rates plot") plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=700) - # plot_filepaths["read_length_mod_rates"]["dynamic"] = html_obj else: logging.warning("WARNING: No modification types found") @@ -1155,23 +1185,23 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total Modified Bases in the Sample", [output_data.sample_modified_base_count], 'int', None) + row_str, row_flag = format_row("Total Modification Counts", [output_data.sample_modified_base_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total in the Forward Strand", [output_data.sample_modified_base_count_forward], 'int', None) + row_str, row_flag = format_row("Total Modification Counts (Forward Strand Only)", [output_data.sample_modified_base_count_forward], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total in the Reverse Strand", 
[output_data.sample_modified_base_count_reverse], 'int', None) + row_str, row_flag = format_row("Total Modification Counts (Reverse Strand Only)", [output_data.sample_modified_base_count_reverse], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Forward Strand)", [output_data.sample_cpg_forward_count], 'int', None) + row_str, row_flag = format_row("Total CpG Modification Counts (Forward Strand Only)", [output_data.sample_cpg_forward_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag - row_str, row_flag = format_row("Total modified CpG Counts in the Sample (Reverse Strand)", [output_data.sample_cpg_reverse_count], 'int', None) + row_str, row_flag = format_row("Total CpG Modification Counts (Reverse Strand Only)", [output_data.sample_cpg_reverse_count], 'int', None) table_str += row_str table_error_flag = table_error_flag or row_flag @@ -1202,6 +1232,21 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Finish the table table_str += "\n</tbody>\n</table>" + + # Add the help text + table_str += """ + <div class="help-icon"> + 💡 + <div class="tooltip">{}</div> + </div> + """.format(help_text) + + # Add text below the table suggesting the user to use Modkit for more + # detailed analysis on per-site modification rates + table_str += "<p><i>For per-site modification rates, please use \ + <a href=\"https://github.com/nanoporetech/modkit\">Modkit</a> by Oxford Nanopore Technologies..</i></p>" + + plot_filepaths["base_mods"]['detail'] = table_str plot_filepaths["base_mods"]['error_flag'] = table_error_flag @@ -1276,6 +1321,9 @@ def plot_alignment_numbers(data, plot_filepaths): # Set the error flag if primary alignments equal 0 error_flag = data.num_primary_alignment == 0 + logging.info("[TEST] Number of reverse alignments: {}".format(data.reverse_alignment)) + logging.info("[TEST] Number of 
forward alignments: {}".format(data.forward_alignment)) + # Create a horizontally aligned bar plot trace from the data using plotly trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, data.num_reads_with_supplementary_alignment, data.num_reads_with_secondary_alignment, @@ -1291,12 +1339,6 @@ def plot_alignment_numbers(data, plot_filepaths): # Create the figure object fig = go.Figure(data=[trace], layout=layout) - # Generate the HTML object for the plot - # html_obj = fig.to_html(full_html=False, default_height=500, - # default_width=1000) - - # return html_obj, error_flag - # Update the HTML data for the plot plot_filepaths['read_alignments_bar']['dynamic'] = fig.to_html(full_html=False, default_height=500, default_width=1000) plot_filepaths['read_alignments_bar']['error_flag'] = error_flag From 66e2b8a74afc4dfba924abee2e3ff67233459673 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 14 Jan 2025 17:07:57 -0500 Subject: [PATCH 14/25] Work on pct read length vs mod prob --- include/hts_reader.h | 5 +- include/output_data.h | 5 + include/tin.h | 2 +- src/bam_module.cpp | 25 ++- src/hts_reader.cpp | 389 +++++++++++++++++++++++------------------- src/output_data.cpp | 40 +++++ src/plot_utils.py | 157 +++++++---------- src/tin.cpp | 12 +- 8 files changed, 353 insertions(+), 282 deletions(-) diff --git a/include/hts_reader.h b/include/hts_reader.h index 00900b2..5d3a628 100644 --- a/include/hts_reader.h +++ b/include/hts_reader.h @@ -47,7 +47,10 @@ class HTSReader { bool hasNextRecord(); // Return the number of records in the BAM file using the BAM index - int64_t getNumRecords(const std::string &bam_file_name, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold); + int getNumRecords(const std::string &bam_file_name, int thread_count); + + // Run base modification analysis + void runBaseModificationAnalysis(const std::string &bam_filename, Output_BAM& final_output, 
double base_mod_threshold, int read_count, int sample_count, int thread_count); std::map<int, int> getQueryToRefMap(bam1_t* record); diff --git a/include/output_data.h b/include/output_data.h index fa66cdb..bca521f 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -207,6 +207,8 @@ class Output_BAM : public Output_FQ std::unordered_map<char, uint64_t> base_mod_counts_forward; // Counts for each base modification type exceeding the threshold on the forward strand std::unordered_map<char, uint64_t> base_mod_counts_reverse; // Counts for each base modification type exceeding the threshold on the reverse strand + std::unordered_map<char, std::vector<std::pair<double, double>>> read_pct_len_vs_mod_prob; // Read length (%) vs. base modification probability for each base modification type + // Signal data section int read_count = ZeroDefault; int base_count = ZeroDefault; @@ -231,6 +233,8 @@ class Output_BAM : public Output_FQ double getNthReadModRate(int read_index, char mod_type); // Get the base modification rate for the nth read for a specific base modification type uint64_t getModTypeCount(char mod_type); // Get the count of a specific base modification type uint64_t getModTypeCount(char mod_type, int strand); // Get the count of a specific base modification type for a specific strand + double getNthReadLenPct(int read_index, char mod_type); // Get the read length percentage for the nth read for a specific base modification type + double getNthReadModProb(int read_index, char mod_type); // Get the base modification probability for the nth read for a specific base modification type // POD5 signal data functions int getReadCount(); @@ -241,6 +245,7 @@ class Output_BAM : public Output_FQ int getReadSequenceEnd(std::string read_id); void updateBaseModCounts(char mod_type, int strand); // Update base modification counts for predictions exceeding the threshold + void updateBaseModProbabilities(char mod_type, double pct_len, double probability); // Update 
base modification probabilities void updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates); // Update read length vs. base modification rate data // Add TIN data for a single BAM file diff --git a/include/tin.h b/include/tin.h index b7b73f6..195596e 100644 --- a/include/tin.h +++ b/include/tin.h @@ -15,7 +15,7 @@ typedef std::unordered_map<std::string, std::tuple<std::string, int, int, double // Calculate the TIN score for each transcript in the gene BED file // (Reference: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-016-0922-z#Sec11) -void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder); +void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count); std::unordered_map<int, int> getReadDepths(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::string chr, int start, int end); diff --git a/src/bam_module.cpp b/src/bam_module.cpp index 058f831..d2a96eb 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -80,7 +80,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ std::cout << "Calculating TIN scores for file: " << filepath << std::endl; TINStats tin_stats; - calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder); + calculateTIN(&tin_stats, gene_bed, input_params.input_files[i], min_cov, sample_size, input_params.output_folder, input_params.threads); // Print the TIN stats std::cout << "Number of transcripts: " << tin_stats.num_transcripts << std::endl; @@ -113,7 +113,7 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ // process base modifications and TINs if available. // Note: This section utilizes one thread. 
std::cout << "Getting number of records..." << std::endl; - int num_records = reader.getNumRecords(filepath, final_output, mod_analysis, base_mod_threshold); + int num_records = reader.getNumRecords(filepath, thread_count); std::cout << "Number of records = " << num_records << std::endl; // Exit if there are no records @@ -123,6 +123,13 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ return exit_code; } + // Run base modification analysis if the flag is set + if (mod_analysis){ + std::cout << "Running base modification analysis..." << std::endl; + int sample_count = 10000; + reader.runBaseModificationAnalysis(filepath, final_output, base_mod_threshold, num_records, sample_count, thread_count); + } + // Determine the batch sizes if the user-specified thread count is greater than 1 int batch_size = 0; if (thread_count > 1) { @@ -147,7 +154,14 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ // Calculate statistics in batches printMemoryUsage("Before batch processing"); + + // TEST + // int max_reads = 10; + // int current_reads = 0; + while (reader.hasNextRecord()){ + // while (current_reads < max_reads && reader.hasNextRecord()){ + // Read the next batch of records std::cout << "Generating " << thread_count << " thread(s)..." << std::endl; std::vector<std::thread> thread_vector; for (int thread_index=0; thread_index<thread_count; thread_index++){ @@ -172,6 +186,9 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ } printMemoryUsage("After thread " + std::to_string(thread_index)); thread_index++; + + // TEST - Increment the current reads + // current_reads += batch_size; } std::cout << "All threads joined." << std::endl; } @@ -219,15 +236,13 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ std::cout << "Calculating summary QC..." 
<< std::endl; final_output.global_sum(); std::cout << "QC complete" << std::endl; - - // Save the summary statistics to a file std::cout << "Saving summary statistics to file..." << std::endl; // If in RRMS mode, append RRMS accepted/rejected to the output prefix std::string output_prefix = "bam"; if (input_params.rrms_csv != ""){ output_prefix += input_params.rrms_filter ? "_rrms_accepted" : "_rrms_rejected"; - } + } std::string summary_filepath = input_params.output_folder + "/" + output_prefix + "_summary.txt"; final_output.save_summary(summary_filepath, input_params, final_output); std::cout << "Saved file: " << summary_filepath << std::endl; diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 6c4db22..01e0524 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -351,16 +351,40 @@ bool HTSReader::hasNextRecord(){ } // Return the number of records in the BAM file using the BAM index -int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &final_output, bool mod_analysis, double base_mod_threshold) { +int HTSReader::getNumRecords(const std::string& bam_filename, int thread_count) { samFile* bam_file = sam_open(bam_filename.c_str(), "r"); + hts_set_threads(bam_file, thread_count); // Enable multi-threading + bam_hdr_t* bam_header = sam_hdr_read(bam_file); + bam1_t* bam_record = bam_init1(); + int num_reads = 0; + while (sam_read1(bam_file, bam_header, bam_record) >= 0) { + num_reads++; + } + + return num_reads; +} + +void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Output_BAM &final_output, double base_mod_threshold, int read_count, int sample_count, int thread_count) +{ + samFile* bam_file = sam_open(bam_filename.c_str(), "r"); + hts_set_threads(bam_file, thread_count); // Enable multi-threading bam_hdr_t* bam_header = sam_hdr_read(bam_file); bam1_t* bam_record = bam_init1(); int64_t num_reads = 0; - // Data structure for storing read length vs. 
base modification rate - std::vector<int> read_lengths; // Read lengths - std::vector<double> read_mod_rates; // Total base modification rate for each read length - std::vector<std::unordered_map<char, double>> read_base_mod_rates; // Type-specific base modification rates for each read length + // Create a list of read indices to sample, and only keep the first + // sample_count reads + std::vector<int> read_indices; + for (int i = 0; i < read_count; i++) { + read_indices.push_back(i); + } + std::random_shuffle(read_indices.begin(), read_indices.end()); + read_indices.resize(sample_count); + + // Convert to a set for fast lookup + std::unordered_set<int> read_indices_set(read_indices.begin(), read_indices.end()); + + std::cout << "Number of sampled reads = " << read_indices_set.size() << std::endl; // Keep track of number of modified bases on the primary alignment vs other // alignments (secondary, supplementary, unmapped) @@ -370,207 +394,214 @@ int64_t HTSReader::getNumRecords(const std::string & bam_filename, Output_BAM &f int num_modified_bases_supplementary = 0; while (sam_read1(bam_file, bam_header, bam_record) >= 0) { + + if (read_indices_set.find(num_reads) == read_indices_set.end()) { + num_reads++; + continue; + } num_reads++; - if (mod_analysis) { - - // Base modification tag analysis - // Follow here to get base modification tags: - // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c - // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274 - int read_length = bam_record->core.l_qseq; - hts_base_mod_state *state = hts_base_mod_state_alloc(); - std::vector<std::pair<int32_t, int>> c_modified_positions; // C-modified positions for CpG analysis (chr->(position, strand)) - // std::unordered_map<char, int> base_mod_counts; // Type-specific - // base modification counts for the alignment - std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts; // 
Type-specific base modification counts (canonical base -> modified base -> count) - std::unordered_map<char, int> base_primary_count; // Total base counts for the alignment - - // Parse the base modification tags if a primary alignment - int read_mod_count = 0; - int ret = bam_parse_basemod(bam_record, state); - bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); - - // Update the number of reads with base modifications for the - // primary alignment vs other alignments - if (ret >= 0) { - if (is_primary) { - num_modified_bases_primary++; - } else if (bam_record->core.flag & BAM_FUNMAP) { - num_modified_bases_unmapped++; - } else if (bam_record->core.flag & BAM_FSECONDARY) { - num_modified_bases_secondary++; - } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) { - num_modified_bases_supplementary++; - } + // Base modification tag analysis + // Follow here to get base modification tags: + // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/sam_mods.c + // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2274 + int read_length = bam_record->core.l_qseq; + hts_base_mod_state *state = hts_base_mod_state_alloc(); + std::vector<std::pair<int32_t, int>> c_modified_positions; // C-modified positions for CpG analysis (chr->(position, strand)) + // std::unordered_map<char, int> base_mod_counts; // Type-specific + // base modification counts for the alignment + // std::unordered_map<char, std::unordered_map<char, int>> + // base_mod_counts; // Type-specific base modification counts + // (canonical base -> modified base -> count) + std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts; // Type-specific base modification probabilities (canonical base -> modified base -> [read length %, probability]) + std::unordered_map<char, int> base_primary_count; // Total base counts for the 
alignment + + // Parse the base modification tags if a primary alignment + int read_mod_count = 0; + int ret = bam_parse_basemod(bam_record, state); + bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); + + // Update the number of reads with base modifications for the + // primary alignment vs other alignments + if (ret >= 0) { + if (is_primary) { + num_modified_bases_primary++; + } else if (bam_record->core.flag & BAM_FUNMAP) { + num_modified_bases_unmapped++; + } else if (bam_record->core.flag & BAM_FSECONDARY) { + num_modified_bases_secondary++; + } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) { + num_modified_bases_supplementary++; } + } - if (ret >= 0 && is_primary) { - // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); + if (ret >= 0 && is_primary) { + // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); - // Get the chromosome if alignments are present - bool alignments_present = true; - std::string chr; - std::map<int, int> query_to_ref_map; - if (bam_record->core.tid < 0) { - alignments_present = false; - } else { - chr = bam_header->target_name[bam_record->core.tid]; + // Get the chromosome if alignments are present + bool alignments_present = true; + std::string chr; + std::map<int, int> query_to_ref_map; + if (bam_record->core.tid < 0) { + alignments_present = false; + } else { + chr = bam_header->target_name[bam_record->core.tid]; - // Get the query to reference position mapping - query_to_ref_map = this->getQueryToRefMap(bam_record); - } + // Get the query to reference position mapping + query_to_ref_map = this->getQueryToRefMap(bam_record); + } - // Get the strand from the alignment flag (hts_base_mod uses 0 for positive and 1 for negative, - // 
but it always yields 0...) - int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0; + // Get the strand from the alignment flag (hts_base_mod uses 0 for positive and 1 for negative, + // but it always yields 0...) + int strand = (bam_record->core.flag & BAM_FREVERSE) ? 1 : 0; - // Get the number of each type of base for the read - uint8_t *seq = bam_get_seq(bam_record); - for (int i = 0; i < read_length; i++) { - char base = seq_nt16_str[bam_seqi(seq, i)]; - base_primary_count[std::toupper(base)]++; - } + // Get the number of each type of base for the read + uint8_t *seq = bam_get_seq(bam_record); + for (int i = 0; i < read_length; i++) { + char base = seq_nt16_str[bam_seqi(seq, i)]; + base_primary_count[std::toupper(base)]++; + } - // Iterate over the state object to get the base modification tags - // using bam_next_basemod - hts_base_mod mods[10]; - int n = 0; - int32_t pos = 0; - std::vector<int> query_pos; - bool first_mod_found = false; - while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) { - - for (int i = 0; i < n; i++) { - // Update the modified prediction counts - read_mod_count++; // Read-specific count - final_output.modified_prediction_count++; // Cumulative count - char canonical_base_char = std::toupper(mods[i].canonical_base); - char mod_type = mods[i].modified_base; - // base_mod_counts[mod_type]++; // Update the type-specific count - - // Note: The modified base value can be a positive char (e.g. 
'm', - // 'h') (DNA Mods DB) or negative integer (ChEBI ID): - // https://github.com/samtools/hts-specs/issues/741 - // DNA Mods: https://dnamod.hoffmanlab.org/ - // ChEBI: https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:21839 - // Header line: - // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2215 - - // Determine the probability of the modification (-1 if - // unknown) - double probability = -1; - if (mods[i].qual != -1) { - probability = mods[i].qual / 256.0; - - // Update counts for predictions exceeding the threshold - if (probability >= base_mod_threshold) { - final_output.updateBaseModCounts(mod_type, strand); // Update the base modification counts - // base_mod_counts[mod_type]++; // Update the - // type-specific count - base_mod_counts[canonical_base_char][mod_type]++; // Update the type-specific count - - // Store the modified positions for later CpG - // analysis if a C modification on a primary alignment - if (canonical_base_char == 'C' && mod_type != 'C') { - - // Convert the query position to reference position if available - if (alignments_present) { - if (query_to_ref_map.find(pos) != query_to_ref_map.end()) { - int32_t ref_pos = query_to_ref_map[pos]; - c_modified_positions.push_back(std::make_pair(ref_pos, strand)); - } + // Iterate over the state object to get the base modification tags + // using bam_next_basemod + hts_base_mod mods[10]; + int n = 0; + int32_t pos = 0; + std::vector<int> query_pos; + bool first_mod_found = false; + while ((n=bam_next_basemod(bam_record, state, mods, 10, &pos)) > 0) { + + for (int i = 0; i < n; i++) { + // Update the modified prediction counts + read_mod_count++; // Read-specific count + final_output.modified_prediction_count++; // Cumulative count + char canonical_base_char = std::toupper(mods[i].canonical_base); + char mod_type = mods[i].modified_base; + // base_mod_counts[mod_type]++; // Update the type-specific count + + // Note: The modified base value 
can be a positive char (e.g. 'm', + // 'h') (DNA Mods DB) or negative integer (ChEBI ID): + // https://github.com/samtools/hts-specs/issues/741 + // DNA Mods: https://dnamod.hoffmanlab.org/ + // ChEBI: https://www.ebi.ac.uk/chebi/searchId.do?chebiId=CHEBI:21839 + // Header line: + // https://github.com/samtools/htslib/blob/11205a9ba5e4fc39cc8bb9844d73db2a63fb8119/htslib/sam.h#L2215 + + // Determine the probability of the modification (-1 if + // unknown) + double probability = -1; + if (mods[i].qual != -1) { + probability = mods[i].qual / 256.0; + + // Update the read length % and probability for the + // modification + double read_len_pct = (double) (pos + 1) / read_length; + std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl; + final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability); // Update the base modification probabilities + + // Update counts for predictions exceeding the threshold + if (probability >= base_mod_threshold) { + final_output.updateBaseModCounts(mod_type, strand); // Update the base modification counts + // base_mod_counts[mod_type]++; // Update the + // type-specific count + // base_mod_counts[canonical_base_char][mod_type]++; // Update the type-specific count + + // Store the modified positions for later CpG + // analysis if a C modification on a primary alignment + if (canonical_base_char == 'C' && mod_type != 'C') { + + // Convert the query position to reference position if available + if (alignments_present) { + if (query_to_ref_map.find(pos) != query_to_ref_map.end()) { + int32_t ref_pos = query_to_ref_map[pos]; + c_modified_positions.push_back(std::make_pair(ref_pos, strand)); } } } - // } else { - // base_primary_count[mod_type]++; // Update the type-specific unmodified count - // } } + // } else { + // base_primary_count[mod_type]++; // Update the type-specific unmodified count + // } } } + } - // Append the modified positions to the output data - if 
(c_modified_positions.size() > 0) { - // Set the atomic flag and print a message if base - // modification tags are present in the file - if (!this->has_mm_ml_tags.test_and_set()) { - printMessage("Base modification data found (MM, ML tags)"); - } - - // Add the modified positions to the output data - if (final_output.sample_c_modified_positions.find(chr) == final_output.sample_c_modified_positions.end()) { - final_output.sample_c_modified_positions[chr] = c_modified_positions; - } else { - final_output.sample_c_modified_positions[chr].insert(final_output.sample_c_modified_positions[chr].end(), c_modified_positions.begin(), c_modified_positions.end()); - } + // Append the modified positions to the output data + if (c_modified_positions.size() > 0) { + // Set the atomic flag and print a message if base + // modification tags are present in the file + if (!this->has_mm_ml_tags.test_and_set()) { + printMessage("Base modification data found (MM, ML tags)"); } - } - hts_base_mod_state_free(state); // Deallocate the base modification state object - - // Calculate the base modification rate for the read - // double read_mod_rate = 0.0; - // if (read_length > 0) { - // read_mod_rate = (double) read_mod_count / read_length; - // } - - // Calculate the type-specific base modification rates for the read - std::unordered_map<char, double> base_mod_rates; - for (auto const &it : base_mod_counts) { - char canonical_base = it.first; - std::unordered_map<char, int> mod_counts = it.second; - double mod_rate = 0.0; - int total_base_count = base_primary_count[canonical_base]; - - // Calculate the modification rate for each modification type - for (auto const &it2 : mod_counts) { - char mod_type = it2.first; - int mod_count = it2.second; - double mod_rate = 0.0; - if (mod_count + total_base_count > 0) { - mod_rate = (double) mod_count / total_base_count; - } - base_mod_rates[mod_type] = mod_rate; + + // Add the modified positions to the output data + if 
(final_output.sample_c_modified_positions.find(chr) == final_output.sample_c_modified_positions.end()) { + final_output.sample_c_modified_positions[chr] = c_modified_positions; + } else { + final_output.sample_c_modified_positions[chr].insert(final_output.sample_c_modified_positions[chr].end(), c_modified_positions.begin(), c_modified_positions.end()); } - // for (auto const &it2 : mod_counts) { - // total_mod_count += it2.second; - // } - // if (total_mod_count + total_base_count > 0) { - // mod_rate = (double) total_mod_count / (total_mod_count + total_base_count); - // } - // base_mod_rates[canonical_base] = mod_rate; } - // for (auto const &it : base_mod_counts) { - // char mod_type = it.first; - // int mod_count = it.second; - // double mod_rate = 0.0; - // int total_base_count = base_primary_count[mod_type]; - // if (mod_count + unmod_count > 0) { - // mod_rate = (double) mod_count / (mod_count + unmod_count); - // } - // // if (read_length > 0) { - // // mod_rate = (double) mod_count / read_length; - // // } - // base_mod_rates[mod_type] = mod_rate; - // } - final_output.updateReadModRate(read_length, base_mod_rates); // Update the output data } + hts_base_mod_state_free(state); // Deallocate the base modification state object + + // Calculate the base modification rate for the read + // double read_mod_rate = 0.0; + // if (read_length > 0) { + // read_mod_rate = (double) read_mod_count / read_length; + // } + + // Calculate the type-specific base modification rates for the read + // std::unordered_map<char, double> base_mod_rates; + // for (auto const &it : base_mod_counts) { + // char canonical_base = it.first; + // std::unordered_map<char, int> mod_counts = it.second; + // double mod_rate = 0.0; + // int total_base_count = base_primary_count[canonical_base]; + + // // Calculate the modification rate for each modification type + // for (auto const &it2 : mod_counts) { + // char mod_type = it2.first; + // int mod_count = it2.second; + // double mod_rate = 
0.0; + // if (mod_count + total_base_count > 0) { + // mod_rate = (double) mod_count / total_base_count; + // } + // base_mod_rates[mod_type] = mod_rate; + // } + // // for (auto const &it2 : mod_counts) { + // // total_mod_count += it2.second; + // // } + // // if (total_mod_count + total_base_count > 0) { + // // mod_rate = (double) total_mod_count / (total_mod_count + total_base_count); + // // } + // // base_mod_rates[canonical_base] = mod_rate; + // } + // for (auto const &it : base_mod_counts) { + // char mod_type = it.first; + // int mod_count = it.second; + // double mod_rate = 0.0; + // int total_base_count = base_primary_count[mod_type]; + // if (mod_count + unmod_count > 0) { + // mod_rate = (double) mod_count / (mod_count + unmod_count); + // } + // // if (read_length > 0) { + // // mod_rate = (double) mod_count / read_length; + // // } + // base_mod_rates[mod_type] = mod_rate; + // } + // final_output.updateReadModRate(read_length, base_mod_rates); // Update the output data } // Summary of base modification counts - if (mod_analysis) { - printMessage("Base modification counts:"); - printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary)); - printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped)); - printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary)); - printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary)); - } + printMessage("Base modification counts:"); + printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary)); + printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped)); + printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary)); + printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary)); bam_destroy1(bam_record); bam_hdr_destroy(bam_header); sam_close(bam_file); - - return num_reads; } // Get the mapping of query 
positions to reference positions for a given alignment record diff --git a/src/output_data.cpp b/src/output_data.cpp index 148fbeb..edf85d2 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -290,6 +290,12 @@ void Output_BAM::updateBaseModCounts(char mod_type, int strand) } } +void Output_BAM::updateBaseModProbabilities(char mod_type, double pct_len, double probability) +{ + // Update the base modification probabilities + this->read_pct_len_vs_mod_prob[mod_type].push_back(std::make_pair(pct_len, probability)); +} + void Output_BAM::updateReadModRate(int read_length, const std::unordered_map<char, double>& base_mod_rates) { ReadModData read_mod_data; read_mod_data.read_length = read_length; @@ -357,6 +363,40 @@ uint64_t Output_BAM::getModTypeCount(char mod_type, int strand) } } +double Output_BAM::getNthReadLenPct(int read_index, char mod_type) +{ + double read_len_pct = 0.0; + try { + this->read_pct_len_vs_mod_prob.at(mod_type); + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Read length percentage not found for type " << mod_type << std::endl; + } + try { + read_len_pct = this->read_pct_len_vs_mod_prob[mod_type].at(read_index).first; + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Read length percentage not found for read index " << read_index << " and type " << mod_type << std::endl; + return 0.0; + } + return read_len_pct; +} + +double Output_BAM::getNthReadModProb(int read_index, char mod_type) +{ + double mod_prob = 0.0; + try { + this->read_pct_len_vs_mod_prob.at(mod_type); + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Modification probability not found for type " << mod_type << std::endl; + } + try { + mod_prob = this->read_pct_len_vs_mod_prob[mod_type].at(read_index).second; + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Modification probability not found for read index " << read_index << " and type " << mod_type << std::endl; + return 0.0; + } + return mod_prob; +} + int 
Output_BAM::getReadCount() { return this->read_move_table.size(); diff --git a/src/plot_utils.py b/src/plot_utils.py index f9b8239..f6bc76f 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -1059,33 +1059,32 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th logging.info("Getting base modification statistics") # Get the read length vs. base modification rate data for each - # modification type - logging.info("Getting mod data size") - read_mod_data_size = output_data.getReadModDataSize() - logging.info("Mod data size: {}".format(read_mod_data_size)) - - # Choose a maximum of 10,000 reads to randomly sample for the plot - max_reads = min(read_mod_data_size, 10000) - # read_indices = set(sample(range(read_mod_data_size), max_reads)) - read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False) - read_length_mod_rates = {} - - # Get the read length vs. base modification rate data for each - # modification type in the sampled reads - for i in read_indices: - for mod_type in base_mod_types: - if mod_type not in read_length_mod_rates: - read_length_mod_rates[mod_type] = [] - - # logging.info("Getting read length for read {}".format(i)) - # read_length = output_data.getNthReadModLength(i) - read_length = output_data.getNthReadModLength(int(i)) - # logging.info("Getting read length vs. 
{} modification rate".format(mod_type)) - # mod_rate = output_data.getNthReadModRate(i, mod_type) - mod_rate = output_data.getNthReadModRate(int(i), mod_type) - # logging.info("Read length: {}, {} modification rate: {}".format(read_length, mod_type, mod_rate)) - read_length_mod_rates[mod_type].append((read_length, mod_rate)) - + # # modification type + # logging.info("Getting mod data size") + # read_mod_data_size = output_data.getReadModDataSize() + # logging.info("Mod data size: {}".format(read_mod_data_size)) + + # # Choose a maximum of 10,000 reads to randomly sample for the plot + # max_reads = min(read_mod_data_size, 10000) + # # read_indices = set(sample(range(read_mod_data_size), max_reads)) + # read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False) + # read_length_mod_rates = {} + + # Get the read length (%) vs. base modification probability data for + # each sampled read + sample_count = 10000 + read_len_pct = [] + mod_prob = [] + for mod_type in base_mod_types: + for i in range(sample_count): + try: + pct = output_data.getNthReadLenPct(i, mod_type) + prob = output_data.getNthReadModProb(i, mod_type) + read_len_pct.append(pct) + mod_prob.append(prob) + except Exception as e: + logging.error(f"Error getting read length vs. base modification probability data: {e}") + # Dictionary of modification character to full name mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \ 'g': '5hmU', 'e': '5fu', 'b': '5caU', \ @@ -1094,78 +1093,46 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th 'N': 'Amb. N', \ 'v': 'pseU'} - # Create a plot of read length vs. 
base modification rate for each - # modification type - # Make subplots vertically for each modification type - subplot_titles = [] - for mod_type in base_mod_types: - try: - mod_name = mod_char_to_name[mod_type] - except KeyError: - logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) - mod_name = mod_type + # Create a plot of pct read length vs. base modification probability for + # each modification type, as well as a histogram of the average base + # modification probability for 100 bins of the read length + + # Make a subplot of two columns for the read length vs. base + # modification probability and the histogram of the average base + # modification probability for each modification type + fig = make_subplots(rows=len(base_mod_types), cols=2, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=[f"{mod_char_to_name[mod_type]} Modification Probability" for mod_type in base_mod_types]) - subplot_titles.append('Read Length vs. {} Modification Rate'.format(mod_name)) - - - fig = make_subplots(rows=len(base_mod_types), cols=1, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=subplot_titles) - min_x = float('inf') - max_x = 0 for i, mod_type in enumerate(base_mod_types): + logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}") - # Format the data - mod_data = read_length_mod_rates[mod_type] - mod_rates = [data[1] * 100 for data in mod_data] - x_vals = [data[0] for data in mod_data] - - # Remove outlier read lengths using the IQR method - if len(x_vals) > 1: - x_vals_np = np.array(x_vals) - q1 = np.percentile(x_vals_np, 25) - q3 = np.percentile(x_vals_np, 75) - iqr = q3 - q1 - lower_bound = q1 - 1.5 * iqr - upper_bound = q3 + 1.5 * iqr - - # Filter the data to remove outliers - filtered_data = [(x, y) for x, y in zip(x_vals, mod_rates) if lower_bound <= x <= upper_bound] - x_vals, mod_rates = zip(*filtered_data) - - # Normalize the read lengths to the maximum read 
length (0-100) - x_vals = [100 * x / max(x_vals) for x in x_vals] - - # Use 0-100 for the x-axis ticks and labels - x_tick_values = np.arange(0, 101, 10) - x_tick_labels = ['{:,}%'.format(int(val)) for val in x_tick_values] - - # Get the modification name - try: - mod_name = mod_char_to_name[mod_type] - except KeyError: - logging.warning("WARNING: Unknown modification type: {}".format(mod_type)) - mod_name = mod_type - - # fig.add_trace(go.Scattergl(x=x_vals, y=mod_rates, mode='markers', name=mod_name, showlegend=False), row=i + 1, col=1) - - # Create a heatmap plot - fig.add_trace(go.Histogram2dContour(x=x_vals, y=mod_rates, colorscale='Viridis', showlegend=False), row=i + 1, col=1) - - # Update the layout - x_axis_name = get_axis_name(i) - y_axis_name = get_axis_name(i, 'y') - logging.info("Index: {}, Y index: {}".format(i, y_axis_name)) + # Add the trace for the read length vs. base modification + # probability scatter plot + fig.add_trace(go.Scatter + (x=read_len_pct, y=mod_prob, mode='markers', name=mod_char_to_name[mod_type], marker=dict(size=5), showlegend=False), + row=i + 1, col=1) - # Auto range the axes - fig.update_layout( - **{f"{x_axis_name}_title": 'Normalized Read Length (%)', - f"{y_axis_name}_title": 'Modification Rate (%)'}, - **{f"{x_axis_name}_tickmode": 'array', - f"{x_axis_name}_tickvals": x_tick_values, - f"{x_axis_name}_ticktext": x_tick_labels}, - **{f"{y_axis_name}_range": [0, 100]} - ) - - fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) + # Add a bar plot of the average base modification probability for + # 100 bins of the read length + bins = np.linspace(0, 100, 101) + bin_indices = np.digitize(read_len_pct, bins) + avg_prob_per_bin = np.zeros(100) + bin_centers = (bins[:-1] + bins[1:]) / 2 + + for j in range(100): + bin_mask = bin_indices == j + avg_prob_per_bin[j] = np.mean([mod_prob[k] for k in range(len(read_len_pct)) if bin_mask[k]]) + + # Create the bar plot + fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, 
name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2) + + # Update the plot style + fig.update_xaxes(title="Read Length (%)", row=i + 1, col=1) + fig.update_yaxes(title="Modification Probability", row=i + 1, col=1) + fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2) + fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2) + + # Update the plot layout + fig.update_layout(title="Read Length vs. Base Modification Probability", font=dict(size=PLOT_FONT_SIZE)) # Generate the HTML if len(base_mod_types) > 0: diff --git a/src/tin.cpp b/src/tin.cpp index 0c2d0c4..3beae15 100644 --- a/src/tin.cpp +++ b/src/tin.cpp @@ -171,7 +171,7 @@ bool checkMinReads(htsFile* bam_file, hts_idx_t* idx, bam_hdr_t* header, std::st return min_reads_met; } -void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder) +void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::string& bam_filepath, int min_cov, int sample_size, const std::string& output_folder, int thread_count) { std::cout << "Using TIN minimum coverage " << min_cov << " and sample size " << sample_size << std::endl; @@ -182,6 +182,9 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s exit(1); } + // Enable multi-threading + hts_set_threads(bam_file, thread_count); + // Read the BAM header bam_hdr_t* header = sam_hdr_read(bam_file); if (header == NULL) { @@ -206,6 +209,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s // Loop through the gene BED file and calculate the TIN score for each // transcript + std::cout << "Calculating TIN scores for each transcript..." 
<< std::endl; std::vector<double> TIN_scores; std::vector<std::string> gene_ids; std::string line; @@ -396,6 +400,11 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s // Store the TIN score for the transcript tin_map[name] = std::make_tuple(chrom, start, end, TIN); + + // Log every 1000 transcripts + if (gene_ids.size() % 1000 == 0) { + std::cout << "Processed " << gene_ids.size() << " transcripts" << std::endl; + } } // Close the BAM file @@ -413,6 +422,7 @@ void calculateTIN(TINStats* tin_stats, const std::string& gene_bed, const std::s if (TIN_scores.size() == 0) { std::cerr << "No TIN scores calculated" << std::endl; } else { + std::cout << "Calculating TIN summary for " << TIN_scores.size() << " transcripts..." << std::endl; // Print the TIN mean, median, and standard deviation double TIN_sum = 0; From 1607fb075fc537bb1b98db2f13abdd4dbfeef74d Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Wed, 15 Jan 2025 18:36:23 -0500 Subject: [PATCH 15/25] Fix bam cleanup error --- src/hts_reader.cpp | 21 +++++++++++++-- src/plot_utils.py | 64 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 68 insertions(+), 17 deletions(-) diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 01e0524..f46606d 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -12,6 +12,7 @@ Class for reading a set number of records from a BAM file. 
Used for multi-thread #include <fstream> #include <math.h> #include <algorithm> // std::find +#include <random> #include <htslib/sam.h> #include "utils.h" @@ -361,6 +362,11 @@ int HTSReader::getNumRecords(const std::string& bam_filename, int thread_count) num_reads++; } + // Close the BAM file + bam_destroy1(bam_record); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + return num_reads; } @@ -372,15 +378,26 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out bam1_t* bam_record = bam_init1(); int64_t num_reads = 0; + // Create a random number generator and seed it with the current time + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::default_random_engine generator(seed); + // Create a list of read indices to sample, and only keep the first // sample_count reads std::vector<int> read_indices; for (int i = 0; i < read_count; i++) { read_indices.push_back(i); } - std::random_shuffle(read_indices.begin(), read_indices.end()); + std::shuffle(read_indices.begin(), read_indices.end(), generator); read_indices.resize(sample_count); + // Print first 100 read indices sorted + // std::sort(read_indices.begin(), read_indices.end()); + // std::cout << "First 100 read indices: " << std::endl; + // for (int i = 0; i < 100; i++) { + // std::cout << read_indices[i] << std::endl; + // } + // Convert to a set for fast lookup std::unordered_set<int> read_indices_set(read_indices.begin(), read_indices.end()); @@ -496,7 +513,7 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out // Update the read length % and probability for the // modification double read_len_pct = (double) (pos + 1) / read_length; - std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl; + // std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl; final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability); // 
Update the base modification probabilities // Update counts for predictions exceeding the threshold diff --git a/src/plot_utils.py b/src/plot_utils.py index f6bc76f..7686d1f 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -1084,6 +1084,10 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th mod_prob.append(prob) except Exception as e: logging.error(f"Error getting read length vs. base modification probability data: {e}") + + # Convert the lists to numpy arrays + read_len_pct = np.array(read_len_pct) * 100 # Convert to percentage + mod_prob = np.array(mod_prob) # Dictionary of modification character to full name mod_char_to_name = {'m': '5mC', 'h': '5hmC', 'f': '5fC', 'c': '5caC', \ @@ -1111,25 +1115,53 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th (x=read_len_pct, y=mod_prob, mode='markers', name=mod_char_to_name[mod_type], marker=dict(size=5), showlegend=False), row=i + 1, col=1) + # Print the first 50 pairs sorted by read length for debugging + # read_len_pct, mod_prob = zip(*sorted(zip(read_len_pct, mod_prob))) + # if i == 0: + # for j in range(50): + # logging.info(f"Read length: {read_len_pct[j]}, Modification probability: {mod_prob[j]}") + + # # Create a histogram of the base modification probabilities + # base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20) + # fig.add_trace(base_mod_prob_hist, row=i + 1, col=2) + # Add a bar plot of the average base modification probability for # 100 bins of the read length - bins = np.linspace(0, 100, 101) - bin_indices = np.digitize(read_len_pct, bins) - avg_prob_per_bin = np.zeros(100) - bin_centers = (bins[:-1] + bins[1:]) / 2 - - for j in range(100): - bin_mask = bin_indices == j - avg_prob_per_bin[j] = np.mean([mod_prob[k] for k in range(len(read_len_pct)) if bin_mask[k]]) - - # Create the bar plot - fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, 
name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2) + # bins = np.linspace(0, 100, 11) # 10 bins (0-10%, 10-20%, ..., 90-100%) + # bin_centers = (bins[:-1] + bins[1:]) / 2 # Bin centers for plotting + + # # Get the average probability per bin + # avg_prob_per_bin = np.zeros(10) + # bin_indices = np.digitize(read_len_pct, bins) - 1 + # for j in range(10): # Loop over bins + # bin_mask = (bin_indices == j) + # if np.any(bin_mask): + # avg_prob_per_bin[j] = np.mean(mod_prob[bin_mask]) + # logging.info(f"Bin {j}: {avg_prob_per_bin[j]}") + + # # Create the bar plot + + # # Print the bins and read length percentages for the first 10 reads + # # for debugging + # if i == 0: + # logging.info("Bins: {}".format(bins)) + # logging.info("Bin indices: {}".format(bin_indices[:10])) + # logging.info("Read length percentages: {}".format(read_len_pct[:10])) + + # # Create the bar plot + # fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2) # Update the plot style fig.update_xaxes(title="Read Length (%)", row=i + 1, col=1) fig.update_yaxes(title="Modification Probability", row=i + 1, col=1) - fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2) - fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2) + fig.update_xaxes(title="Modification Probability", row=i + 1, col=2) + fig.update_yaxes(title="Frequency", row=i + 1, col=2) + # fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2) + # fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2) + + # Set the range of the y-axis to 0-1 + fig.update_yaxes(range=[0, 1], row=i + 1, col=1) + # fig.update_yaxes(range=[0, 1], row=i + 1, col=2) # Update the plot layout fig.update_layout(title="Read Length vs. 
Base Modification Probability", font=dict(size=PLOT_FONT_SIZE)) @@ -1137,12 +1169,14 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Generate the HTML if len(base_mod_types) > 0: plot_height = 500 * len(base_mod_types) + plot_width = 700 * 2 logging.info("Saving the read length vs. modification rates plot") - plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=700) + plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=plot_width) else: logging.warning("WARNING: No modification types found") - # Create the base modification statistics table + # Create the base modification statistics table' + logging.info("Creating the base modification statistics table") table_str = "<table>\n<tbody>" row_str, row_flag = format_row("Total Unfiltered Predictions", [output_data.modified_prediction_count], 'int', None) table_str += row_str From 3e2efdd0853cbefae9a660d6d1eca7cead0f410f Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Thu, 16 Jan 2025 16:17:55 -0500 Subject: [PATCH 16/25] Fix Q-score distribution --- src/fastq_module.cpp | 66 ++++++++++++++++++++++++++++++++++++++------ src/output_data.cpp | 43 +++++++++++++++++++++++------ src/plot_utils.py | 6 ++-- 3 files changed, 96 insertions(+), 19 deletions(-) diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp index e16dadd..44e70bb 100644 --- a/src/fastq_module.cpp +++ b/src/fastq_module.cpp @@ -9,6 +9,7 @@ #include <fstream> #include <iostream> +#include <sstream> #include <sys/stat.h> #include <sys/types.h> @@ -20,7 +21,7 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out int exit_code = 0; int read_len; double read_gc_cnt; - double read_mean_base_qual; + // double read_mean_base_qual; Basic_Seq_Statistics &long_read_info = output_data.long_read_info; Basic_Seq_Quality_Statistics 
&seq_quality_info = output_data.seq_quality_info; long_read_info.total_num_reads = ZeroDefault; // total number of long reads @@ -62,10 +63,33 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out // Store the read length long_read_info.read_lengths.push_back(read_len); + // Access base quality data + // printMessage("[TEST1] Base quality string: " + raw_read_qual); + char value; + std::vector<int> base_quality_values; + // std::string base_quality_str = raw_read_qual; + std::istringstream iss(raw_read_qual); + while (iss >> value) + { + int base_quality_value = value - '!'; + base_quality_values.push_back(base_quality_value); + // printMessage("[TEST1] Base quality value: " + std::to_string(base_quality_value)); + } + + // Ensure that the base quality string has the same length as + // the read sequence + if (base_quality_values.size() != read_len) + { + printError("Error: Base quality string length does not match read sequence length"); + exit_code = 1; + break; + } + // Process base and quality information read_gc_cnt = 0; - read_mean_base_qual = 0; - uint64_t base_quality_value; + // read_mean_base_qual = 0; + int base_quality_value; + double cumulative_base_prob = 0; // Read cumulative base quality probability for (int i = 0; i < read_len; i++) { if (read_seq[i] == 'A' || read_seq[i] == 'a') @@ -86,15 +110,30 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out { long_read_info.total_tu_cnt += 1; } - base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset; + + // Get the base quality (Phred) value + base_quality_value = base_quality_values[i]; + // base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset; try { seq_quality_info.base_quality_distribution[base_quality_value] += 1; } catch (const std::out_of_range& oor) { printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value"); } - read_mean_base_qual 
+= (double) base_quality_value; + // read_mean_base_qual += (double) base_quality_value; + + // Convert the Phred quality value to a probability + double base_quality_prob = pow(10, -base_quality_value / 10.0); + cumulative_base_prob += base_quality_prob; } + // Calculate the mean base quality probability + cumulative_base_prob /= (double)read_len; + + // Convert the mean base quality probability to a Phred quality + // value + double read_mean_base_qual = -10.0 * log10(cumulative_base_prob); + // printMessage("Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual)); + // Update the per-read GC content distribution double gc_content_pct = (100.0 * read_gc_cnt) / static_cast<double>(read_len); int gc_content_int = static_cast<int>(std::round(gc_content_pct)); @@ -105,13 +144,24 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out } // Update the per-read base quality distribution - double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len); - unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct)); + // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len); + // unsigned int read_mean_base_qual_int = static_cast<unsigned + // int>(std::round(read_mean_base_qual_pct)); + int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual)); + + // printMessage("Rounded Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual_int)); + try { - seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; + seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1; } catch (const std::out_of_range& oor) { printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); } + + // try { + // seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; + // } 
catch (const std::out_of_range& oor) { + // printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); + // } fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual); // Write to file } diff --git a/src/output_data.cpp b/src/output_data.cpp index edf85d2..2401847 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -707,11 +707,17 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ base_quality_values.push_back(base_quality_value); } + // Ensure the base quality values match the sequence length + if (base_quality_values.size() != base_count) { + printError("Warning: Base quality values do not match the sequence length for read ID " + std::string(read_name)); + } + // Update the base quality and GC content information int gc_count = 0; - double read_mean_base_qual = 0; + // double read_mean_base_qual = 0; + double cumulative_base_prob = 0; // Read cumulative base quality probability char current_base; - uint64_t base_quality_value; + int base_quality_value; for (int i = 0; i < base_count; i++) { current_base = sequence_data_str[i]; @@ -733,16 +739,30 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ { long_read_info.total_tu_cnt += 1; } - // Get the base quality - base_quality_value = (uint64_t)base_quality_values[i]; + // Get the base quality (Phred) value + base_quality_value = base_quality_values[i]; + + // Update the per-base quality distribution try { seq_quality_info.base_quality_distribution[base_quality_value] += 1; } catch (const std::out_of_range& oor) { printError("Warning: Base quality value " + std::to_string(base_quality_value) + " exceeds maximum value"); } - read_mean_base_qual += (double)base_quality_value; + + // Convert the Phred quality value to a probability + double base_quality_prob = pow(10, -base_quality_value / 10.0); + // read_mean_base_qual += 
(double)base_quality_value; + cumulative_base_prob += base_quality_prob; } + // Calculate the mean base quality probability + cumulative_base_prob /= (double)base_count; + + // Convert the mean base quality probability to a Phred quality value + double read_mean_base_qual = -10.0 * log10(cumulative_base_prob); + + // printMessage("Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual)); + // Calculate percent guanine & cytosine // gc_content_pct = 100.0 *( (double)gc_count / (double)base_count ); @@ -756,10 +776,17 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ } // Update the per-read base quality distribution - double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count); - unsigned int read_mean_base_qual_int = static_cast<unsigned int>(std::round(read_mean_base_qual_pct)); + // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count); + // unsigned int read_mean_base_qual_int = static_cast<unsigned + // int>(std::round(read_mean_base_qual_pct)); + int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual)); + + // printMessage("Rounded Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual_int)); + try { - seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; + // seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] + // += 1; + seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1; } catch (const std::out_of_range& oor) { printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); } diff --git a/src/plot_utils.py b/src/plot_utils.py index 7686d1f..472f935 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -1121,9 +1121,9 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # for j in range(50): # 
logging.info(f"Read length: {read_len_pct[j]}, Modification probability: {mod_prob[j]}") - # # Create a histogram of the base modification probabilities - # base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20) - # fig.add_trace(base_mod_prob_hist, row=i + 1, col=2) + # Create a histogram of the base modification probabilities + base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20) + fig.add_trace(base_mod_prob_hist, row=i + 1, col=2) # Add a bar plot of the average base modification probability for # 100 bins of the read length From e9006da117abf33227b45be46787130031c43797 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Mon, 20 Jan 2025 15:11:42 -0500 Subject: [PATCH 17/25] Add bam read avg base quality plots --- README.md | 7 +- include/hts_reader.h | 2 +- include/output_data.h | 2 +- src/bam_module.cpp | 23 ++---- src/cli.py | 4 +- src/fast5_module.cpp | 12 +--- src/fastq_module.cpp | 41 ++++------- src/hts_reader.cpp | 158 +++++++++++------------------------------- src/output_data.cpp | 37 +++------- src/plot_utils.py | 122 ++++++++------------------------ 10 files changed, 110 insertions(+), 298 deletions(-) diff --git a/README.md b/README.md index 1ce0df6..8184491 100644 --- a/README.md +++ b/README.md @@ -258,7 +258,12 @@ longreadsum bam -i $INPUT_FILE -o $OUTPUT_DIRECTORY # ONT POD5 This section describes how to generate QC reports for ONT POD5 (signal) files and their corresponding basecalled BAM files (data shown is HG002 using ONT -R10.4.1 and LSK114 downloaded from the tutorial https://github.com/epi2me-labs/wf-basecalling). +R10.4.1 and LSK114 downloaded from the tutorial +https://github.com/epi2me-labs/wf-basecalling). + +> [!NOTE] +> This requires generating basecalled BAM files with the move table output. 
For +> example, for [dorado](https://github.com/nanoporetech/dorado), the parameter is `--emit-moves`  diff --git a/include/hts_reader.h b/include/hts_reader.h index 5d3a628..bc8f5d8 100644 --- a/include/hts_reader.h +++ b/include/hts_reader.h @@ -38,7 +38,7 @@ class HTSReader { bool reading_complete = false; // Update read and base counts - int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t *base_quality_distribution, bool is_primary); + int updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, Basic_Seq_Quality_Statistics& seq_quality_info, bool is_primary); // Read the next batch of records from the BAM file int readNextRecords(int batch_size, Output_BAM & output_data, std::mutex & read_mutex, std::unordered_set<std::string>& read_ids, double base_mod_threshold); diff --git a/include/output_data.h b/include/output_data.h index bca521f..2e1610d 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -79,7 +79,7 @@ class Basic_Seq_Quality_Statistics //std::vector<uint64_t> base_quality_distribution; // Array of base quality distribution initialized to 0 uint64_t base_quality_distribution[MAX_BASE_QUALITY] = {ZeroDefault}; - std::vector<int> read_average_base_quality_distribution; + std::vector<int> read_average_base_quality_distribution; // Read average base quality distribution int min_base_quality = MoneDefault; // minimum base quality; int max_base_quality = MoneDefault; // maximum base quality; std::vector<int> pos_quality_distribution; diff --git a/src/bam_module.cpp b/src/bam_module.cpp index d2a96eb..0014509 100644 --- a/src/bam_module.cpp +++ b/src/bam_module.cpp @@ -154,41 +154,27 @@ int BAM_Module::calculateStatistics(Input_Para &input_params, Output_BAM &final_ // Calculate statistics in batches printMemoryUsage("Before batch processing"); - - // TEST - // int max_reads = 10; - // int current_reads = 0; while (reader.hasNextRecord()){ - // while (current_reads < max_reads && 
reader.hasNextRecord()){ // Read the next batch of records - std::cout << "Generating " << thread_count << " thread(s)..." << std::endl; + // std::cout << "Generating " << thread_count << " thread(s)..." << + // std::endl; + printMessage("Generating " + std::to_string(thread_count) + " thread(s)..."); std::vector<std::thread> thread_vector; for (int thread_index=0; thread_index<thread_count; thread_index++){ - - // Copy the input read IDs to a new vector std::unordered_set<std::string> rrms_read_ids_copy = input_params.rrms_read_ids; - - // Create a thread std::thread t((BAM_Module::batchStatistics), std::ref(reader), batch_size, rrms_read_ids_copy,std::ref(final_output), std::ref(bam_mutex), std::ref(output_mutex), std::ref(cout_mutex), base_mod_threshold); - - // Add the thread to the vector thread_vector.push_back(std::move(t)); } // Join the threads in thread_vector - std::cout<<"Joining threads..."<<std::endl; + // std::cout<<"Joining threads..."<<std::endl; int thread_index = 0; for (auto& t : thread_vector){ - // Join the thread if it is joinable if (t.joinable()){ t.join(); } - printMemoryUsage("After thread " + std::to_string(thread_index)); thread_index++; - - // TEST - Increment the current reads - // current_reads += batch_size; } std::cout << "All threads joined." << std::endl; } @@ -257,6 +243,7 @@ void BAM_Module::batchStatistics(HTSReader& reader, int batch_size, std::unorder { // Read the next N records Output_BAM record_output; + printMessage("Reading next batch of records... " + std::to_string(batch_size)); reader.readNextRecords(batch_size, record_output, bam_mutex, read_ids, base_mod_threshold); // Update the final output diff --git a/src/cli.py b/src/cli.py index ebf10d6..80b23ba 100644 --- a/src/cli.py +++ b/src/cli.py @@ -165,7 +165,7 @@ def fq_module(margs): def fa_module(margs): - # Run the FASTA filetype module. 
+ """FASTA file input module.""" # Get the filetype-specific parameters param_dict = get_common_param(margs) @@ -253,7 +253,7 @@ def bam_module(margs): plot_filepaths = plot(bam_output, param_dict, 'BAM') # Set the list of QC information to display - qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality"] + qc_info_list = ["basic_st", "read_alignments_bar", "base_alignments_bar", "read_length_bar", "read_length_hist", "gc_content_hist", "base_counts", "base_quality", "read_avg_base_quality"] # If base modifications were found, add the base modification plots # after the first table diff --git a/src/fast5_module.cpp b/src/fast5_module.cpp index ceab46f..11b2909 100644 --- a/src/fast5_module.cpp +++ b/src/fast5_module.cpp @@ -470,12 +470,6 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat { int exit_code = 0; -// // Open the CSV files -// std::ofstream raw_csv; -// raw_csv.open(signal_raw_csv); -// std::ofstream qc_csv; -// qc_csv.open(signal_qc_csv); - // Run QC on the HDF5 file //H5::Exception::dontPrint(); // Disable error printing try { @@ -554,11 +548,7 @@ static int writeSignalQCDetails(const char *input_file, Output_FAST5 &output_dat catch (std::exception& e) { std::cerr << "Exception caught : " << e.what() << std::endl; } - -// // Close the CSV files -// raw_csv.close(); -// qc_csv.close(); - + return exit_code; } diff --git a/src/fastq_module.cpp b/src/fastq_module.cpp index 44e70bb..3d87a32 100644 --- a/src/fastq_module.cpp +++ b/src/fastq_module.cpp @@ -21,13 +21,13 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out int exit_code = 0; int read_len; double read_gc_cnt; - // double read_mean_base_qual; Basic_Seq_Statistics &long_read_info = output_data.long_read_info; Basic_Seq_Quality_Statistics &seq_quality_info = output_data.seq_quality_info; long_read_info.total_num_reads = 
ZeroDefault; // total number of long reads long_read_info.longest_read_length = ZeroDefault; // the length of longest reads std::ifstream input_file_stream(input_file); + int count = 0; if (!input_file_stream.is_open()) { fprintf(stderr, "Failed to open file for reading: %s\n", input_file); @@ -38,6 +38,7 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out { if (line[0] == '@') { + count++; read_name = line.substr(1); read_name = read_name.substr(0, read_name.find_first_of(" \t")); std::getline(input_file_stream, read_seq); @@ -64,16 +65,13 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out long_read_info.read_lengths.push_back(read_len); // Access base quality data - // printMessage("[TEST1] Base quality string: " + raw_read_qual); char value; std::vector<int> base_quality_values; - // std::string base_quality_str = raw_read_qual; std::istringstream iss(raw_read_qual); while (iss >> value) { int base_quality_value = value - '!'; base_quality_values.push_back(base_quality_value); - // printMessage("[TEST1] Base quality value: " + std::to_string(base_quality_value)); } // Ensure that the base quality string has the same length as @@ -87,7 +85,6 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out // Process base and quality information read_gc_cnt = 0; - // read_mean_base_qual = 0; int base_quality_value; double cumulative_base_prob = 0; // Read cumulative base quality probability for (int i = 0; i < read_len; i++) @@ -113,13 +110,11 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out // Get the base quality (Phred) value base_quality_value = base_quality_values[i]; - // base_quality_value = (uint64_t)raw_read_qual[i] - (uint64_t)fastq_base_qual_offset; try { seq_quality_info.base_quality_distribution[base_quality_value] += 1; } catch (const std::out_of_range& oor) { printError("Warning: Base quality value " + 
std::to_string(base_quality_value) + " exceeds maximum value"); } - // read_mean_base_qual += (double) base_quality_value; // Convert the Phred quality value to a probability double base_quality_prob = pow(10, -base_quality_value / 10.0); @@ -132,7 +127,14 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out // Convert the mean base quality probability to a Phred quality // value double read_mean_base_qual = -10.0 * log10(cumulative_base_prob); - // printMessage("Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual)); + + // Update the per-read base quality distribution + int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual)); + try { + seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; + } catch (const std::out_of_range& oor) { + printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); + } // Update the per-read GC content distribution double gc_content_pct = (100.0 * read_gc_cnt) / static_cast<double>(read_len); @@ -142,28 +144,9 @@ int qc1fastq(const char *input_file, char fastq_base_qual_offset, Output_FQ &out } catch (const std::out_of_range& oor) { printError("Warning: Invalid GC content value " + std::to_string(gc_content_int)); } - - // Update the per-read base quality distribution - // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(read_len); - // unsigned int read_mean_base_qual_int = static_cast<unsigned - // int>(std::round(read_mean_base_qual_pct)); - int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual)); - - // printMessage("Rounded Mean Q Score for read ID " + read_name + " is " + std::to_string(read_mean_base_qual_int)); - try { - seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1; - } catch (const std::out_of_range& oor) { - printError("Warning: Base quality value " + 
std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); - } - - // try { - // seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] += 1; - // } catch (const std::out_of_range& oor) { - // printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); - // } - - fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual); // Write to file + // Write read details to file + fprintf(read_details_fp, "%s\t%d\t%.2f\t%.2f\n", read_name.c_str(), read_len, gc_content_pct, read_mean_base_qual); } } input_file_stream.close(); diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index f46606d..1a7e53f 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -36,7 +36,7 @@ HTSReader::~HTSReader(){ } // Update read and base counts -int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, uint64_t* base_quality_distribution, bool is_primary) { +int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& basic_qc, Basic_Seq_Quality_Statistics& seq_quality_info, bool is_primary) { // Update read QC basic_qc.total_num_reads++; // Update the total number of reads @@ -47,11 +47,16 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& bas // Get base counts, quality, and GC content double read_gc_count = 0.0; // For GC content calculation double read_base_total = 0.0; // For GC content calculation + double cumulative_base_prob = 0.0; // For mean base quality probability calculation uint8_t *seq = bam_get_seq(record); for (int i = 0; i < read_length; i++) { // Get the base quality and update the base quality histogram - uint64_t base_quality = (uint64_t)bam_get_qual(record)[i]; - base_quality_distribution[base_quality]++; + int base_quality = (int)bam_get_qual(record)[i]; + seq_quality_info.base_quality_distribution[(uint64_t)base_quality]++; + + // Convert the Phred 
quality value to a probability + double base_quality_prob = pow(10, -base_quality / 10.0); + cumulative_base_prob += base_quality_prob; // Get the base and update the base count char base = seq_nt16_str[bam_seqi(seq, i)]; @@ -84,6 +89,20 @@ int HTSReader::updateReadAndBaseCounts(bam1_t* record, Basic_Seq_Statistics& bas } } + // Calculate the mean base quality probability + cumulative_base_prob /= (double)read_length; + + // Convert the mean base quality probability to a Phred quality value + double read_mean_base_qual = -10.0 * log10(cumulative_base_prob); + + // Update the per-read mean base quality distribution + int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual)); + try { + seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int]++; + } catch (const std::out_of_range& oor) { + printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); + } + // Calculate the read GC content percentage if a primary alignment if (is_primary) { double gc_content = read_gc_count / read_base_total; @@ -117,9 +136,6 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu } } - // Access the base quality histogram from the output_data object - uint64_t *base_quality_distribution = output_data.seq_quality_info.base_quality_distribution; - // Do QC on each record and store the results in the output_data object while ((record_count < batch_size) && (exit_code >= 0)) { // Create a record object @@ -210,11 +226,13 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // Unmapped reads if (record->core.flag & BAM_FUNMAP) { Basic_Seq_Statistics& basic_qc = output_data.unmapped_long_read_info; - this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, false); + Basic_Seq_Quality_Statistics& seq_quality_info = output_data.unmapped_seq_quality_info; + this->updateReadAndBaseCounts(record, basic_qc, seq_quality_info, 
false); } else { // Calculate base alignment statistics on non-secondary alignments Basic_Seq_Statistics& basic_qc = output_data.mapped_long_read_info; + Basic_Seq_Quality_Statistics& seq_quality_info = output_data.seq_quality_info; if (!(record->core.flag & BAM_FSECONDARY)) { // Determine if this is a forward or reverse read @@ -328,9 +346,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu break; } } - - // Update read and base QC - this->updateReadAndBaseCounts(record, basic_qc, base_quality_distribution, true); + this->updateReadAndBaseCounts(record, basic_qc, seq_quality_info, true); } else { printError("Error: Unknown alignment type with flag " + std::to_string(record->core.flag)); @@ -376,7 +392,7 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out hts_set_threads(bam_file, thread_count); // Enable multi-threading bam_hdr_t* bam_header = sam_hdr_read(bam_file); bam1_t* bam_record = bam_init1(); - int64_t num_reads = 0; + int64_t read_index = 0; // Create a random number generator and seed it with the current time unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); @@ -390,33 +406,15 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out } std::shuffle(read_indices.begin(), read_indices.end(), generator); read_indices.resize(sample_count); - - // Print first 100 read indices sorted - // std::sort(read_indices.begin(), read_indices.end()); - // std::cout << "First 100 read indices: " << std::endl; - // for (int i = 0; i < 100; i++) { - // std::cout << read_indices[i] << std::endl; - // } - - // Convert to a set for fast lookup std::unordered_set<int> read_indices_set(read_indices.begin(), read_indices.end()); - - std::cout << "Number of sampled reads = " << read_indices_set.size() << std::endl; - - // Keep track of number of modified bases on the primary alignment vs other - // alignments (secondary, supplementary, unmapped) - int 
num_modified_bases_primary = 0; - int num_modified_bases_unmapped = 0; - int num_modified_bases_secondary = 0; - int num_modified_bases_supplementary = 0; + printMessage("Number of sampled reads for base modification analysis = " + std::to_string(read_indices_set.size())); while (sam_read1(bam_file, bam_header, bam_record) >= 0) { - if (read_indices_set.find(num_reads) == read_indices_set.end()) { - num_reads++; - continue; - } - num_reads++; + // if (read_indices_set.find(read_index) == read_indices_set.end()) { + // read_index++; + // continue; + // } // Base modification tag analysis // Follow here to get base modification tags: @@ -425,11 +423,6 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out int read_length = bam_record->core.l_qseq; hts_base_mod_state *state = hts_base_mod_state_alloc(); std::vector<std::pair<int32_t, int>> c_modified_positions; // C-modified positions for CpG analysis (chr->(position, strand)) - // std::unordered_map<char, int> base_mod_counts; // Type-specific - // base modification counts for the alignment - // std::unordered_map<char, std::unordered_map<char, int>> - // base_mod_counts; // Type-specific base modification counts - // (canonical base -> modified base -> count) std::unordered_map<char, std::unordered_map<char, int>> base_mod_counts; // Type-specific base modification probabilities (canonical base -> modified base -> [read length %, probability]) std::unordered_map<char, int> base_primary_count; // Total base counts for the alignment @@ -438,23 +431,7 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out int ret = bam_parse_basemod(bam_record, state); bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); - // Update the number of reads with base modifications for the - // primary alignment vs other alignments - if (ret >= 0) { - if (is_primary) { - 
num_modified_bases_primary++; - } else if (bam_record->core.flag & BAM_FUNMAP) { - num_modified_bases_unmapped++; - } else if (bam_record->core.flag & BAM_FSECONDARY) { - num_modified_bases_secondary++; - } else if (bam_record->core.flag & BAM_FSUPPLEMENTARY) { - num_modified_bases_supplementary++; - } - } - if (ret >= 0 && is_primary) { - // bool is_primary = !(bam_record->core.flag & BAM_FSECONDARY) && !(bam_record->core.flag & BAM_FSUPPLEMENTARY) && !(bam_record->core.flag & BAM_FUNMAP); - // Get the chromosome if alignments are present bool alignments_present = true; std::string chr; @@ -513,15 +490,18 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out // Update the read length % and probability for the // modification double read_len_pct = (double) (pos + 1) / read_length; - // std::cout << "Read length %: " << read_len_pct << ", probability: " << probability << std::endl; - final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability); // Update the base modification probabilities + // std::cout << "Read length %: " << read_len_pct << ", + // probability: " << probability << std::endl; + + // Update the base modification probabilities for + // sampled reads only (10,000 maximum) + if (read_indices_set.find(read_index) != read_indices_set.end()) { + final_output.updateBaseModProbabilities(mod_type, read_len_pct, probability); // Update the base modification probabilities + } // Update counts for predictions exceeding the threshold if (probability >= base_mod_threshold) { final_output.updateBaseModCounts(mod_type, strand); // Update the base modification counts - // base_mod_counts[mod_type]++; // Update the - // type-specific count - // base_mod_counts[canonical_base_char][mod_type]++; // Update the type-specific count // Store the modified positions for later CpG // analysis if a C modification on a primary alignment @@ -536,9 +516,6 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, 
Out } } } - // } else { - // base_primary_count[mod_type]++; // Update the type-specific unmodified count - // } } } } @@ -561,61 +538,9 @@ void HTSReader::runBaseModificationAnalysis(const std::string &bam_filename, Out } hts_base_mod_state_free(state); // Deallocate the base modification state object - // Calculate the base modification rate for the read - // double read_mod_rate = 0.0; - // if (read_length > 0) { - // read_mod_rate = (double) read_mod_count / read_length; - // } - - // Calculate the type-specific base modification rates for the read - // std::unordered_map<char, double> base_mod_rates; - // for (auto const &it : base_mod_counts) { - // char canonical_base = it.first; - // std::unordered_map<char, int> mod_counts = it.second; - // double mod_rate = 0.0; - // int total_base_count = base_primary_count[canonical_base]; - - // // Calculate the modification rate for each modification type - // for (auto const &it2 : mod_counts) { - // char mod_type = it2.first; - // int mod_count = it2.second; - // double mod_rate = 0.0; - // if (mod_count + total_base_count > 0) { - // mod_rate = (double) mod_count / total_base_count; - // } - // base_mod_rates[mod_type] = mod_rate; - // } - // // for (auto const &it2 : mod_counts) { - // // total_mod_count += it2.second; - // // } - // // if (total_mod_count + total_base_count > 0) { - // // mod_rate = (double) total_mod_count / (total_mod_count + total_base_count); - // // } - // // base_mod_rates[canonical_base] = mod_rate; - // } - // for (auto const &it : base_mod_counts) { - // char mod_type = it.first; - // int mod_count = it.second; - // double mod_rate = 0.0; - // int total_base_count = base_primary_count[mod_type]; - // if (mod_count + unmod_count > 0) { - // mod_rate = (double) mod_count / (mod_count + unmod_count); - // } - // // if (read_length > 0) { - // // mod_rate = (double) mod_count / read_length; - // // } - // base_mod_rates[mod_type] = mod_rate; - // } - // 
final_output.updateReadModRate(read_length, base_mod_rates); // Update the output data + read_index++; // Update the read index } - // Summary of base modification counts - printMessage("Base modification counts:"); - printMessage("Primary alignment: " + std::to_string(num_modified_bases_primary)); - printMessage("Unmapped alignment: " + std::to_string(num_modified_bases_unmapped)); - printMessage("Secondary alignment: " + std::to_string(num_modified_bases_secondary)); - printMessage("Supplementary alignment: " + std::to_string(num_modified_bases_supplementary)); - bam_destroy1(bam_record); bam_hdr_destroy(bam_header); sam_close(bam_file); @@ -648,7 +573,6 @@ std::map<int, int> HTSReader::getQueryToRefMap(bam1_t *record) query_to_ref_map[current_query_pos] = current_ref_pos + 1; // Use 1-indexed positions current_ref_pos++; current_query_pos++; - // query_to_ref_map[current_query_pos] = current_ref_pos + 1; // Use 1-indexed positions } break; case BAM_CINS: diff --git a/src/output_data.cpp b/src/output_data.cpp index 2401847..283f458 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -305,20 +305,16 @@ void Output_BAM::updateReadModRate(int read_length, const std::unordered_map<cha std::vector<char> Output_BAM::getBaseModTypes() { - printMessage("[TEST] Getting base modification types."); std::vector<char> base_mod_types; if (this->base_mod_counts.empty()) { printError("No base modification counts found."); return base_mod_types; } - printMessage("[TEST2] Getting base modification types."); for (const auto& it : this->base_mod_counts) { base_mod_types.push_back(it.first); } - // for (auto it = this->base_mod_counts.begin(); it != this->base_mod_counts.end(); ++it) { - // base_mod_types.push_back(it->first); - // } + return base_mod_types; } @@ -382,17 +378,17 @@ double Output_BAM::getNthReadLenPct(int read_index, char mod_type) double Output_BAM::getNthReadModProb(int read_index, char mod_type) { - double mod_prob = 0.0; + double mod_prob = -1.0; try { 
this->read_pct_len_vs_mod_prob.at(mod_type); } catch (const std::out_of_range& oor) { - std::cerr << "Error: Modification probability not found for type " << mod_type << std::endl; + return mod_prob; } try { mod_prob = this->read_pct_len_vs_mod_prob[mod_type].at(read_index).second; } catch (const std::out_of_range& oor) { - std::cerr << "Error: Modification probability not found for read index " << read_index << " and type " << mod_type << std::endl; - return 0.0; + // std::cerr << "Error: Modification probability not found for read index " << read_index << " and type " << mod_type << std::endl; + return -1.0; } return mod_prob; } @@ -465,6 +461,11 @@ void Output_BAM::add(Output_BAM &output_data) this->seq_quality_info.base_quality_distribution[i] += output_data.seq_quality_info.base_quality_distribution[i]; } + // Update the read average base quality vector if it is not empty + for (int i=0; i<MAX_READ_QUALITY; i++){ + this->seq_quality_info.read_average_base_quality_distribution[i] += output_data.seq_quality_info.read_average_base_quality_distribution[i]; + } + this->num_matched_bases += output_data.num_matched_bases; this->num_mismatched_bases += output_data.num_mismatched_bases; this->num_ins_bases += output_data.num_ins_bases; @@ -686,9 +687,7 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ std::string read_name_str; std::getline( iss_header, read_name_str, ' ' ); read_name = read_name_str.c_str(); - - // Access the sequence data - std::string sequence_data_str = fq[1]; + std::string sequence_data_str = fq[1]; // Access the sequence data // Update the total number of bases int base_count = sequence_data_str.length(); @@ -714,7 +713,6 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ // Update the base quality and GC content information int gc_count = 0; - // double read_mean_base_qual = 0; double cumulative_base_prob = 0; // Read cumulative base quality probability char current_base; int 
base_quality_value; @@ -751,7 +749,6 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ // Convert the Phred quality value to a probability double base_quality_prob = pow(10, -base_quality_value / 10.0); - // read_mean_base_qual += (double)base_quality_value; cumulative_base_prob += base_quality_prob; } @@ -761,11 +758,6 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ // Convert the mean base quality probability to a Phred quality value double read_mean_base_qual = -10.0 * log10(cumulative_base_prob); - // printMessage("Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual)); - - // Calculate percent guanine & cytosine - // gc_content_pct = 100.0 *( (double)gc_count / (double)base_count ); - // Update the per-read GC content distribution double gc_content_pct = (100.0 * gc_count) / static_cast<double>(base_count); int gc_content_int = static_cast<int>(std::round(gc_content_pct)); @@ -776,16 +768,9 @@ void Output_FAST5::addReadFastq(std::vector<std::string> fq, FILE *read_details_ } // Update the per-read base quality distribution - // double read_mean_base_qual_pct = read_mean_base_qual / static_cast<double>(base_count); - // unsigned int read_mean_base_qual_int = static_cast<unsigned - // int>(std::round(read_mean_base_qual_pct)); int read_mean_base_qual_int = static_cast<int>(std::round(read_mean_base_qual)); - // printMessage("Rounded Mean Q Score for read ID " + std::string(read_name) + " is " + std::to_string(read_mean_base_qual_int)); - try { - // seq_quality_info.read_average_base_quality_distribution[read_mean_base_qual_int] - // += 1; seq_quality_info.read_quality_distribution[read_mean_base_qual_int] += 1; } catch (const std::out_of_range& oor) { printError("Warning: Base quality value " + std::to_string(read_mean_base_qual_int) + " exceeds maximum value"); diff --git a/src/plot_utils.py b/src/plot_utils.py index 472f935..da6d017 100644 --- 
a/src/plot_utils.py +++ b/src/plot_utils.py @@ -320,20 +320,20 @@ def read_gc_content_histogram(data, font_size, plot_filepaths): gc_content = np.array(data.read_gc_content_count) # Calculate the percentage of reads with a GC content of <30% - gc_content_below_30 = np.sum(gc_content[:30]) - logging.info("[TEST] Percentage of reads with GC content <30%: {}".format(gc_content_below_30 / np.sum(gc_content))) + # gc_content_below_30 = np.sum(gc_content[:30]) + # logging.info("[TEST] Percentage of reads with GC content <30%: {}".format(gc_content_below_30 / np.sum(gc_content))) - # Calculate the percentage of reads with a GC content of >70% - gc_content_above_70 = np.sum(gc_content[70:]) - logging.info("[TEST] Percentage of reads with GC content >70%: {}".format(gc_content_above_70 / np.sum(gc_content))) + # # Calculate the percentage of reads with a GC content of >70% + # gc_content_above_70 = np.sum(gc_content[70:]) + # logging.info("[TEST] Percentage of reads with GC content >70%: {}".format(gc_content_above_70 / np.sum(gc_content))) - # Calculate the percentage of reads with a GC content of <20% - gc_content_below_20 = np.sum(gc_content[:20]) - logging.info("[TEST] Percentage of reads with GC content <20%: {}".format(gc_content_below_20 / np.sum(gc_content))) + # # Calculate the percentage of reads with a GC content of <20% + # gc_content_below_20 = np.sum(gc_content[:20]) + # logging.info("[TEST] Percentage of reads with GC content <20%: {}".format(gc_content_below_20 / np.sum(gc_content))) - # Calculate the percentage of reads with a GC content of >60% - gc_content_above_60 = np.sum(gc_content[60:]) - logging.info("[TEST] Percentage of reads with GC content >60%: {}".format(gc_content_above_60 / np.sum(gc_content))) + # # Calculate the percentage of reads with a GC content of >60% + # gc_content_above_60 = np.sum(gc_content[60:]) + # logging.info("[TEST] Percentage of reads with GC content >60%: {}".format(gc_content_above_60 / np.sum(gc_content))) # Set the error 
flag if the GC content is below 20% for more than 10% of the # reads @@ -381,6 +381,8 @@ def base_quality(data, font_size, plot_filepaths): """Plot the base quality distribution.""" xd = np.arange(MAX_BASE_QUALITY) yd = np.array(data.base_quality_distribution) + xd = xd[:60] + yd = yd[:60] fig = go.Figure() customdata = np.dstack((xd, yd))[0, :, :] @@ -411,9 +413,10 @@ def read_avg_base_quality(data, font_size, plot_filepaths): """Plot the read average base quality distribution.""" xd = np.arange(MAX_READ_QUALITY) yd = np.array(data.read_average_base_quality_distribution) + xd = xd[:60] + yd = yd[:60] fig = go.Figure() fig.add_trace(go.Bar(x=xd, y=yd, marker_color='#36a5c7')) - fig.update_xaxes(ticks="outside", dtick=10, title_text='Average Base Quality', title_standoff=0) fig.update_yaxes(ticks="outside", title_text='Number of Reads', title_standoff=0) fig.update_layout(font=dict(size=PLOT_FONT_SIZE)) # Set font size @@ -524,19 +527,15 @@ def plot(output_data, para_dict, file_type): # Base quality histogram if file_type != 'FASTA' and file_type != 'FAST5s' and file_type != 'SeqTxt': seq_quality_info = output_data.seq_quality_info - - # Base quality histogram base_quality(seq_quality_info, font_size, plot_filepaths) # Read average base quality histogram - if file_type == 'FASTQ': + if file_type == 'FASTQ' or file_type == 'FAST5' or file_type == 'BAM': read_avg_base_quality(seq_quality_info, font_size, plot_filepaths) + # Plot the read alignments and base alignments if the file type is BAM if file_type == 'BAM': - # Plot read alignment QC plot_alignment_numbers(output_data, plot_filepaths) - - # Plot base alignment and error QC plot_errors(output_data, plot_filepaths) elif file_type == 'FAST5s': @@ -659,7 +658,6 @@ def plot_pod5(pod5_output, para_dict, bam_output=None): xaxis=dict(range=[0, 100]) ) fig.update_traces(marker={'size': marker_size}) - # fig.update_xaxes(title="Index") # Append the dynamic HTML object to the output structure dynamic_html = 
fig.to_html(full_html=False) @@ -1048,9 +1046,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th table_error_flag = False # Print the types of modifications - logging.info("Getting base modification types") base_mod_types = output_data.getBaseModTypes() - logging.info("[TEST] Modification types: ") if base_mod_types: logging.info("Modification types: ") for mod_type in base_mod_types: @@ -1058,18 +1054,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th logging.info("Getting base modification statistics") - # Get the read length vs. base modification rate data for each - # # modification type - # logging.info("Getting mod data size") - # read_mod_data_size = output_data.getReadModDataSize() - # logging.info("Mod data size: {}".format(read_mod_data_size)) - - # # Choose a maximum of 10,000 reads to randomly sample for the plot - # max_reads = min(read_mod_data_size, 10000) - # # read_indices = set(sample(range(read_mod_data_size), max_reads)) - # read_indices = np.random.choice(read_mod_data_size, max_reads, replace=False) - # read_length_mod_rates = {} - # Get the read length (%) vs. base modification probability data for # each sampled read sample_count = 10000 @@ -1078,8 +1062,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th for mod_type in base_mod_types: for i in range(sample_count): try: - pct = output_data.getNthReadLenPct(i, mod_type) prob = output_data.getNthReadModProb(i, mod_type) + if prob == -1: # Skip if no modifications for the read + continue + + pct = output_data.getNthReadLenPct(i, mod_type) read_len_pct.append(pct) mod_prob.append(prob) except Exception as e: @@ -1097,73 +1084,28 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th 'N': 'Amb. N', \ 'v': 'pseU'} - # Create a plot of pct read length vs. 
base modification probability for - # each modification type, as well as a histogram of the average base - # modification probability for 100 bins of the read length - - # Make a subplot of two columns for the read length vs. base - # modification probability and the histogram of the average base - # modification probability for each modification type fig = make_subplots(rows=len(base_mod_types), cols=2, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=[f"{mod_char_to_name[mod_type]} Modification Probability" for mod_type in base_mod_types]) for i, mod_type in enumerate(base_mod_types): logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}") - # Add the trace for the read length vs. base modification + # Add the trace for the read length (%) vs. base modification # probability scatter plot fig.add_trace(go.Scatter (x=read_len_pct, y=mod_prob, mode='markers', name=mod_char_to_name[mod_type], marker=dict(size=5), showlegend=False), row=i + 1, col=1) - # Print the first 50 pairs sorted by read length for debugging - # read_len_pct, mod_prob = zip(*sorted(zip(read_len_pct, mod_prob))) - # if i == 0: - # for j in range(50): - # logging.info(f"Read length: {read_len_pct[j]}, Modification probability: {mod_prob[j]}") - # Create a histogram of the base modification probabilities base_mod_prob_hist = go.Histogram(x=mod_prob, name=mod_char_to_name[mod_type], showlegend=False, nbinsx=20) fig.add_trace(base_mod_prob_hist, row=i + 1, col=2) - - # Add a bar plot of the average base modification probability for - # 100 bins of the read length - # bins = np.linspace(0, 100, 11) # 10 bins (0-10%, 10-20%, ..., 90-100%) - # bin_centers = (bins[:-1] + bins[1:]) / 2 # Bin centers for plotting - - # # Get the average probability per bin - # avg_prob_per_bin = np.zeros(10) - # bin_indices = np.digitize(read_len_pct, bins) - 1 - # for j in range(10): # Loop over bins - # bin_mask = (bin_indices == j) - # if np.any(bin_mask): - # 
avg_prob_per_bin[j] = np.mean(mod_prob[bin_mask]) - # logging.info(f"Bin {j}: {avg_prob_per_bin[j]}") - - # # Create the bar plot - - # # Print the bins and read length percentages for the first 10 reads - # # for debugging - # if i == 0: - # logging.info("Bins: {}".format(bins)) - # logging.info("Bin indices: {}".format(bin_indices[:10])) - # logging.info("Read length percentages: {}".format(read_len_pct[:10])) - - # # Create the bar plot - # fig.add_trace(go.Bar(x=bin_centers, y=avg_prob_per_bin, name=mod_char_to_name[mod_type], showlegend=False), row=i + 1, col=2) # Update the plot style fig.update_xaxes(title="Read Length (%)", row=i + 1, col=1) fig.update_yaxes(title="Modification Probability", row=i + 1, col=1) fig.update_xaxes(title="Modification Probability", row=i + 1, col=2) fig.update_yaxes(title="Frequency", row=i + 1, col=2) - # fig.update_xaxes(title="Read Length (%)", row=i + 1, col=2) - # fig.update_yaxes(title="Average Modification Probability", row=i + 1, col=2) - - # Set the range of the y-axis to 0-1 fig.update_yaxes(range=[0, 1], row=i + 1, col=1) - # fig.update_yaxes(range=[0, 1], row=i + 1, col=2) - # Update the plot layout fig.update_layout(title="Read Length vs. 
Base Modification Probability", font=dict(size=PLOT_FONT_SIZE)) # Generate the HTML @@ -1175,7 +1117,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th else: logging.warning("WARNING: No modification types found") - # Create the base modification statistics table' + # Create the base modification statistics table logging.info("Creating the base modification statistics table") table_str = "<table>\n<tbody>" row_str, row_flag = format_row("Total Unfiltered Predictions", [output_data.modified_prediction_count], 'int', None) @@ -1208,7 +1150,6 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Add the modification type data for mod_type in base_mod_types: - # mod_name = mod_char_to_name[mod_type] try: mod_name = mod_char_to_name[mod_type] except KeyError: @@ -1234,13 +1175,13 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Finish the table table_str += "\n</tbody>\n</table>" - # Add the help text - table_str += """ - <div class="help-icon"> - 💡 - <div class="tooltip">{}</div> - </div> - """.format(help_text) + # # Add the help text + # table_str += """ + # <div class="help-icon"> + # 💡 + # <div class="tooltip">{}</div> + # </div> + # """.format(help_text) # Add text below the table suggesting the user to use Modkit for more # detailed analysis on per-site modification rates @@ -1322,9 +1263,6 @@ def plot_alignment_numbers(data, plot_filepaths): # Set the error flag if primary alignments equal 0 error_flag = data.num_primary_alignment == 0 - logging.info("[TEST] Number of reverse alignments: {}".format(data.reverse_alignment)) - logging.info("[TEST] Number of forward alignments: {}".format(data.forward_alignment)) - # Create a horizontally aligned bar plot trace from the data using plotly trace = go.Bar(x=[data.num_primary_alignment, data.num_supplementary_alignment, data.num_secondary_alignment, data.num_reads_with_supplementary_alignment, 
data.num_reads_with_secondary_alignment, From c257da611a4ae2cfcb4c6c42fcb504f808d644f9 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 11:24:59 -0500 Subject: [PATCH 18/25] Reduce debug output --- src/cli.py | 2 +- src/plot_utils.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cli.py b/src/cli.py index 80b23ba..bbee8cf 100644 --- a/src/cli.py +++ b/src/cli.py @@ -258,7 +258,7 @@ def bam_module(margs): # If base modifications were found, add the base modification plots # after the first table if bam_output.sample_modified_base_count > 0: - logging.info("Base modifications found. Adding base modification plots to the HTML report.") + # logging.info("Base modifications found. Adding base modification plots to the HTML report.") qc_info_list.insert(1, "read_length_mod_rates") # Read length modification rates qc_info_list.insert(1, "base_mods") diff --git a/src/plot_utils.py b/src/plot_utils.py index da6d017..3e17554 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -1048,11 +1048,11 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th # Print the types of modifications base_mod_types = output_data.getBaseModTypes() if base_mod_types: - logging.info("Modification types: ") - for mod_type in base_mod_types: - logging.info(mod_type) + # logging.info("Modification types: ") + # for mod_type in base_mod_types: + # logging.info(mod_type) - logging.info("Getting base modification statistics") + # logging.info("Getting base modification statistics") # Get the read length (%) vs. 
base modification probability data for # each sampled read @@ -1087,7 +1087,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th fig = make_subplots(rows=len(base_mod_types), cols=2, shared_xaxes=False, shared_yaxes=False, vertical_spacing=0.1, subplot_titles=[f"{mod_char_to_name[mod_type]} Modification Probability" for mod_type in base_mod_types]) for i, mod_type in enumerate(base_mod_types): - logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}") + # logging.info(f"Creating trace for modification type: {mod_type} at row: {i + 1}") # Add the trace for the read length (%) vs. base modification # probability scatter plot @@ -1112,7 +1112,7 @@ def create_modified_base_table(output_data, plot_filepaths, base_modification_th if len(base_mod_types) > 0: plot_height = 500 * len(base_mod_types) plot_width = 700 * 2 - logging.info("Saving the read length vs. modification rates plot") + logging.info("Generating the read length vs. modification rates plot") plot_filepaths["read_length_mod_rates"]['dynamic'] = fig.to_html(full_html=False, default_height=plot_height, default_width=plot_width) else: logging.warning("WARNING: No modification types found") From 82cddc463042e3b0d95ea6b30ccb4afe3b84ca12 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 11:46:26 -0500 Subject: [PATCH 19/25] Debug compilation --- Makefile | 3 +++ conda/meta.yaml | 8 ++++---- src/hts_reader.cpp | 2 +- src/plot_utils.py | 18 +++--------------- 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 5b6392f..529dd03 100644 --- a/Makefile +++ b/Makefile @@ -14,6 +14,9 @@ all: swig_build compile swig_build: swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i +# Create the lib directory if it doesn't exist + mkdir -p $(LIB_DIR) + # Compile the C++ shared libraries into lib/ compile: 
LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \ diff --git a/conda/meta.yaml b/conda/meta.yaml index edf847d..9253229 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,14 +1,14 @@ {% set version = "1.4.0" %} -# {% set revision = "b06670513616fd6342233c1c77e6d0bcf138b3bc" %} +{% set revision = "c257da611a4ae2cfcb4c6c42fcb504f808d644f9" %} package: name: longreadsum version: {{ version }} source: - path: ../ - # git_url: https://github.com/WGLab/LongReadSum.git - # git_rev: {{ revision }} + git_url: https://github.com/WGLab/LongReadSum.git + git_rev: {{ revision }} + # path: ../ channels: - conda-forge diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 1a7e53f..70a6410 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -184,7 +184,7 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu // Set the atomic flag and print a message if the POD5 tags are // present if (!this->has_pod5_tags.test_and_set()) { - printMessage("POD5 tags found (ts, ns, mv)"); + printMessage("POD5 basecall move table tags found (ts, ns, mv)"); } // Get the ts and ns tags diff --git a/src/plot_utils.py b/src/plot_utils.py index 3e17554..c5f2bc3 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -437,27 +437,16 @@ def read_avg_base_quality(data, font_size, plot_filepaths): def plot_base_modifications(base_modifications): """Plot the base modifications per location.""" - # Get the modification types - modification_types = list(base_modifications.keys()) - # Create the figure + # Add a plot for each modification type fig = go.Figure() - - # Add a trace for each modification type + modification_types = list(base_modifications.keys()) for mod_type in modification_types: - # Get the modification data mod_data = base_modifications[mod_type] - - # Create the trace trace = go.Scattergl(x=mod_data['positions'], y=mod_data['counts'], mode='markers', name=mod_type) - - # Add the trace to the figure fig.add_trace(trace) - # Update the 
layout fig.update_layout(title='Base Modifications', xaxis_title='Position', yaxis_title='Counts', showlegend=True, font=dict(size=PLOT_FONT_SIZE)) - - # Generate the HTML html_obj = fig.to_html(full_html=False, default_height=500, default_width=700) return html_obj @@ -545,10 +534,9 @@ def plot(output_data, para_dict, file_type): def plot_pod5(pod5_output, para_dict, bam_output=None): """Plot the ONT POD5 signal data for a random sample of reads.""" + out_path = para_dict["output_folder"] plot_filepaths = getDefaultPlotFilenames() - - # Create the summary table create_pod5_table(pod5_output, plot_filepaths) # Generate the signal plots From 6d588bbb1ba19bd8d28a32eec798b1302437e144 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 12:13:40 -0500 Subject: [PATCH 20/25] revert makefile --- Makefile | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 529dd03..b2d1097 100644 --- a/Makefile +++ b/Makefile @@ -3,9 +3,11 @@ SRC_DIR := $(CURDIR)/src LIB_DIR := $(CURDIR)/lib # Set the library paths for the compiler -CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX) -LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib -INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include +# CONDA_PREFIX ?= $(shell echo $$CONDA_PREFIX) +# LIBRARY_PATHS := -L$(LIB_DIR) -L$(CONDA_PREFIX)/lib +# INCLUDE_PATHS := -I$(INCL_DIR) -I$(CONDA_PREFIX)/include +LIBRARY_PATHS := -L$(LIB_DIR) -L/usr/share/miniconda/envs/longreadsum/lib +INCLUDE_PATHS := -I$(INCL_DIR) -I/usr/share/miniconda/envs/longreadsum/include # All targets all: swig_build compile @@ -19,9 +21,11 @@ swig_build: # Compile the C++ shared libraries into lib/ compile: - LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \ + LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):/usr/share/miniconda/envs/longreadsum/lib \ CXXFLAGS="$(INCLUDE_PATHS)" LDFLAGS="$(LIBRARY_PATHS)" python3 setup.py build_ext --build-lib $(LIB_DIR) # Clean the build directory clean: $(RM) 
-r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/ + +# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \ From c6c34f1f7e2521d166a15378a27440d73d792f3f Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 12:56:12 -0500 Subject: [PATCH 21/25] Add build debug output --- conda/build.sh | 9 +++++++++ environment.yml | 2 -- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/conda/build.sh b/conda/build.sh index 95f8d11..61720bc 100644 --- a/conda/build.sh +++ b/conda/build.sh @@ -3,18 +3,27 @@ # Add the library path to the LD_LIBRARY_PATH export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${PREFIX}/lib +# Ensure the lib directory exists +mkdir -p "${SRC_DIR}"/lib + # Generate the SWIG files +echo "Generating SWIG files..." swig -c++ -python -outdir "${SRC_DIR}"/lib -I"${SRC_DIR}"/include -I"${PREFIX}"/include -o "${SRC_DIR}"/src/lrst_wrap.cpp "${SRC_DIR}"/src/lrst.i # Generate the shared library +echo "Building the shared library..." $PYTHON setup.py -I"${PREFIX}"/include -L"${PREFIX}"/lib install # Create the src directory mkdir -p "${PREFIX}"/src # Copy source files to the bin directory +echo "Copying source files..." cp -r "${SRC_DIR}"/src/*.py "${PREFIX}"/bin # Copy the SWIG generated library to the lib directory +echo "Copying SWIG generated library..." cp -r "${SRC_DIR}"/lib/*.py "${PREFIX}"/lib cp -r "${SRC_DIR}"/lib/*.so "${PREFIX}"/lib + +echo "Build complete." 
diff --git a/environment.yml b/environment.yml index a76645c..c1d3ef4 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,6 @@ channels: - bioconda - defaults - jannessp # for pod5 - - plotly # for kaleido dependencies: - python=3.9 - numpy @@ -16,4 +15,3 @@ dependencies: - pytest - pod5 - pyarrow - - python-kaleido From da5cb8d06ba0e8e9b8551c37de8e5cf292e3d399 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 13:09:07 -0500 Subject: [PATCH 22/25] add gh actions verbose output --- .github/workflows/build-test.yml | 2 +- environment.yml | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 9694194..6c61557 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -35,7 +35,7 @@ jobs: - name: Build LongReadSum shell: bash --login {0} # --login enables PATH variable access - run: make + run: make -d - name: Run tests shell: bash --login {0} diff --git a/environment.yml b/environment.yml index c1d3ef4..2cc96c0 100644 --- a/environment.yml +++ b/environment.yml @@ -1,11 +1,11 @@ name: longreadsum channels: - conda-forge + - jannessp # for pod5 - bioconda - defaults - - jannessp # for pod5 dependencies: - - python=3.9 + - python - numpy - hdf5 - ont_vbz_hdf_plugin @@ -13,5 +13,5 @@ dependencies: - swig - plotly - pytest - - pod5 + - jannessp::pod5 - pyarrow From 906c70098115715e0ff12717f4ed915f86e965f1 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 13:18:01 -0500 Subject: [PATCH 23/25] update makefile --- Makefile | 6 +----- environment.yml | 10 +++++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index b2d1097..1c23e60 100644 --- a/Makefile +++ b/Makefile @@ -14,10 +14,8 @@ all: swig_build compile # Generate the SWIG Python/C++ wrappers swig_build: - swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o 
$(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i - -# Create the lib directory if it doesn't exist mkdir -p $(LIB_DIR) + swig -c++ -python -outdir $(LIB_DIR) -I$(INCL_DIR) -o $(SRC_DIR)/lrst_wrap.cpp $(SRC_DIR)/lrst.i # Compile the C++ shared libraries into lib/ compile: @@ -27,5 +25,3 @@ compile: # Clean the build directory clean: $(RM) -r $(LIB_DIR)/*.so $(LIB_DIR)/*.py $(SRC_DIR)/lrst_wrap.cpp build/ - -# LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(CONDA_PREFIX)/lib \ diff --git a/environment.yml b/environment.yml index 2cc96c0..91b434e 100644 --- a/environment.yml +++ b/environment.yml @@ -1,15 +1,15 @@ name: longreadsum channels: - conda-forge - - jannessp # for pod5 - - bioconda - defaults + - bioconda # for htslib + - jannessp # for pod5 dependencies: - python - numpy - - hdf5 - - ont_vbz_hdf_plugin - - htslib=1.20 + - hdf5=1.10.6 + - bioconda::ont_vbz_hdf_plugin + - bioconda::htslib=1.20 - swig - plotly - pytest From be0c09c09f1e470ef132b6b073d38e2f5523dc30 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 18:29:47 -0500 Subject: [PATCH 24/25] Fix conda build environment --- conda/meta.yaml | 7 +++---- environment.yml | 8 ++++---- setup.py | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index 9253229..e1d7004 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -29,18 +29,17 @@ requirements: host: - python=3.9 - swig - - hdf5 - htslib=1.20 + - ont_vbz_hdf_plugin # Contains HDF5 as a dependency as well # - jannessp::pod5 # - jannessp::lib-pod5 run: - python=3.9 - numpy - - hdf5 - ont_vbz_hdf_plugin - - htslib=1.20 + - bioconda::htslib=1.20 - plotly - - janessp::pod5 + - jannessp::pod5 - pyarrow # - janessp::lib-pod5 diff --git a/environment.yml b/environment.yml index 91b434e..b18d99e 100644 --- a/environment.yml +++ b/environment.yml @@ -1,14 +1,14 @@ name: longreadsum channels: - conda-forge - - defaults - - bioconda # for htslib - jannessp # for pod5 + - bioconda + - 
defaults + dependencies: - python - numpy - - hdf5=1.10.6 - - bioconda::ont_vbz_hdf_plugin + - ont_vbz_hdf_plugin - bioconda::htslib=1.20 - swig - plotly diff --git a/setup.py b/setup.py index 3c05b69..f136db5 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ # Set up the module setup(name="longreadsum", - version='1.4.0', + version='1.5.0', author="WGLab", description="""A fast and flexible QC tool for long read sequencing data""", ext_modules=[lrst_mod], From 3e091bb8f9d4cbf9fde79b6d48a21c8b041abea1 Mon Sep 17 00:00:00 2001 From: jonperdomo <jonperdomodb@gmail.com> Date: Tue, 21 Jan 2025 18:54:27 -0500 Subject: [PATCH 25/25] Update build commit --- conda/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/meta.yaml b/conda/meta.yaml index e1d7004..ded0040 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ -{% set version = "1.4.0" %} -{% set revision = "c257da611a4ae2cfcb4c6c42fcb504f808d644f9" %} +{% set version = "1.5.0" %} +{% set revision = "47f1310e02ee06f32b8e34417e207f245828a319" %} package: name: longreadsum