diff --git a/include/output_data.h b/include/output_data.h index 1b5eba3..1552d6d 100644 --- a/include/output_data.h +++ b/include/output_data.h @@ -140,15 +140,14 @@ class Base_Signals class Base_Move_Table { public: - std::string read_name; std::string sequence_data_str; // Sequence of bases std::vector base_signal_index; // 2D vector of signal indices for each base // Methods - std::string getReadName(); std::string getSequenceString(); std::vector getBaseSignalIndex(); - Base_Move_Table(std::string read_name, std::string sequence_data_str, std::vector base_signal_index); + Base_Move_Table(std::string sequence_data_str, std::vector base_signal_index); + Base_Move_Table(); }; @@ -205,7 +204,9 @@ class Output_BAM : public Output_FQ // Signal data section int read_count; int base_count; - std::vector read_move_table; + // std::vector read_move_table; + std::unordered_map read_move_table; + // std::map read_move_table = {}; // POD5 signal-level information is stored in a map of read names to a map of // reference positions to a tuple of (ts, ns, move table vector) @@ -227,9 +228,8 @@ class Output_BAM : public Output_FQ int getReadCount(); void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector move_table); // void addReadBaseSignals(Base_Signals values); - std::vector getNthReadMoveTable(int read_index); - std::string getNthReadSequence(int read_index); - std::string getNthReadName(int read_index); + std::vector getReadMoveTable(std::string read_id); + std::string getReadSequence(std::string read_id); // Add a batch of records to the output void add(Output_BAM &t_output_bam); diff --git a/src/hts_reader.cpp b/src/hts_reader.cpp index 1859ada..b7518f4 100644 --- a/src/hts_reader.cpp +++ b/src/hts_reader.cpp @@ -181,11 +181,18 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu for (int i = 0; i < record->core.l_qseq; i++) { seq_str += seq_nt16_str[bam_seqi(bam_get_seq(record), i)]; } + + // Throw an error if the query name is empty + if (query_name.empty()) { + std::cerr << "Error: Query name is empty" << std::endl; + exit_code = 1; + break; + } output_data.addReadMoveTable(query_name, seq_str, signal_index_vector); // if (first_pod5_tag) { - // printMessage("Signal vector length: " \ - // + std::to_string(signal_index_vector.size()) + ", Sequence string length: " \ + // printMessage("Signal vector length: " + // + std::to_string(signal_index_vector.size()) + ", Sequence string length: " // + std::to_string(seq_str.length())); // // printMessage("Base signal length: " + std::to_string(base_signal_length) + ", Sequence string length: " + std::to_string(seq_str.length())); diff --git a/src/output_data.cpp b/src/output_data.cpp index 5a20aa3..8d44b06 100644 --- a/src/output_data.cpp +++ b/src/output_data.cpp @@ -297,34 +297,65 @@ void Output_BAM::add_modification(std::string chr, int32_t ref_pos, char mod_typ int Output_BAM::getReadCount() { - return this->read_count; + // return this->read_count; + return this->read_move_table.size(); } void Output_BAM::addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector signal_index) { - Base_Move_Table values(read_name, sequence_data_str, signal_index); - this->read_move_table.push_back(values); - this->read_count++; + Base_Move_Table values(sequence_data_str, signal_index); + + // Add the read name to the move table + this->read_move_table[read_name] = values; + // this->read_move_table[read_name] = values; + // try { + // this->read_move_table.at(read_name); + // } catch (const std::out_of_range& oor) { + // this->read_move_table[read_name] = values; + // } + // this->read_move_table[read_name] = values; + // Base_Move_Table values(read_name, sequence_data_str, signal_index); + // this->read_move_table.push_back(values); + // this->read_count++; } -std::vector Output_BAM::getNthReadMoveTable(int read_index) +std::vector Output_BAM::getReadMoveTable(std::string read_id) { - return this->read_move_table[read_index].getBaseSignalIndex(); + try { + this->read_move_table.at(read_id); + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Read name " << read_id << " is not in the move table." << std::endl; + } + return this->read_move_table[read_id].getBaseSignalIndex(); + // try { + // this->read_move_table.at(read_index); + // } catch (const std::out_of_range& oor) { + // std::cerr << "Error: Move table read index " << read_index << " is out of range." << std::endl; + // } + // return this->read_move_table[read_index].getBaseSignalIndex(); } // Get the Nth read's sequence string -std::string Output_BAM::getNthReadSequence(int read_index){ - Base_Move_Table signal_data = this->read_move_table[read_index]; +std::string Output_BAM::getReadSequence(std::string read_id) +{ + try { + this->read_move_table.at(read_id); + } catch (const std::out_of_range& oor) { + std::cerr << "Error: Read name " << read_id << " is not in the move table." << std::endl; + } + + Base_Move_Table signal_data = this->read_move_table[read_id]; + // try { + // this->read_move_table.at(read_index); + // } catch (const std::out_of_range& oor) { + // std::cerr << "Error: Sequence read index " << read_index << " is out of range." << std::endl; + // } + // Base_Move_Table signal_data = this->read_move_table[read_index]; std::string sequence_str(signal_data.getSequenceString()); return sequence_str; } -std::string Output_BAM::getNthReadName(int read_index) -{ - return this->read_move_table[read_index].getReadName(); -} - std::map> Output_BAM::get_modifications() { return this->base_modifications; @@ -383,10 +414,24 @@ void Output_BAM::add(Output_BAM &output_data) // Add move table data std::cout << "Adding move table data" << std::endl; - std::cout << "Output data read count: " << output_data.getReadCount() << std::endl; - for (int i=0; iaddReadMoveTable(output_data.getNthReadName(i), output_data.getNthReadSequence(i), output_data.getNthReadMoveTable(i)); + // Update the map + for ( auto it = output_data.read_move_table.begin(); it != output_data.read_move_table.end(); ++it ){ + std::string read_id = it->first; + std::vector signal_index = it->second.getBaseSignalIndex(); + std::string sequence_data_str = it->second.getSequenceString(); + this->addReadMoveTable(read_id, sequence_data_str, signal_index); } + // std::cout << "Output data read count: " << output_data.getReadCount() << std::endl; + // for (int i=0; i < output_data.getReadCount(); i++) { + // std::cout << "Read name: " << output_data.getNthReadName(i) << std::endl; + // std::cout << "Read sequence " << std::endl; + // output_data.getNthReadSequence(i); + // std::cout << "Read signal index " << std::endl; + // output_data.getNthReadMoveTable(i); + // std::cout << "Adding read move table data" << std::endl; + // // this->addReadMoveTable(output_data.getNthReadName(i), output_data.getNthReadSequence(i), output_data.getNthReadMoveTable(i)); + // } + std::cout << "Move table data added" << std::endl; } void Output_BAM::global_sum(){ @@ -725,10 +770,10 @@ std::vector Output_FAST5::getNthReadKurtosis(int read_index){ return output; } -std::string Base_Move_Table::getReadName() -{ - return this->read_name; -} +// std::string Base_Move_Table::getReadName() +// { +// return this->read_name; +// } std::string Base_Move_Table::getSequenceString() { @@ -740,9 +785,13 @@ std::vector Base_Move_Table::getBaseSignalIndex() return this->base_signal_index; } -Base_Move_Table::Base_Move_Table(std::string read_name, std::string sequence_data_str, std::vector base_signal_index) +Base_Move_Table::Base_Move_Table(std::string sequence_data_str, std::vector base_signal_index) { - this->read_name = read_name; + // this->read_name = read_name; this->sequence_data_str = sequence_data_str; this->base_signal_index = base_signal_index; } + +Base_Move_Table::Base_Move_Table() +{ +} diff --git a/src/plot_utils.py b/src/plot_utils.py index 1e2882e..f88f940 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -468,19 +468,17 @@ def plot_pod5(pod5_output, para_dict, bam_output=None): # Get the basecall read data if bam_output: - logging.info("Getting number of basecall reads") - read_count = bam_output.getReadCount() - logging.info("Total number of reads: {}".format(read_count)) - logging.info("Getting basecall data for read {}".format(nth_read_name)) - bc_nth_read_sequence = bam_output.getNthReadSequence(read_index) - logging.info("Basecall sequence for read {}: {}".format(nth_read_name, bc_nth_read_sequence[:10])) - bc_nth_read_index = bam_output.getNthReadMoveTable(read_index) - logging.info("Basecall index for read {}: {}".format(nth_read_name, bc_nth_read_index)) - bc_nth_read_name = bam_output.getNthReadName(read_index) - - # Throw an error if read names don't match - if bc_nth_read_name != nth_read_name: - raise ValueError("Read names don't match between the signal and basecall data: {} vs {}".format(nth_read_name, bc_nth_read_name)) + move_table = bam_output.getReadMoveTable(nth_read_name) + read_sequence = bam_output.getReadSequence(nth_read_name) + + # Print the first couple of indices from the table. + # Each index in the move table represents a k-mer move. Thus, for + # each base, the signal is between two indices in the move table, starting + # from the first index. + logging.info("Move table for read {}: {}".format(nth_read_name, move_table[:5])) + logging.info("Read sequence for read {}: {}".format(nth_read_name, read_sequence[:5])) + logging.info("Read sequence length for read {}: {}".format(nth_read_name, len(read_sequence))) + logging.info("Signal data length for read {}: {}".format(nth_read_name, len(move_table))) # Set up the output CSV csv_qc_filepath = os.path.join(out_path, nth_read_name + '_QC.csv')