Skip to content

Commit

Permalink
Work on pod5
Browse files Browse the repository at this point in the history
  • Loading branch information
jonperdomo committed Jul 24, 2024
1 parent c7a16f6 commit 562c2ed
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 44 deletions.
14 changes: 7 additions & 7 deletions include/output_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,14 @@ class Base_Signals
class Base_Move_Table
{
public:
std::string read_name;
std::string sequence_data_str; // Sequence of bases
std::vector<int> base_signal_index; // 2D vector of signal indices for each base

// Methods
std::string getReadName();
std::string getSequenceString();
std::vector<int> getBaseSignalIndex();
Base_Move_Table(std::string read_name, std::string sequence_data_str, std::vector<int> base_signal_index);
Base_Move_Table(std::string sequence_data_str, std::vector<int> base_signal_index);
Base_Move_Table();
};


Expand Down Expand Up @@ -205,7 +204,9 @@ class Output_BAM : public Output_FQ
// Signal data section
int read_count;
int base_count;
std::vector<Base_Move_Table> read_move_table;
// std::vector<Base_Move_Table> read_move_table;
std::unordered_map<std::string, Base_Move_Table> read_move_table;
// std::map<std::string, Base_Move_Table> read_move_table = {};

// POD5 signal-level information is stored in a map of read names to a map of
// reference positions to a tuple of (ts, ns, move table vector)
Expand All @@ -227,9 +228,8 @@ class Output_BAM : public Output_FQ
int getReadCount();
void addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> move_table);
// void addReadBaseSignals(Base_Signals values);
std::vector<int> getNthReadMoveTable(int read_index);
std::string getNthReadSequence(int read_index);
std::string getNthReadName(int read_index);
std::vector<int> getReadMoveTable(std::string read_id);
std::string getReadSequence(std::string read_id);

// Add a batch of records to the output
void add(Output_BAM &t_output_bam);
Expand Down
11 changes: 9 additions & 2 deletions src/hts_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,18 @@ int HTSReader::readNextRecords(int batch_size, Output_BAM & output_data, std::mu
for (int i = 0; i < record->core.l_qseq; i++) {
seq_str += seq_nt16_str[bam_seqi(bam_get_seq(record), i)];
}

// Throw an error if the query name is empty
if (query_name.empty()) {
std::cerr << "Error: Query name is empty" << std::endl;
exit_code = 1;
break;
}
output_data.addReadMoveTable(query_name, seq_str, signal_index_vector);

// if (first_pod5_tag) {
// printMessage("Signal vector length: " \
// + std::to_string(signal_index_vector.size()) + ", Sequence string length: " \
// printMessage("Signal vector length: "
// + std::to_string(signal_index_vector.size()) + ", Sequence string length: "
// + std::to_string(seq_str.length()));
// // printMessage("Base signal length: " + std::to_string(base_signal_length) + ", Sequence string length: " + std::to_string(seq_str.length()));

Expand Down
93 changes: 71 additions & 22 deletions src/output_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,34 +297,65 @@ void Output_BAM::add_modification(std::string chr, int32_t ref_pos, char mod_typ

int Output_BAM::getReadCount()
{
return this->read_count;
// return this->read_count;
return this->read_move_table.size();
}

void Output_BAM::addReadMoveTable(std::string read_name, std::string sequence_data_str, std::vector<int> signal_index)
{
Base_Move_Table values(read_name, sequence_data_str, signal_index);
this->read_move_table.push_back(values);
this->read_count++;
Base_Move_Table values(sequence_data_str, signal_index);

// Add the read name to the move table
this->read_move_table[read_name] = values;

// this->read_move_table[read_name] = values;
// try {
// this->read_move_table.at(read_name);
// } catch (const std::out_of_range& oor) {
// this->read_move_table[read_name] = values;
// }
// this->read_move_table[read_name] = values;
// Base_Move_Table values(read_name, sequence_data_str, signal_index);
// this->read_move_table.push_back(values);
// this->read_count++;
}

std::vector<int> Output_BAM::getNthReadMoveTable(int read_index)
std::vector<int> Output_BAM::getReadMoveTable(std::string read_id)
{
return this->read_move_table[read_index].getBaseSignalIndex();
try {
this->read_move_table.at(read_id);
} catch (const std::out_of_range& oor) {
std::cerr << "Error: Read name " << read_id << " is not in the move table." << std::endl;
}
return this->read_move_table[read_id].getBaseSignalIndex();
// try {
// this->read_move_table.at(read_index);
// } catch (const std::out_of_range& oor) {
// std::cerr << "Error: Move table read index " << read_index << " is out of range." << std::endl;
// }
// return this->read_move_table[read_index].getBaseSignalIndex();
}

// Get the Nth read's sequence string
std::string Output_BAM::getNthReadSequence(int read_index){
Base_Move_Table signal_data = this->read_move_table[read_index];
std::string Output_BAM::getReadSequence(std::string read_id)
{
try {
this->read_move_table.at(read_id);
} catch (const std::out_of_range& oor) {
std::cerr << "Error: Read name " << read_id << " is not in the move table." << std::endl;
}

Base_Move_Table signal_data = this->read_move_table[read_id];
// try {
// this->read_move_table.at(read_index);
// } catch (const std::out_of_range& oor) {
// std::cerr << "Error: Sequence read index " << read_index << " is out of range." << std::endl;
// }
// Base_Move_Table signal_data = this->read_move_table[read_index];
std::string sequence_str(signal_data.getSequenceString());
return sequence_str;
}

std::string Output_BAM::getNthReadName(int read_index)
{
return this->read_move_table[read_index].getReadName();
}

std::map<std::string, std::map<int32_t, Base_Modification>> Output_BAM::get_modifications()
{
return this->base_modifications;
Expand Down Expand Up @@ -383,10 +414,24 @@ void Output_BAM::add(Output_BAM &output_data)

// Add move table data
std::cout << "Adding move table data" << std::endl;
std::cout << "Output data read count: " << output_data.getReadCount() << std::endl;
for (int i=0; i<output_data.getReadCount(); i++){
this->addReadMoveTable(output_data.getNthReadName(i), output_data.getNthReadSequence(i), output_data.getNthReadMoveTable(i));
// Update the map
for ( auto it = output_data.read_move_table.begin(); it != output_data.read_move_table.end(); ++it ){
std::string read_id = it->first;
std::vector<int> signal_index = it->second.getBaseSignalIndex();
std::string sequence_data_str = it->second.getSequenceString();
this->addReadMoveTable(read_id, sequence_data_str, signal_index);
}
// std::cout << "Output data read count: " << output_data.getReadCount() << std::endl;
// for (int i=0; i < output_data.getReadCount(); i++) {
// std::cout << "Read name: " << output_data.getNthReadName(i) << std::endl;
// std::cout << "Read sequence " << std::endl;
// output_data.getNthReadSequence(i);
// std::cout << "Read signal index " << std::endl;
// output_data.getNthReadMoveTable(i);
// std::cout << "Adding read move table data" << std::endl;
// // this->addReadMoveTable(output_data.getNthReadName(i), output_data.getNthReadSequence(i), output_data.getNthReadMoveTable(i));
// }
std::cout << "Move table data added" << std::endl;
}

void Output_BAM::global_sum(){
Expand Down Expand Up @@ -725,10 +770,10 @@ std::vector<double> Output_FAST5::getNthReadKurtosis(int read_index){
return output;
}

std::string Base_Move_Table::getReadName()
{
return this->read_name;
}
// std::string Base_Move_Table::getReadName()
// {
// return this->read_name;
// }

std::string Base_Move_Table::getSequenceString()
{
Expand All @@ -740,9 +785,13 @@ std::vector<int> Base_Move_Table::getBaseSignalIndex()
return this->base_signal_index;
}

Base_Move_Table::Base_Move_Table(std::string read_name, std::string sequence_data_str, std::vector<int> base_signal_index)
Base_Move_Table::Base_Move_Table(std::string sequence_data_str, std::vector<int> base_signal_index)
{
this->read_name = read_name;
// this->read_name = read_name;
this->sequence_data_str = sequence_data_str;
this->base_signal_index = base_signal_index;
}

Base_Move_Table::Base_Move_Table()
{
}
24 changes: 11 additions & 13 deletions src/plot_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,19 +468,17 @@ def plot_pod5(pod5_output, para_dict, bam_output=None):

# Get the basecall read data
if bam_output:
logging.info("Getting number of basecall reads")
read_count = bam_output.getReadCount()
logging.info("Total number of reads: {}".format(read_count))
logging.info("Getting basecall data for read {}".format(nth_read_name))
bc_nth_read_sequence = bam_output.getNthReadSequence(read_index)
logging.info("Basecall sequence for read {}: {}".format(nth_read_name, bc_nth_read_sequence[:10]))
bc_nth_read_index = bam_output.getNthReadMoveTable(read_index)
logging.info("Basecall index for read {}: {}".format(nth_read_name, bc_nth_read_index))
bc_nth_read_name = bam_output.getNthReadName(read_index)

# Throw an error if read names don't match
if bc_nth_read_name != nth_read_name:
raise ValueError("Read names don't match between the signal and basecall data: {} vs {}".format(nth_read_name, bc_nth_read_name))
move_table = bam_output.getReadMoveTable(nth_read_name)
read_sequence = bam_output.getReadSequence(nth_read_name)

# Print the first couple of indices from the table.
# Each index in the move table represents a k-mer move. Thus, for
# each base, the signal is between two indices in the move table, starting
# from the first index.
logging.info("Move table for read {}: {}".format(nth_read_name, move_table[:5]))
logging.info("Read sequence for read {}: {}".format(nth_read_name, read_sequence[:5]))
logging.info("Read sequence length for read {}: {}".format(nth_read_name, len(read_sequence)))
logging.info("Signal data length for read {}: {}".format(nth_read_name, len(move_table)))

# Set up the output CSV
csv_qc_filepath = os.path.join(out_path, nth_read_name + '_QC.csv')
Expand Down

0 comments on commit 562c2ed

Please sign in to comment.