diff --git a/pybwa/libbwaindex.pyx b/pybwa/libbwaindex.pyx index 5c830d9..9e58f76 100644 --- a/pybwa/libbwaindex.pyx +++ b/pybwa/libbwaindex.pyx @@ -3,7 +3,7 @@ from pathlib import Path from cpython cimport PyBytes_Check, PyUnicode_Check -from pysam import AlignmentFile, FastxFile +from pysam import AlignmentFile, FastxFile, samtools import enum @@ -96,22 +96,45 @@ cdef class BwaIndex: # Build the sequence dictionary dict_fn = Path(prefix.with_suffix(".dict")) - with FastxFile(filename=f"{fasta}") as reader, dict_fn.open("w") as writer: - writer.write("@HD\tVN:1.5\tSO:unsorted\n") - for rec in reader: - writer.write(f"@SQ\tSN:{rec.name}\tLN:{len(rec.sequence)}\n") + samtools.dict("-o", f"{dict_fn}", f"{fasta}") cdef _load_index(self, prefix, mode): - prefix = bwa_idx_infer_prefix(force_bytes(prefix)) - if not prefix: - raise Exception(f"Could not find the index at: {prefix}") - - self._delegate = bwa_idx_load(prefix, mode) - - # the SAM header from the sequence dictionary - seq_dict = Path(prefix.decode("utf-8")).with_suffix(".dict") - # TODO: error message when seq_dict is missing? + # infer the prefix from the hint + prefix_char_ptr = bwa_idx_infer_prefix(force_bytes(prefix)) + if not prefix_char_ptr: + raise FileNotFoundError(f"could not locate the index [prefix]: {prefix}") + + # the path to the inferred prefix + prefix_path: Path = Path(prefix_char_ptr.decode("utf-8")) + + # the path to the sequence dictionary + seq_dict = prefix_path.with_suffix(".dict") + + def assert_path_suffix_exists(suffix: str) -> None: + new_path = prefix_path.with_suffix(prefix_path.suffix + suffix) + if not new_path.exists(): + raise FileNotFoundError(f"could not locate the index file [{suffix}]: {new_path}") + + # Check that all index files exist + if mode & BWA_IDX_BWT == BWA_IDX_BWT: + assert_path_suffix_exists(".bwt") + assert_path_suffix_exists(".sa") + if mode & BWA_IDX_BNS == BWA_IDX_BNS: + assert_path_suffix_exists(".ann") + assert_path_suffix_exists(".amb") + assert_path_suffix_exists(".pac") + if mode & BWA_IDX_PAC == BWA_IDX_PAC: + assert_path_suffix_exists(".pac") + if not seq_dict.exists(): + raise FileNotFoundError( + f"could not locate the sequence dictionary [use `samtools dict`]: {seq_dict}" + ) + + # load the index + self._delegate = bwa_idx_load(prefix_char_ptr, mode) + + # load the SAM header from the sequence dictionary with seq_dict.open("r") as fh: with AlignmentFile(fh) as reader: self.header = reader.header diff --git a/tests/test_bwapy.py b/tests/test_bwapy.py index 182de8d..3557991 100644 --- a/tests/test_bwapy.py +++ b/tests/test_bwapy.py @@ -25,6 +25,12 @@ def test_bwa_index_build(ref_fasta: Path, tmp_path_factory: pytest.TempPathFacto # Build the index BwaIndex.index(fasta=ref_fasta, prefix=prefix) + + # Check the files exist + for suffix in [".bwt", ".sa", ".ann", ".pac"]: + assert prefix.with_suffix(prefix.suffix + suffix).exists() + assert prefix.with_suffix(".dict").exists() + # Load it BwaIndex(prefix=prefix)