Skip to content

Commit

Permalink
➖ remove samtools requirement and use python library pysam instead
Browse files Browse the repository at this point in the history
  • Loading branch information
sickler-alex committed Jan 12, 2024
1 parent 53591d3 commit 0821cc6
Showing 1 changed file with 19 additions and 22 deletions.
41 changes: 19 additions & 22 deletions d3b_dff_cli/modules/validation/check_readgroup.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,42 @@
#!/usr/bin/env python
import os
import sys
import subprocess
import argparse

def check_samtools():
try:
subprocess.run(['samtools', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
except subprocess.CalledProcessError as e:
print("Error: 'samtools' not found. Please install samtools.")
sys.exit(1)
import pysam

def check_rg(file_path):
"""Check if the BAM header contains an RG line."""
try:
# Check if the BAM header contains an RG line
header_result = subprocess.run(['samtools', 'view', '-H', file_path], stdout=subprocess.PIPE, check=True)
header_lines = header_result.stdout.decode().split('\n')
if not any(line.startswith('@RG') for line in header_lines):
sample_result = subprocess.run(['samtools', 'view', file_path], stdout=subprocess.PIPE, check=True)
sample_lines = sample_result.stdout.decode().split('\n')[:5000]
if not any('RG:' in line for line in sample_lines):
print(f"Error: No @RG found in the header or the first 5k reads of {file_path}.")
except subprocess.CalledProcessError as e:
bam = pysam.AlignmentFile(file_path, "rb")
header = bam.header.to_dict()
if not "RG" in header:
# search first 5k lines, and see if any start with @RG
sample_lines = bam.head(5000)
if not any("RG:" in line.to_string() for line in sample_lines):
print(
f"Error: No @RG found in the header or the first 5k reads of {file_path}."
)
except Exception as e:
print(f"Error processing {file_path}: {e}")


def main(args):
# Check if samtools is available before processing arguments
check_samtools()
for bam_file in args.bam_files:
# Check if bam format
if not bam_file.endswith((".bam", ".BAM")):
print(f"Error: Bam file {bam_file} must end with '.bam' or '.BAM'")
continue

if not os.path.exists(bam_file):
print(f"File not found: {bam_file}")
else:
#deprecated_check_rg(bam_file)
check_rg(bam_file)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Check the presence of @RG information in the header or the first 5k reads of BAM files.")
parser = argparse.ArgumentParser(
description="Check the presence of @RG information in the header or the first 5k reads of BAM files."
)
parser.add_argument("bam_files", nargs="+", help="One or more BAM files to check.")
args = parser.parse_args()
main(args)

0 comments on commit 0821cc6

Please sign in to comment.