From 53591d36101163c60a53c266a9d3880600f4c99a Mon Sep 17 00:00:00 2001 From: Alex Sickler Date: Fri, 12 Jan 2024 14:39:29 -0500 Subject: [PATCH 1/2] :heavy_plus_sign: regenerate requirements.txt --- requirements.txt | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 96ae92b..df12df2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,12 @@ -samtools==1.15.1 \ No newline at end of file +certifi==2023.11.17 +charset-normalizer==3.3.2 +idna==3.6 +numpy==1.24.4 +pandas==2.0.3 +pysam==0.22.0 +python-dateutil==2.8.2 +pytz==2023.3.post1 +requests==2.31.0 +six==1.16.0 +tzdata==2023.4 +urllib3==2.1.0 From 0821cc60272bd13186c152ced74bc99d45471ec5 Mon Sep 17 00:00:00 2001 From: Alex Sickler Date: Fri, 12 Jan 2024 16:10:35 -0500 Subject: [PATCH 2/2] :heavy_minus_sign: remove samtools requirement and use python library pysam instead --- .../modules/validation/check_readgroup.py | 41 +++++++++---------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/d3b_dff_cli/modules/validation/check_readgroup.py b/d3b_dff_cli/modules/validation/check_readgroup.py index cb28c84..f80215e 100644 --- a/d3b_dff_cli/modules/validation/check_readgroup.py +++ b/d3b_dff_cli/modules/validation/check_readgroup.py @@ -1,45 +1,42 @@ #!/usr/bin/env python import os -import sys -import subprocess import argparse - -def check_samtools(): - try: - subprocess.run(['samtools', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True) - except subprocess.CalledProcessError as e: - print("Error: 'samtools' not found. Please install samtools.") - sys.exit(1) +import pysam def check_rg(file_path): + """Check if the BAM header contains an RG line.""" try: - # Check if the BAM header contains an RG line - header_result = subprocess.run(['samtools', 'view', '-H', file_path], stdout=subprocess.PIPE, check=True) - header_lines = header_result.stdout.decode().split('\n') - if not any(line.startswith('@RG') for line in header_lines): - sample_result = subprocess.run(['samtools', 'view', file_path], stdout=subprocess.PIPE, check=True) - sample_lines = sample_result.stdout.decode().split('\n')[:5000] - if not any('RG:' in line for line in sample_lines): - print(f"Error: No @RG found in the header or the first 5k reads of {file_path}.") - except subprocess.CalledProcessError as e: + bam = pysam.AlignmentFile(file_path, "rb") + header = bam.header.to_dict() + if not "RG" in header: + # search first 5k lines, and see if any start with @RG + sample_lines = bam.head(5000) + if not any("RG:" in line.to_string() for line in sample_lines): + print( + f"Error: No @RG found in the header or the first 5k reads of {file_path}." + ) + except Exception as e: print(f"Error processing {file_path}: {e}") + def main(args): - # Check if samtools is available before processing arguments - check_samtools() for bam_file in args.bam_files: # Check if bam format if not bam_file.endswith((".bam", ".BAM")): print(f"Error: Bam file {bam_file} must end with '.bam' or '.BAM'") continue - + if not os.path.exists(bam_file): print(f"File not found: {bam_file}") else: + #deprecated_check_rg(bam_file) check_rg(bam_file) + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check the presence of @RG information in the header or the first 5k reads of BAM files.") + parser = argparse.ArgumentParser( + description="Check the presence of @RG information in the header or the first 5k reads of BAM files." + ) parser.add_argument("bam_files", nargs="+", help="One or more BAM files to check.") args = parser.parse_args() main(args)