Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/use pysam #1

Merged
merged 2 commits into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 19 additions & 22 deletions d3b_dff_cli/modules/validation/check_readgroup.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,42 @@
#!/usr/bin/env python
import os
import sys
import subprocess
import argparse

def check_samtools():
try:
subprocess.run(['samtools', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
except subprocess.CalledProcessError as e:
print("Error: 'samtools' not found. Please install samtools.")
sys.exit(1)
import pysam

def check_rg(file_path):
"""Check if the BAM header contains an RG line."""
try:
# Check if the BAM header contains an RG line
header_result = subprocess.run(['samtools', 'view', '-H', file_path], stdout=subprocess.PIPE, check=True)
header_lines = header_result.stdout.decode().split('\n')
if not any(line.startswith('@RG') for line in header_lines):
sample_result = subprocess.run(['samtools', 'view', file_path], stdout=subprocess.PIPE, check=True)
sample_lines = sample_result.stdout.decode().split('\n')[:5000]
if not any('RG:' in line for line in sample_lines):
print(f"Error: No @RG found in the header or the first 5k reads of {file_path}.")
except subprocess.CalledProcessError as e:
bam = pysam.AlignmentFile(file_path, "rb")
header = bam.header.to_dict()
if not "RG" in header:
# search first 5k lines, and see if any start with @RG
sample_lines = bam.head(5000)
if not any("RG:" in line.to_string() for line in sample_lines):
print(
f"Error: No @RG found in the header or the first 5k reads of {file_path}."
)
except Exception as e:
print(f"Error processing {file_path}: {e}")


def main(args):
# Check if samtools is available before processing arguments
check_samtools()
for bam_file in args.bam_files:
# Check if bam format
if not bam_file.endswith((".bam", ".BAM")):
print(f"Error: Bam file {bam_file} must end with '.bam' or '.BAM'")
continue

if not os.path.exists(bam_file):
print(f"File not found: {bam_file}")
else:
#deprecated_check_rg(bam_file)
check_rg(bam_file)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Check the presence of @RG information in the header or the first 5k reads of BAM files.")
parser = argparse.ArgumentParser(
description="Check the presence of @RG information in the header or the first 5k reads of BAM files."
)
parser.add_argument("bam_files", nargs="+", help="One or more BAM files to check.")
args = parser.parse_args()
main(args)
13 changes: 12 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,12 @@
samtools==1.15.1
certifi==2023.11.17
charset-normalizer==3.3.2
idna==3.6
numpy==1.24.4
pandas==2.0.3
pysam==0.22.0
python-dateutil==2.8.2
pytz==2023.3.post1
requests==2.31.0
six==1.16.0
tzdata==2023.4
urllib3==2.1.0