From 512b378f13a33c778e382e6f0a3180992692160d Mon Sep 17 00:00:00 2001 From: Chanhee Park Date: Thu, 6 Feb 2020 16:43:44 -0600 Subject: [PATCH 1/5] Remove hisatgenotype codes --- Makefile | 2 - hisatgenotype.py | 490 --- hisatgenotype_build_genome.py | 505 ---- hisatgenotype_extract_reads.py | 541 ---- hisatgenotype_extract_vars.py | 1299 -------- hisatgenotype_hla_cyp.py | 1671 ----------- hisatgenotype_locus.py | 2631 ----------------- hisatgenotype_modules/__init__.py | 0 .../hisatgenotype_assembly_graph.py | 1902 ------------ .../hisatgenotype_typing_common.py | 1552 ---------- hisatgenotype_scripts/compare_HLA.py | 147 - hisatgenotype_scripts/compare_HLA_Omixon.py | 129 - hisatgenotype_scripts/extract_Omixon_HLA.py | 115 - .../hisatgenotype_HLA_genotyping_PGs.py | 199 -- .../hisatgenotype_convert_codis.py | 654 ---- .../hisatgenotype_extract_codis_data.py | 166 -- .../hisatgenotype_extract_cyp_data.py | 1061 ------- .../hisatgenotype_locus_samples.py | 354 --- hisatgenotype_scripts/run_extract_CP.sh | 11 - hisatgenotype_scripts/run_extract_ILMN.sh | 11 - hisatgenotype_scripts/run_genotype_build.sh | 10 - hisatgenotype_scripts/run_hisat2_build.sh | 10 - hisatgenotype_scripts/run_type_CP.sh | 10 - 23 files changed, 13470 deletions(-) delete mode 100755 hisatgenotype.py delete mode 100755 hisatgenotype_build_genome.py delete mode 100755 hisatgenotype_extract_reads.py delete mode 100755 hisatgenotype_extract_vars.py delete mode 100755 hisatgenotype_hla_cyp.py delete mode 100755 hisatgenotype_locus.py delete mode 100644 hisatgenotype_modules/__init__.py delete mode 100755 hisatgenotype_modules/hisatgenotype_assembly_graph.py delete mode 100755 hisatgenotype_modules/hisatgenotype_typing_common.py delete mode 100755 hisatgenotype_scripts/compare_HLA.py delete mode 100755 hisatgenotype_scripts/compare_HLA_Omixon.py delete mode 100755 hisatgenotype_scripts/extract_Omixon_HLA.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_convert_codis.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_extract_codis_data.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_locus_samples.py delete mode 100755 hisatgenotype_scripts/run_extract_CP.sh delete mode 100755 hisatgenotype_scripts/run_extract_ILMN.sh delete mode 100755 hisatgenotype_scripts/run_genotype_build.sh delete mode 100755 hisatgenotype_scripts/run_hisat2_build.sh delete mode 100755 hisatgenotype_scripts/run_type_CP.sh diff --git a/Makefile b/Makefile index b8399819..60445ce6 100644 --- a/Makefile +++ b/Makefile @@ -237,8 +237,6 @@ HT2LIB_PKG_SRC = \ GENERAL_LIST = $(wildcard scripts/*.sh) \ $(wildcard scripts/*.pl) \ $(wildcard *.py) \ - $(wildcard hisatgenotype_modules/*.py) \ - $(wildcard hisatgenotype_scripts/*.py) \ $(wildcard example/index/*.ht2) \ $(wildcard example/reads/*.fa) \ example/reference/22_20-21M.fa \ diff --git a/hisatgenotype.py b/hisatgenotype.py deleted file mode 100755 index cf433b48..00000000 --- a/hisatgenotype.py +++ /dev/null @@ -1,490 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re, resource -import inspect, random -import math -from datetime import datetime, date, time -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -Align reads, and sort the alignments into a BAM file -""" -def align_reads(base_fname, - read_fnames, - fastq, - threads, - verbose): - aligner_cmd = ["hisat2", - "--no-unal", - "-p", str(threads), - "--no-spliced-alignment", - "--max-altstried", "64"] - aligner_cmd += ["-X", "1000"] - # aligner_cmd += ["--mm"] - aligner_cmd += ["-x", "%s" % base_fname] - - assert len(read_fnames) > 0 - if not fastq: - aligner_cmd += ["-f"] - single = len(read_fnames) == 1 - if single: - aligner_cmd += ["-U", read_fnames[0]] - else: - aligner_cmd += ["-1", read_fnames[0], - "-2", read_fnames[1]] - - out_base_fname = read_fnames[0].split('/')[-1].split('.')[0] - - print >> sys.stderr, "%s Aligning %s to %s ..." % (str(datetime.now()), ' '.join(read_fnames), base_fname) - if verbose: - print >> sys.stderr, "\t%s" % (' '.join(aligner_cmd)) - - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - unsorted_bam_fname = "%s_unsorted.bam" % out_base_fname - sambam_cmd = ["samtools", - "view", - "-bS", - "-"] - sambam_proc = subprocess.Popen(sambam_cmd, - stdin=align_proc.stdout, - stdout=open(unsorted_bam_fname, 'w')) - sambam_proc.communicate() - - # Increase the maximum number of files that can be opened - resource.setrlimit(resource.RLIMIT_NOFILE, (10000, 10240)) - - print >> sys.stderr, "%s Sorting %s ..." % (str(datetime.now()), unsorted_bam_fname) - bam_fname = "%s.bam" % out_base_fname - bamsort_cmd = ["samtools", - "sort", - "--threads", str(threads), - "-m", "1536M", - unsorted_bam_fname, - "-o", bam_fname] - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(bamsort_cmd) - bamsort_proc = subprocess.call(bamsort_cmd) - os.remove(unsorted_bam_fname) - - index_bam(bam_fname, - verbose) - - return bam_fname - - -""" -""" -def index_bam(bam_fname, - verbose): - print >> sys.stderr, "%s Indexing %s ..." % (str(datetime.now()), bam_fname) - bamindex_cmd = ["samtools", - "index", - bam_fname] - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(bamindex_cmd) - bamindex_proc = subprocess.call(bamindex_cmd) - - -""" -""" -def extract_reads(bam_fname, - chr, - left, - right, - read_base_fname, # sample => sample.1.fq.gz and sample.2.fq.gz - paired, - fastq, - verbose): - out_read_dname = "hisatgenotype_out" - if not os.path.exists(out_read_dname): - os.mkdir(out_read_dname) - - read_fnames = [] - if paired: - read_fnames = [out_read_dname + "/" + read_base_fname + ".1.fq.gz", - out_read_dname + "/" + read_base_fname + ".2.fq.gz"] - else: - read_fnames = [out_read_dname + "/" + read_base_fname + ".fq.gz"] - - if paired: - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open(read_fnames[0], 'w'), - stderr=open("/dev/null", 'w')) - - gzip2_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open(read_fnames[1], 'w'), - stderr=open("/dev/null", 'w')) - else: - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open(read_fnames[0], 'w'), - stderr=open("/dev/null", 'w')) - - def write_read(gzip_proc, read_name, seq, qual): - if fastq: - gzip_proc.stdin.write("@%s\n" % read_name) - gzip_proc.stdin.write("%s\n" % seq) - gzip_proc.stdin.write("+\n") - gzip_proc.stdin.write("%s\n" % qual) - else: - gzip_proc.stdin.write(">%s\n" % prev_read_name) - gzip_proc.stdin.write("%s\n" % seq) - - bamview_cmd = ["samtools", "view", bam_fname, "%s:%d-%d" % (chr, left+1, right+1)] - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(bamview_cmd) - bamview_proc = subprocess.Popen(bamview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - prev_read_name, extract_read, read1, read2 = "", False, [], [] - for line in alignview_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11] - flag, pos = int(flag), int(pos) - strand = '-' if flag & 0x10 else '+' - AS, NH = "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - elif col.startswith("NH"): - NH = int(col[5:]) - - # DK - check this out - simulation = True - if (not simulation and read_name != prev_read_name) or \ - (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]): - if extract_read: - if paired: - if len(read1) == 2 and len(read2) == 2: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - write_read(gzip2_proc, prev_read_name, read2[0], read2[1]) - else: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - prev_read_name, extract_read, read1, read2 = read_name, False, [], [] - - if NH == 1: - extract_read = True - - if flag & 0x40 or not paired: # left read - if not read1: - if flag & 0x10: # reverse complement - read1 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read1 = [read, qual] - else: - assert flag & 0x80 # right read - if flag & 0x10: # reverse complement - read2 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read2 = [read, qual] - - if extract_read: - if paired: - if len(read1) == 2 and len(read2) == 2: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - write_read(gzip2_proc, prev_read_name, read2[0], read2[1]) - else: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - - gzip1_proc.stdin.close() - if paired: - gzip2_proc.stdin.close() - - return read_fnames - - -""" -""" -def perform_genotyping(base_fname, - database, - locus_list, - read_fnames, - fastq, - num_editdist, - assembly, - local_database, - threads, - verbose): - genotype_cmd = ["hisatgenotype_locus.py"] - if not local_database: - genotype_cmd += ["--genotype-genome", base_fname] - genotype_cmd += ["--base", database] - if len(locus_list) > 0: - genotype_cmd += ["--locus-list", ','.join(locus_list)] - genotype_cmd += ["-p", str(threads), - "--num-editdist", str(num_editdist)] - if not fastq: - genotype_cmd += ["-f"] - - if len(read_fnames) == 2: # paired - genotype_cmd += ["-1", read_fnames[0], - "-2", read_fnames[1]] - elif len(read_fnames) == 1: - genotype_cmd += ["-U", read_fnames[0]] - else: - assert len(read_fnames) == 0 - - if assembly: - genotype_cmd += ["--assembly"] - - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(genotype_cmd) - genotype_proc = subprocess.Popen(genotype_cmd) - genotype_proc.communicate() - - -""" -""" -def genotype(base_fname, - target_region_list, - fastq, - read_fnames, - alignment_fname, - threads, - num_editdist, - assembly, - local_database, - verbose, - debug): - # variants, backbone sequence, and other sequeces - genotype_fnames = ["%s.fa" % base_fname, - "%s.locus" % base_fname, - "%s.snp" % base_fname, - "%s.index.snp" % base_fname, - "%s.haplotype" % base_fname, - "%s.link" % base_fname, - "%s.coord" % base_fname, - "%s.clnsig" % base_fname] - # hisat2 graph index files - genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)] - if not typing_common.check_files(genotype_fnames): - print >> sys.stderr, "Error: some of the following files are missing!" - for fname in genotype_fnames: - print >> sys.stderr, "\t%s" % fname - sys.exit(1) - - # Read region alleles (names and sequences) - regions, region_loci = {}, {} - for line in open("%s.locus" % base_fname): - family, allele_name, chr, left, right = line.strip().split()[:5] - family = family.lower() - if len(target_region_list) > 0 and \ - family not in target_region_list: - continue - - locus_name = allele_name.split('*')[0] - if family in target_region_list and \ - len(target_region_list[family]) > 0 and \ - locus_name not in target_region_list[family]: - continue - - left, right = int(left), int(right) - if family not in region_loci: - region_loci[family] = [] - region_loci[family].append([locus_name, allele_name, chr, left, right]) - - if len(region_loci) <= 0: - print >> sys.stderr, "Warning: no region exists!" - sys.exit(1) - - # Align reads, and sort the alignments into a BAM file - if len(read_fnames) > 0: - alignment_fname = align_reads(base_fname, - read_fnames, - fastq, - threads, - verbose) - assert alignment_fname != "" and os.path.exists(alignment_fname) - if not os.path.exists(alignment_fname + ".bai"): - index_bam(alignment_fname, - verbose) - assert os.path.exists(alignment_fname + ".bai") - - # Extract reads and perform genotyping - for family, loci in region_loci.items(): - print >> sys.stderr, "Analyzing %s ..." % family.upper() - for locus_name, allele_name, chr, left, right in loci: - out_read_fname = "%s.%s" % (family, locus_name) - if verbose: - print >> sys.stderr, "\tExtracting reads beloning to %s-%s ..." % \ - (family, locus_name) - - extracted_read_fnames = extract_reads(alignment_fname, - chr, - left, - right, - out_read_fname, - len(read_fnames) != 1, # paired? - fastq, - verbose) - - perform_genotyping(base_fname, - family, - [locus_name], - extracted_read_fnames, - fastq, - num_editdist, - assembly, - local_database, - threads, - verbose) - print >> sys.stderr - - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='HISAT-genotype') - parser.add_argument("--base", "--base-name", - dest="base_fname", - type=str, - default="genotype_genome", - help="base filename for genotype genome") - parser.add_argument("--region-list", - dest="region_list", - type=str, - default="", - help="A comma-separated list of regions (default: empty)") - parser.add_argument("-f", "--fasta", - dest='fastq', - action='store_false', - help='FASTA file') - parser.add_argument("-U", - dest="read_fname_U", - type=str, - default="", - help="filename for single-end reads") - parser.add_argument("-1", - dest="read_fname_1", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("-2", - dest="read_fname_2", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("--alignment-file", - dest="alignment_fname", - type=str, - default="", - help="Sorted BAM alignment file name") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--num-editdist", - dest="num_editdist", - type=int, - default=2, - help="Maximum number of mismatches per read alignment to be considered (default: 2)") - parser.add_argument('--assembly', - dest='assembly', - action='store_true', - help='Perform assembly') - parser.add_argument('--local-database', - dest='local_database', - action='store_true', - help='Use local database') - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument("--debug", - dest="debug", - type=str, - default="", - help="e.g., test_id:10,read_id:10000,basic_test") - - args = parser.parse_args() - region_list = {} - if args.region_list != "": - for region in args.region_list.split(','): - region = region.split('.') - if len(region) < 1 or len(region) > 2: - print >> sys.stderr, "Error: --region-list is incorrectly formatted." - sys.exit(1) - - family = region[0].lower() - if len(region) == 2: - locus_name = region[1].upper() - if family not in region_list: - region_list[family] = set() - if len(region) == 2: - region_list[family].add(locus_name) - - read_fnames = [] - if args.alignment_fname != "": - if not os.path.exists(args.alignment_fname): - print >> sys.stderr, "Error: %s does not exist." % args.alignment_fname - elif args.read_fname_U != "": - read_fnames = [args.read_fname_U] - else: - if args.read_fname_1 == "" or args.read_fname_2 == "": - print >> sys.stderr, "Error: please specify read file names correctly: -U or -1 and -2" - sys.exit(1) - read_fnames = [args.read_fname_1, args.read_fname_2] - - debug = {} - if args.debug != "": - for item in args.debug.split(','): - if ':' in item: - key, value = item.split(':') - debug[key] = value - else: - debug[item] = 1 - - genotype(args.base_fname, - region_list, - args.fastq, - read_fnames, - args.alignment_fname, - args.threads, - args.num_editdist, - args.assembly, - args.local_database, - args.verbose, - debug) - - diff --git a/hisatgenotype_build_genome.py b/hisatgenotype_build_genome.py deleted file mode 100755 index 3d103d92..00000000 --- a/hisatgenotype_build_genome.py +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2016, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import shutil -import inspect -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -""" -def read_clnsig(fname): - clnsig_dic = {} - for line in open(fname): - var_id, gene, clnsig = line.strip().split('\t') - clnsig_dic[var_id] = [gene, clnsig] - return clnsig_dic - - -""" -""" -def build_genotype_genome(base_fname, - inter_gap, - intra_gap, - threads, - database_list, - use_clinvar, - use_commonvar, - aligner, - graph_index, - verbose): - # Download HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index() - - # Load genomic sequences - chr_dic, chr_names, chr_full_names = typing_common.read_genome(open("genome.fa")) - - genotype_vars, genotype_haplotypes, genotype_clnsig = {}, {}, {} - if use_clinvar: - # Extract variants from the ClinVar database - CLINVAR_fnames = ["clinvar.vcf.gz", - "clinvar.snp", - "clinvar.haplotype", - "clinvar.clnsig"] - - if not typing_common.check_files(CLINVAR_fnames): - if not os.path.exists("clinvar.vcf.gz"): - os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive/2017/clinvar_20170404.vcf.gz") - assert os.path.exists("clinvar.vcf.gz") - - extract_cmd = ["hisat2_extract_snps_haplotypes_VCF.py"] - extract_cmd += ["--inter-gap", str(inter_gap), - "--intra-gap", str(intra_gap), - "--genotype-vcf", "clinvar.vcf.gz", - "genome.fa", "/dev/null", "clinvar"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not typing_common.check_files(CLINVAR_fnames): - print >> sys.stderr, "Error: extract variants from clinvar failed!" - sys.exit(1) - - # Read variants to be genotyped - genotype_vars = typing_common.read_variants("clinvar.snp") - - # Read haplotypes - genotype_haplotypes = typing_common.read_haplotypes("clinvar.haplotype") - - # Read information about clinical significance - genotype_clnsig = typing_common.read_clnsig("clinvar.clnsig") - - if use_commonvar: - # Extract variants from dbSNP database - commonvar_fbase = "snp144Common" - commonvar_fnames = ["%s.snp" % commonvar_fbase, - "%s.haplotype" % commonvar_fbase] - if not typing_common.check_files(commonvar_fnames): - if not os.path.exists("%s.txt.gz" % commonvar_fbase): - os.system("wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/%s.txt.gz" % commonvar_fbase) - assert os.path.exists("%s.txt.gz" % commonvar_fbase) - os.system("gzip -cd %s.txt.gz | awk 'BEGIN{OFS=\"\t\"} {if($2 ~ /^chr/) {$2 = substr($2, 4)}; if($2 == \"M\") {$2 = \"MT\"} print}' > %s.txt" % (commonvar_fbase, commonvar_fbase)) - extract_cmd = ["hisat2_extract_snps_haplotypes_UCSC.py", - "--inter-gap", str(inter_gap), - "--intra-gap", str(intra_gap), - "genome.fa", "%s.txt" % commonvar_fbase, commonvar_fbase] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not typing_common.check_files(commonvar_fnames): - print >> sys.stderr, "Error: extract variants from clinvar failed!" - sys.exit(1) - - # Read variants to be genotyped - genotype_vars = typing_common.read_variants("%s.snp" % commonvar_fbase) - - # Read haplotypes - genotype_haplotypes = typing_common.read_haplotypes("%s.haplotype" % commonvar_fbase) - - # Genes to be genotyped - genotype_genes = {} - - # Read genes or genomics regions - for database_name in database_list: - # Extract HLA variants, backbone sequence, and other sequeces - typing_common.extract_database_if_not_exists(database_name, - [], # locus_list - inter_gap, - intra_gap, - True, # partial? - verbose) - locus_fname = "%s.locus" % database_name - assert os.path.exists(locus_fname) - for line in open(locus_fname): - locus_name, chr, left, right, length, exon_str, strand = line.strip().split() - left, right = int(left), int(right) - length = int(length) - if chr not in chr_names: - continue - if chr not in genotype_genes: - genotype_genes[chr] = [] - genotype_genes[chr].append([left, right, length, locus_name, database_name, exon_str, strand]) - - # Write genotype genome - var_num, haplotype_num = 0, 0 - genome_out_file = open("%s.fa" % base_fname, 'w') - locus_out_file = open("%s.locus" % base_fname, 'w') - var_out_file = open("%s.snp" % base_fname, 'w') - index_var_out_file = open("%s.index.snp" % base_fname, 'w') - haplotype_out_file = open("%s.haplotype" % base_fname, 'w') - link_out_file = open("%s.link" % base_fname, 'w') - coord_out_file = open("%s.coord" % base_fname, 'w') - clnsig_out_file = open("%s.clnsig" % base_fname, 'w') - for c in range(len(chr_names)): - chr = chr_names[c] - chr_full_name = chr_full_names[c] - assert chr in chr_dic - chr_seq = chr_dic[chr] - chr_len = len(chr_seq) - if chr in genotype_genes: - chr_genes = genotype_genes[chr] - def gene_cmp(a, b): - a_left, a_right, a_length = a[:3] - b_left, b_right, b_length = b[:3] - if a_left != b_left: - return a_left - b_left - if a_right != b_right: - return a_right - b_right - return a_lenght - b_length - chr_genes = sorted(chr_genes, cmp=gene_cmp) - else: - chr_genes = [] - - chr_genotype_vars, chr_genotype_vari = [], 0 - if graph_index: - if chr in genotype_vars: - chr_genotype_vars = genotype_vars[chr] - chr_genotype_haplotypes, chr_genotype_hti = [], 0 - if chr in genotype_haplotypes: - chr_genotype_haplotypes = genotype_haplotypes[chr] - - def add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num): - # Output variants with clinical significance - while chr_genotype_vari < len(chr_genotype_vars): - var_left, var_type, var_data, var_id = chr_genotype_vars[chr_genotype_vari] - var_right = var_left - if var_type == "deletion": - var_right += var_data - if var_right > right: - break - if var_right >= left: - chr_genotype_vari += 1 - continue - - out_str = "%s\t%s\t%s\t%d\t%s" % (var_id, var_type, chr, var_left + off, var_data) - print >> var_out_file, out_str - print >> index_var_out_file, out_str - - if var_id in genotype_clnsig: - var_gene, clnsig = genotype_clnsig[var_id] - print >> clnsig_out_file, "%s\t%s\t%s" % \ - (var_id, var_gene, clnsig) - - chr_genotype_vari += 1 - - # Output haplotypes - while chr_genotype_hti < len(chr_genotype_haplotypes): - ht_left, ht_right, ht_vars = chr_genotype_haplotypes[chr_genotype_hti] - if ht_right > right: - break - if ht_right >= left: - chr_genotype_hti += 1 - continue - - print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (haplotype_num, chr, ht_left + off, ht_right + off, ','.join(ht_vars)) - chr_genotype_hti += 1 - haplotype_num += 1 - - return chr_genotype_vari, chr_genotype_hti, haplotype_num - - out_chr_seq = "" - - off = 0 - prev_right = 0 - for gene in chr_genes: - left, right, length, name, family, exon_str, strand = gene - - if not graph_index: - # Output gene (genotype_genome.gene) - print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \ - (family.upper(), name, chr, left, right, exon_str, strand) - continue - - chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num) - - # Read HLA backbone sequences - allele_seqs = typing_common.read_allele_sequences("%s_backbone.fa" % family) - - # Read HLA variants - allele_vars = typing_common.read_variants("%s.snp" % family) - allele_index_vars = typing_common.read_variants("%s.index.snp" % family) - - # Read HLA haplotypes - allele_haplotypes = typing_common.read_haplotypes("%s.haplotype" % family) - - # Read HLA link information between haplotypes and variants - links = typing_common.read_links("%s.link" % family) - - if name not in allele_seqs: - continue - if name not in allele_vars or name not in allele_index_vars: - vars, index_vars = [], [] - else: - vars, index_vars = allele_vars[name], allele_index_vars[name] - - allele_seq = allele_seqs[name] - index_var_ids = set() - for _, _, _, var_id in index_vars: - index_var_ids.add(var_id) - - if name not in allele_haplotypes: - haplotypes = [] - else: - haplotypes = allele_haplotypes[name] - assert length == len(allele_seq) - assert left < chr_len and right < chr_len - # Skipping overlapping genes - if left < prev_right: - print >> sys.stderr, "Warning: skipping %s ..." % (name) - continue - - varID2htID = {} - - assert left < right - prev_length = right - left + 1 - assert prev_length <= length - - if prev_right < left: - out_chr_seq += chr_seq[prev_right:left] - - # Output gene (genotype_genome.gene) - print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \ - (family.upper(), name, chr, len(out_chr_seq), len(out_chr_seq) + length - 1, exon_str, strand) - - # Output coord (genotype_genome.coord) - print >> coord_out_file, "%s\t%d\t%d\t%d" % \ - (chr, len(out_chr_seq), left, right - left + 1) - out_chr_seq += allele_seq - - # Output variants (genotype_genome.snp and genotype_genome.index.snp) - for var in vars: - var_left, var_type, var_data, var_id = var - new_var_id = "hv%d" % var_num - varID2htID[var_id] = new_var_id - new_var_left = var_left + left + off - assert var_type in ["single", "deletion", "insertion"] - assert new_var_left < len(out_chr_seq) - if var_type == "single": - assert out_chr_seq[new_var_left] != var_data - elif var_type == "deletion": - assert new_var_left + var_data <= len(out_chr_seq) - else: - assert var_type == "insertion" - - out_str = "%s\t%s\t%s\t%d\t%s" % (new_var_id, var_type, chr, new_var_left, var_data) - print >> var_out_file, out_str - if var_id in index_var_ids: - print >> index_var_out_file, out_str - var_num += 1 - - # Output haplotypes (genotype_genome.haplotype) - for haplotype in haplotypes: - ht_left, ht_right, ht_vars = haplotype - new_ht_left = ht_left + left + off - assert new_ht_left < len(out_chr_seq) - new_ht_right = ht_right + left + off - assert new_ht_left <= new_ht_right - assert new_ht_right <= len(out_chr_seq) - new_ht_vars = [] - for var_id in ht_vars: - assert var_id in varID2htID - new_ht_vars.append(varID2htID[var_id]) - print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (haplotype_num, chr, new_ht_left, new_ht_right, ','.join(new_ht_vars)) - haplotype_num += 1 - - # Output link information between alleles and variants (genotype_genome.link) - for link in links: - var_id, allele_names = link - if var_id not in varID2htID: - continue - new_var_id = varID2htID[var_id] - print >> link_out_file, "%s\t%s" % (new_var_id, allele_names) - - off += (length - prev_length) - - prev_right = right + 1 - - if not graph_index: - continue - - # Write the rest of the Vars - chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(sys.maxint, sys.maxint, chr_genotype_vari, chr_genotype_hti, haplotype_num) - - print >> coord_out_file, "%s\t%d\t%d\t%d" % \ - (chr, len(out_chr_seq), prev_right, len(chr_seq) - prev_right) - out_chr_seq += chr_seq[prev_right:] - - assert len(out_chr_seq) == len(chr_seq) + off - - # Output chromosome sequence - print >> genome_out_file, ">%s" % (chr_full_name) - line_width = 60 - for s in range(0, len(out_chr_seq), line_width): - print >> genome_out_file, out_chr_seq[s:s+line_width] - - genome_out_file.close() - locus_out_file.close() - var_out_file.close() - index_var_out_file.close() - haplotype_out_file.close() - link_out_file.close() - coord_out_file.close() - clnsig_out_file.close() - - allele_out_file = open("%s.allele" % base_fname, 'w') - if graph_index: - for database in database_list: - for line in open("%s.allele" % database): - allele_name = line.strip() - print >> allele_out_file, "%s\t%s" % (database.upper(), allele_name) - allele_out_file.close() - - partial_out_file = open("%s.partial" % base_fname, 'w') - if graph_index: - for database in database_list: - for line in open("%s.partial" % database): - allele_name = line.strip() - print >> partial_out_file, "%s\t%s" % (database.upper(), allele_name) - partial_out_file.close() - - if not graph_index: - shutil.copyfile("genome.fa", "%s.fa" % base_fname) - - # Index genotype_genome.fa - index_cmd = ["samtools", "faidx", "%s.fa" % base_fname] - subprocess.call(index_cmd) - - # Build indexes based on the above information - if graph_index: - assert aligner == "hisat2" - build_cmd = ["hisat2-build", - "-p", str(threads), - "--snp", "%s.index.snp" % base_fname, - "--haplotype", "%s.haplotype" % base_fname, - "%s.fa" % base_fname, - "%s" % base_fname] - else: - assert aligner in ["hisat2", "bowtie2"] - build_cmd = ["%s-build" % aligner, - "-p" if aligner == "hisat2" else "--threads", str(threads), - "%s.fa" % base_fname, - "%s" % base_fname] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) - - subprocess.call(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - - if aligner == "hisat2": - index_fnames = ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)] - else: - index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)] - index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)] - if not typing_common.check_files(index_fnames): - print >> sys.stderr, "Error: indexing failed! Perhaps, you may have forgotten to build %s executables?" % aligner - sys.exit(1) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Build genotype genome") - parser.add_argument("--base", "--base-fname", - dest="base_fname", - type=str, - default="genotype_genome", - help="base filename for genotype genome (default: genotype_genome)") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--database-list", - dest="database_list", - type=str, - default="", - help="A comma-separated list of databases (default: hla,codis,cyp)") - parser.add_argument("--commonvar", - dest="use_commonvar", - action="store_true", - help="Include common variants from dbSNP") - parser.add_argument("--clinvar", - dest="use_clinvar", - action="store_true", - help="Include variants from ClinVar database") - parser.add_argument("--inter-gap", - dest="inter_gap", - type=int, - default=30, - help="Maximum distance for variants to be in the same haplotype") - parser.add_argument("--intra-gap", - dest="intra_gap", - type=int, - default=50, - help="Break a haplotype into several haplotypes") - parser.add_argument("--aligner", - dest="aligner", - type=str, - default="hisat2", - help="Aligner (default: hisat2)") - parser.add_argument("--linear-index", - dest="graph_index", - action="store_false", - help="Build linear index") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.inter_gap > args.intra_gap: - print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap) - sys.exit(1) - - if args.database_list == "": - database_list = [] - else: - database_list = args.database_list.split(',') - - if args.use_clinvar and args.use_commonvar: - print >> sys.stderr, "Error: both --clinvar and --commonvar cannot be used together." - sys.exit(1) - - if args.aligner not in ["hisat2", "bowtie2"]: - print >> sys.stderr, "Error: --aligner should be either hisat2 or bowtie2." - sys.exit(1) - - build_genotype_genome(args.base_fname, - args.inter_gap, - args.intra_gap, - args.threads, - database_list, - args.use_clinvar, - args.use_commonvar, - args.aligner, - args.graph_index, - args.verbose) - diff --git a/hisatgenotype_extract_reads.py b/hisatgenotype_extract_reads.py deleted file mode 100755 index 98215655..00000000 --- a/hisatgenotype_extract_reads.py +++ /dev/null @@ -1,541 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re, resource -import inspect -import random -import glob -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -""" -def parallel_work(pids, - work, - fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose): - child = -1 - for i in range(len(pids)): - if pids[i] == 0: - child = i - break - - while child == -1: - status = os.waitpid(0, 0) - for i in range(len(pids)): - if status[0] == pids[i]: - child = i - pids[i] = 0 - break - - child_id = os.fork() - if child_id == 0: - work(fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose) - os._exit(os.EX_OK) - else: - # print >> sys.stderr, '\t\t>> thread %d: %d' % (child, child_id) - pids[child] = child_id - - -""" -""" -def wait_pids(pids): - for pid in pids: - if pid > 0: - os.waitpid(pid, 0) - - -""" -""" -def extract_reads(base_fname, - database_list, - read_dir, - out_dir, - suffix, - read_fname, - fastq, - paired, - simulation, - threads, - threads_aprocess, - max_sample, - job_range, - aligner, - block_size, - verbose): - if block_size > 0: - resource.setrlimit(resource.RLIMIT_NOFILE, (1000, 1000)) - resource.setrlimit(resource.RLIMIT_NPROC, (1000, 1000)) - - genotype_fnames = ["%s.fa" % base_fname, - "%s.locus" % base_fname, - "%s.snp" % base_fname, - "%s.haplotype" % base_fname, - "%s.link" % base_fname, - "%s.coord" % base_fname, - "%s.clnsig" % base_fname] - # graph index files - if aligner == "hisat2": - genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)] - else: - assert aligner == "bowtie2" - genotype_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)] - genotype_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)] - - if not typing_common.check_files(genotype_fnames): - print >> sys.stderr, "Error: %s related files do not exist as follows:" % base_fname - for fname in genotype_fnames: - print >> sys.stderr, "\t%s" % fname - sys.exit(1) - - filter_region = len(database_list) > 0 - ranges = [] - regions, region_loci = {}, {} - for line in open("%s.locus" % base_fname): - family, allele_name, chr, left, right = line.strip().split()[:5] - if filter_region and family.lower() not in database_list: - continue - region_name = "%s-%s" % (family, allele_name.split('*')[0]) - assert region_name not in regions - regions[region_name] = allele_name - left, right = int(left), int(right) - """ - exons = [] - for exon in exon_str.split(','): - exon_left, exon_right = exon.split('-') - exons.append([int(exon_left), int(exon_right)]) - """ - if chr not in region_loci: - region_loci[chr] = {} - region_loci[chr][region_name] = [allele_name, chr, left, right] - database_list.add(family.lower()) - - if out_dir != "" and not os.path.exists(out_dir): - os.mkdir(out_dir) - - # Extract reads - if len(read_fname) > 0: - if paired: - fq_fnames = [read_fname[0]] - fq_fnames2 = [read_fname[1]] - else: - fq_fnames = read_fname - else: - if paired: - fq_fnames = glob.glob("%s/*.1.%s" % (read_dir, suffix)) - else: - fq_fnames = glob.glob("%s/*.%s" % (read_dir, suffix)) - count = 0 - pids = [0 for i in range(threads)] - for file_i in range(len(fq_fnames)): - if file_i >= max_sample: - break - fq_fname = fq_fnames[file_i] - if job_range[1] > 1: - if job_range[0] != (file_i % job_range[1]): - continue - - fq_fname_base = fq_fname.split('/')[-1] - one_suffix = ".1." + suffix - if fq_fname_base.find(one_suffix) != -1: - fq_fname_base = fq_fname_base[:fq_fname_base.find(one_suffix)] - else: - fq_fname_base = fq_fname_base.split('.')[0] - - if paired: - if read_dir == "": - fq_fname2 = fq_fnames2[file_i] - else: - fq_fname2 = "%s/%s.2.%s" % (read_dir, fq_fname_base, suffix) - if not os.path.exists(fq_fname2): - print >> sys.stderr, "%s does not exist." % fq_fname2 - continue - else: - fq_fname2 = "" - - if paired: - if out_dir != "": - if os.path.exists("%s/%s.extracted.1.fq.gz" % (out_dir, fq_fname_base)): - continue - else: - if out_dir != "": - if os.path.exists("%s/%s.extracted.fq.gz" % (out_dir, fq_fname_base)): - continue - count += 1 - - print >> sys.stderr, "\t%d: Extracting reads from %s" % (count, fq_fname_base) - def work(fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose): - aligner_cmd = [aligner] - if threads_aprocess > 1: - aligner_cmd += ["-p", "%d" % threads_aprocess] - if not fastq: - aligner_cmd += ["-f"] - aligner_cmd += ["-x", base_fname] - if aligner == "hisat2": - aligner_cmd += ["--no-spliced-alignment"] - # aligner_cmd += ["--max-altstried", "64"] - aligner_cmd += ["-X", "1000"] - if paired: - aligner_cmd += ["-1", fq_fname, - "-2", fq_fname2] - else: - aligner_cmd += ["-U", fq_fname] - if verbose: - print >> sys.stderr, "\t\trunning", ' '.join(aligner_cmd) - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - gzip_dic = {} - out_dir_slash = out_dir - if out_dir != "": - out_dir_slash += "/" - for database in database_list: - if paired: - # LP6005041-DNA_A01.extracted.1.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'), - stderr=open("/dev/null", 'w')) - - # LP6005041-DNA_A01.extracted.2.fq.gz - gzip2_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'), - stderr=open("/dev/null", 'w')) - else: - # LP6005041-DNA_A01.extracted.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.extracted.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'), - stderr=open("/dev/null", 'w')) - gzip_dic[database] = [gzip1_proc, gzip2_proc if paired else None] - - whole_gzip_dic = {} - if block_size > 0: - mult = block_size / 1000000 - for chr_line in open("%s.fa.fai" % base_fname): - chr, length = chr_line.strip().split('\t')[:2] - length = int(length) - if chr not in [str(i+1) for i in range(22)] + ['X', 'Y', 'MT']: - continue - length = (length + block_size - 1) / block_size - assert chr not in whole_gzip_dic - whole_gzip_dic[chr] = [] - for region_i in range(length): - if paired: - # LP6005041-DNA_A01.extracted.1.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.%d_%dM.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'), - stderr=open("/dev/null", 'w')) - - # LP6005041-DNA_A01.extracted.2.fq.gz - gzip2_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.%d_%dM.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'), - stderr=open("/dev/null", 'w')) - else: - # LP6005041-DNA_A01.extracted.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.%d_%dM.extracted.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'), - stderr=open("/dev/null", 'w')) - whole_gzip_dic[chr].append([gzip1_proc, gzip2_proc if paired else None]) - - - def write_read(gzip_proc, read_name, seq, qual): - if fastq: - gzip_proc.stdin.write("@%s\n" % read_name) - gzip_proc.stdin.write("%s\n" % seq) - gzip_proc.stdin.write("+\n") - gzip_proc.stdin.write("%s\n" % qual) - else: - gzip_proc.stdin.write(">%s\n" % prev_read_name) - gzip_proc.stdin.write("%s\n" % seq) - - prev_read_name, extract_read, whole_extract_read, read1, read2, read1_first, read2_first = "", set(), set(), [], [], True, True - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11] - flag, pos = int(flag), int(pos) - 1 - strand = '-' if flag & 0x10 else '+' - AS, XS, NH = "", "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - elif col.startswith("XS"): - XS = int(col[5:]) - elif col.startswith("NH"): - NH = int(col[5:]) - - if (not simulation and read_name != prev_read_name) or \ - (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]): - for region in extract_read: - write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1]) - - for chr_region_num in whole_extract_read: - region_chr, region_num = chr_region_num.split('-') - region_num = int(region_num) - if region_chr not in whole_gzip_dic: - continue - - assert region_num < len(whole_gzip_dic[region_chr]) - write_read(whole_gzip_dic[region_chr][region_num][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(whole_gzip_dic[region_chr][region_num][1], prev_read_name, read2[0], read2[1]) - - prev_read_name, extract_read, whole_extract_read, read1, read2, read1_first, read2_first = read_name, set(), set(), [], [], True, True - - if flag & 0x4 == 0 and \ - ((aligner == "hisat2" and NH == 1) or (aligner == "bowtie2" and AS > XS and read1_first if flag & 0x40 or not paired else read2_first)): - if chr in region_loci: - for region, loci in region_loci[chr].items(): - region = region.split('-')[0].lower() - _, _, loci_left, loci_right = loci - # there might be a different candidate region for each of left and right reads - if pos >= loci_left and pos < loci_right: - extract_read.add(region) - break - if block_size > 0: - chr_region_num = "%s-%d" % (chr, pos / block_size) - whole_extract_read.add(chr_region_num) - - if flag & 0x40 or not paired: # left read - read1_first = False - if not read1: - if flag & 0x10: # reverse complement - read1 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read1 = [read, qual] - else: - assert flag & 0x80 # right read - read2_first = False - if flag & 0x10: # reverse complement - read2 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read2 = [read, qual] - - for region in extract_read: - write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1]) - - for chr_region_num in whole_extract_read: - region_chr, region_num = chr_region_num.split('-') - region_num = int(region_num) - if region_chr not in whole_gzip_dic: - continue - assert region_num < len(whole_gzip_dic[region_chr]) - write_read(whole_gzip_dic[region_chr][region_num][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(whole_gzip_dic[region_chr][region_num][1], prev_read_name, read2[0], read2[1]) - - for gzip1_proc, gzip2_proc in gzip_dic.values(): - gzip1_proc.stdin.close() - if paired: - gzip2_proc.stdin.close() - - for gzip_list in whole_gzip_dic.values(): - for gzip1_proc, gzip2_proc in gzip_list: - gzip1_proc.stdin.close() - if paired: - gzip2_proc.stdin.close() - - - if threads <= 1: - work(fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose) - else: - parallel_work(pids, - work, - fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose) - - if threads > 1: - wait_pids(pids) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='Extract reads') - parser.add_argument("--base", "--base-fname", - dest="base_fname", - type=str, - default="genotype_genome", - help="base filename for genotype genome") - parser.add_argument("--read-dir", - dest="read_dir", - type=str, - default="", - help="Directory name for read files") - parser.add_argument("--out-dir", - dest="out_dir", - type=str, - default="", - help="Directory name for extracted read files") - parser.add_argument("--suffix", - dest="suffix", - type=str, - default="fq.gz", - help="Read file suffix (Default: fq.gz)") - parser.add_argument('-f', '--fasta', - dest='fastq', - action='store_false', - help='FASTA format') - parser.add_argument("-U", - dest="read_fname_U", - type=str, - default="", - help="filename for single-end reads") - parser.add_argument("-1", - dest="read_fname_1", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("-2", - dest="read_fname_2", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("--database-list", - dest="database_list", - type=str, - default="", - help="A comma-separated list of database (default: empty)") - parser.add_argument('--simulation', - dest='simulation', - action='store_true', - help='Simulated reads (Default: False)') - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--pp", "--threads-aprocess", - dest="threads_aprocess", - type=int, - default=1, - help="Number of threads a process") - parser.add_argument("--max-sample", - dest="max_sample", - type=int, - default=sys.maxint, - help="Number of samples to be extracted (default: sys.maxint)") - parser.add_argument("--job-range", - dest="job_range", - type=str, - default="0,1", - help="two numbers (e.g. 1,3)") - parser.add_argument("--aligner", - dest="aligner", - type=str, - default="hisat2", - help="Aligner (default: hisat2)") - parser.add_argument("--extract-whole", - dest="extract_whole", - action='store_true', - help="Extract all reads") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - - args = parser.parse_args() - - database_list = set() - if args.database_list != "": - for region in args.database_list.split(','): - database_list.add(region) - if args.read_fname_U != "": - args.read_fname = [args.read_fname_U] - elif args.read_fname_1 != "" or args.read_fname_2 != "": - if args.read_fname_1 == "" or args.read_fname_2 == "": - print >> sys.stderr, "Error: please specify both -1 and -2." - sys.exit(1) - args.read_fname = [args.read_fname_1, args.read_fname_2] - else: - args.read_fname = [] - if len(args.read_fname) == 0: - if args.read_dir == "" or not os.path.exists(args.read_dir): - print >> sys.stderr, "Error: please specify --read-dir with an existing directory." - sys.exit(1) - if args.out_dir == "": - print >> sys.stderr, "Error: please specify --out-dir with a directory name." - sys.exit(1) - job_range = [] - for num in args.job_range.split(','): - job_range.append(int(num)) - - if args.aligner not in ["hisat2", "bowtie2"]: - print >> sys.stderr, "Error: --aligner should be either hisat2 or bowtie2." - sys.exit(1) - block_size = 20000000 if args.extract_whole else 0 - - extract_reads(args.base_fname, - database_list, - args.read_dir, - args.out_dir, - args.suffix, - args.read_fname, - args.fastq, - False if args.read_fname_U != "" else True, - args.simulation, - args.threads, - args.threads_aprocess, - args.max_sample, - job_range, - args.aligner, - block_size, - args.verbose) - diff --git a/hisatgenotype_extract_vars.py b/hisatgenotype_extract_vars.py deleted file mode 100755 index 4c673177..00000000 --- a/hisatgenotype_extract_vars.py +++ /dev/null @@ -1,1299 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2015, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import inspect -import glob -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -Mapping from base pair to a location in MSF format -""" -def create_map(seq): - seq_map = {} - count = 0 - for i in range(len(seq)): - bp = seq[i] - if bp == '.': - continue - assert bp in "ACGT" - seq_map[count] = i - count += 1 - return seq_map - - -""" -""" -def create_consensus_seq(seqs, - seq_len, - min_var_freq, - remove_empty = True): - consensus_freq = [[0, 0, 0, 0, 0] for i in range(seq_len)] - for i in range(len(seqs)): - seq = seqs[i] - if len(seq) != seq_len: - continue - for j in range(seq_len): - nt = seq[j] - assert nt in "ACGT.E" - if nt == 'A': - consensus_freq[j][0] += 1 - elif nt == 'C': - consensus_freq[j][1] += 1 - elif nt == 'G': - consensus_freq[j][2] += 1 - elif nt == 'T': - consensus_freq[j][3] += 1 - else: - assert nt in ".E" - consensus_freq[j][4] += 1 - - for j in range(len(consensus_freq)): - for k in range(len(consensus_freq[j])): - consensus_freq[j][k] /= float(len(seqs)) - consensus_freq[j][k] *= 100.0 - - consensus_seq = "" - has_empty = False - for c in range(len(consensus_freq)): - freq = consensus_freq[c] - A, C, G, T, E = freq - # No alleles have bases at this particular location - if E >= 100.0: - has_empty = True - consensus_seq += 'E' - continue - if E >= 100.0 - min_var_freq: - idx = 4 - else: - idx = freq.index(max(freq[:4])) - assert idx < 5 - consensus_seq += "ACGT."[idx] - consensus_seq = ''.join(consensus_seq) - - # Remove dots (deletions) - skip_pos = set() - if has_empty and remove_empty: - for seq_i in range(len(seqs)): - seqs[seq_i] = list(seqs[seq_i]) - for i in range(len(consensus_seq)): - if consensus_seq[i] != 'E': - continue - skip_pos.add(i) - for seq_i in range(len(seqs)): - if i >= len(seqs[seq_i]): - continue - seqs[seq_i][i] = 'E' - for seq_i in range(len(seqs)): - seqs[seq_i] = ''.join(seqs[seq_i]) - seqs[seq_i] = seqs[seq_i].replace('E', '') - consensus_seq = consensus_seq.replace('E', '') - - # Convert a list form of consensus_freq to a dictionary form - temp_freq = [] - for j in range(len(consensus_freq)): - if j in skip_pos: - continue - freq_dic = {} - for k in range(len(consensus_freq[j])): - freq = consensus_freq[j][k] - if freq <= 0.0: - continue - nt = "ACGT."[k] - freq_dic[nt] = freq - temp_freq.append(freq_dic) - consensus_freq = temp_freq - - assert len(consensus_seq) == len(consensus_freq) - return consensus_seq, consensus_freq - - - -""" -Left-shift deletions if poissble -""" -def leftshift_deletions(backbone_seq, seq, debug = False): - if len(seq) != len(backbone_seq): - return seq - seq = list(seq) - seq_len = len(seq) - bp_i = 0 - # Skip the first deletion - while bp_i < seq_len: - if seq[bp_i] in "ACGT": - break - bp_i += 1 - - while bp_i < seq_len: - bp = seq[bp_i] - if bp != '.': - bp_i += 1 - continue - bp_j = bp_i + 1 - while bp_j < seq_len: - bp2 = seq[bp_j] - if bp2 != '.': - break - else: - bp_j += 1 - - if bp_j >= seq_len: - bp_i = bp_j - break - - if debug: - print >> sys.stderr, bp_i, bp_j, backbone_seq[bp_i-10:bp_i], backbone_seq[bp_i:bp_j], backbone_seq[bp_j:bp_j+10] - print >> sys.stderr, bp_i, bp_j, ''.join(seq[bp_i-10:bp_i]), ''.join(seq[bp_i:bp_j]), ''.join(seq[bp_j:bp_j+10]) - prev_i, prev_j = bp_i, bp_j - - while bp_i > 0 and seq[bp_i-1] in "ACGT" and backbone_seq[bp_j-1] in "ACGT": - if seq[bp_i-1] != backbone_seq[bp_j-1]: - break - seq[bp_j-1] = seq[bp_i-1] - seq[bp_i-1] = '.' - bp_i -= 1 - bp_j -= 1 - bp_i = bp_j - while bp_i < seq_len: - if seq[bp_i] in "ACGT": - break - bp_i += 1 - - # DK - debugging purposes - if debug: - print prev_i, prev_j, ''.join(seq[prev_i-10:prev_i]), ''.join(seq[prev_i:prev_j]), ''.join(seq[prev_j:prev_j+10]) - - return ''.join(seq) - - -""" -""" -def extract_vars(base_fname, - base_dname, - locus_list, - inter_gap, - intra_gap, - whole_haplotype, - min_var_freq, - ext_seq_len, - leftshift, - partial, - verbose): - base_fullpath_name = base_fname - if base_dname != "" and not os.path.exists(base_dname): - os.mkdir(base_dname) - base_fullpath_name = "%s/%s" % (base_dname, base_fname) - - # Download human genome and HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index() - - # Corresponding genomic loci found by HISAT2 (reference is GRCh38) - # e.g. hisat2 --no-unal --score-min C,0 -x grch38/genome -f hisatgenotype_db/HLA/fasta/A_gen.fasta - locus_file = open(base_fullpath_name + ".locus", 'w') - left_ext_seq_dic, right_ext_seq_dic = {}, {} - genes, gene_strand = {}, {} - - # Clone a git repository, hisatgenotype_db - if not os.path.exists("hisatgenotype_db"): - typing_common.clone_hisatgenotype_database() - fasta_dname = "hisatgenotype_db/%s/fasta" % base_fname.upper() - - # Check HLA genes - gene_names = [] - if base_fname == "hla": - fasta_fnames = glob.glob("%s/*_gen.fasta" % fasta_dname) - else: - assert base_fname in ["codis", "cyp"] - fasta_fnames = glob.glob("%s/*.fasta" % fasta_dname) - for gen_fname in fasta_fnames: - gene_name = gen_fname.split('/')[-1].split('_')[0] - if gene_name == "hla": - continue - gene_names.append(gene_name) - - if locus_list == []: - locus_list = gene_names - - cigar_re = re.compile('\d+\w') - remove_locus_list = [] - for gene in locus_list: - aligner_cmd = ["hisat2"] - if base_fname in ["hla", "coids"]: - aligner_cmd += ["--score-min", "C,0"] - aligner_cmd += ["--no-unal", - "-x", "grch38/genome", - "-f", "%s/%s_gen.fasta" % (fasta_dname, gene)] - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - allele_id = "" - best_chr, best_left, best_right, best_AS, best_strand = "", -1, -1, -sys.maxint, '' - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - temp_allele_id, flag, chr, left, _, cigar_str = cols[:6] - left = int(left) - 1 - right = left - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - if len(cigars) > 1 or cigars[0][0] != 'M': - continue - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MND": - right += length - - flag = int(flag) - strand = '-' if flag & 0x10 else '+' - AS = "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = col[5:] - assert AS != "" - AS = int(AS) - if AS > best_AS: - allele_id = temp_allele_id - best_chr, best_left, best_right, best_AS, best_strand = chr, left, right, AS, strand - - chr, left, right, strand = best_chr, best_left, best_right, best_strand - align_proc.communicate() - if allele_id == "": - remove_locus_list.append(gene) - continue - if base_fname == "hla": - allele_name = "" - for line in open("%s/%s_gen.fasta" % (fasta_dname, gene)): - line = line.strip() - if not line.startswith('>'): - continue - tmp_allele_id, tmp_allele_name = line[1:].split()[:2] - if allele_id == tmp_allele_id: - allele_name = tmp_allele_name - break - else: - allele_name = allele_id - assert allele_name != "" and strand != '' - genes[gene] = allele_name - gene_strand[gene] = strand - print >> sys.stderr, "%s-%s's reference allele is %s on '%s' strand of chromosome %s" % \ - (base_fname.upper(), gene, allele_name, strand, chr) - - assert chr != "" and left >= 0 and right > left - if ext_seq_len > 0: - left_ext_seq, right_ext_seq = "", "" - left1, left2 = max(1, left - ext_seq_len), max(1, left - 1) - if left2 > 0: - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, left1, left2)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - left_ext_seq += line - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + ext_seq_len - 1)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - right_ext_seq += line - - if strand == '-': - left_ext_seq, right_ext_seq = typing_common.reverse_complement(right_ext_seq), typing_common.reverse_complement(left_ext_seq) - left_ext_seq_dic[gene], right_ext_seq_dic[gene] = left_ext_seq, right_ext_seq - - - # Extract exon information from hla.data - gene_exons, gene_exon_counts = {}, {} - if base_fname == "hla": - skip, look_exon_num = False, False - for line in open("hisatgenotype_db/%s/hla.dat" % base_fname.upper()): - if line.startswith("DE"): - allele_name = line.split()[1][:-1] - if allele_name.startswith("HLA-"): - allele_name = allele_name[4:] - gene = allele_name.split('*')[0] - if not gene in genes: - skip = True - else: - skip = False - if skip: - continue - if not line.startswith("FT"): - continue - - if line.find("exon") != -1: - look_exon_num = True - if allele_name == genes[gene]: - exon_range = line.split()[2].split("..") - exon_left, exon_right = int(exon_range[0]) - 1, int(exon_range[1]) - 1 - assert exon_left >= 0 - assert exon_left < exon_right - if not gene in gene_exons: - gene_exons[gene] = [] - if gene in left_ext_seq_dic: - left_ext_seq_len = len(left_ext_seq_dic[gene]) - else: - left_ext_seq_len = 0 - gene_exons[gene].append([exon_left + left_ext_seq_len, exon_right + left_ext_seq_len]) - elif look_exon_num: - assert line.find("number") - look_exon_num = False - num = line.strip().split("number=")[1] - num = int(num[1:-1]) - 1 - if gene not in gene_exon_counts: - gene_exon_counts[gene] = {} - if num not in gene_exon_counts[gene]: - gene_exon_counts[gene][num] = 1 - else: - gene_exon_counts[gene][num] += 1 - - for gene, exon_counts in gene_exon_counts.items(): - print >> sys.stderr, "%s exon counts:" % gene, exon_counts - - tmp_locus_list = [] - for gene in locus_list: - if gene in remove_locus_list: - continue - if base_fname == "hla" and gene not in gene_exons: - continue - tmp_locus_list.append(gene) - locus_list = tmp_locus_list - for key in genes.keys(): - if key in locus_list: - continue - del genes[key] - del gene_strand[key] - - # Write the backbone sequences into a fasta file - backbone_file = open(base_fullpath_name + "_backbone.fa", 'w') - # variants w.r.t the backbone sequences into a SNP file - var_file = open(base_fullpath_name + ".snp", 'w') - var_index_file = open(base_fullpath_name + ".index.snp", 'w') - # variant frequence - var_freq_file = open(base_fullpath_name + ".snp.freq", 'w') - # haplotypes - haplotype_file = open(base_fullpath_name + ".haplotype", 'w') - # pairs of a variant and the corresponding HLA allels into a LINK file - link_file = open(base_fullpath_name + ".link", 'w') - # Write all the sequences with dots removed into a file - input_file = open(base_fullpath_name + "_sequences.fa", 'w') - # Write allele names into a file - allele_file = open("%s.allele" % base_fullpath_name, 'w') - # Read partial alleles from hla.data, and write them into a file - partial_file = open("%s.partial" % base_fullpath_name, 'w') - - num_vars, num_haplotypes = 0, 0 - full_alleles = {} - for gene, ref_gene in genes.items(): - strand = gene_strand[gene] - left_ext_seq, right_ext_seq = "", "" - if gene in left_ext_seq_dic: - left_ext_seq, right_ext_seq = left_ext_seq_dic[gene], right_ext_seq_dic[gene] - - def read_MSF_file(fname, left_ext_seq = "", right_ext_seq = ""): - names = {} # HLA allele names to numeric IDs - seqs = [] # HLA multiple alignment sequences - for line in open(fname): - line = line.strip() - if not line or \ - not line[0].isalnum(): - continue - - if line.startswith("MSF"): - continue - - if line.startswith("Name"): - try: - name = line.split('\t')[0] - name = name.split()[1] - except ValueError: - continue - - if name in names: - print >> sys.stderr, "Warning: %s is found more than once in Names" % (name) - continue - - names[name] = len(names) - else: - if len(seqs) == 0: - seqs = [left_ext_seq for i in range(len(names))] - try: - cols = line.split() - name = cols[0] - fives = cols[1:] - assert len(fives) > 0 - except ValueError: - continue - - if name not in names: - names[name] = len(names) - - id = names[name] - if id >= len(seqs): - assert id == len(seqs) - seqs.append(left_ext_seq) - - seqs[id] += ''.join(fives) - - # Add sub-names of the allele - sub_name = "" - for group in name.split(':')[:-1]: - if sub_name != "": - sub_name += ":" - sub_name += group - if sub_name not in full_alleles: - full_alleles[sub_name] = [name] - else: - full_alleles[sub_name].append(name) - - if len(right_ext_seq) > 0: - for i_ in range(len(seqs)): - seqs[i_] += right_ext_seq - - return names, seqs - - if base_fname == "hla": - MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene) - else: - MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene) - - if not os.path.exists(MSA_fname): - print >> sys.stderr, "Warning: %s does not exist" % MSA_fname - continue - - names, seqs = read_MSF_file(MSA_fname, left_ext_seq, right_ext_seq) - full_allele_names = set(names.keys()) - - # Identify a consensus sequence - assert len(seqs) > 0 - - # Check sequences are of equal length - def find_seq_len(seqs): - seq_lens = {} - for s in range(len(seqs)): - seq_len = len(seqs[s]) - if seq_len not in seq_lens: - seq_lens[seq_len] = 1 - else: - seq_lens[seq_len] += 1 - - max_seq_count = 0 - for tmp_seq_len, tmp_seq_count in seq_lens.items(): - if tmp_seq_count > max_seq_count: - seq_len = tmp_seq_len - max_seq_count = tmp_seq_count - return seq_len - - seq_len = find_seq_len(seqs) - backbone_name = "%s*BACKBONE" % gene - backbone_seq, backbone_freq = create_consensus_seq(seqs, - seq_len, - min_var_freq, - not partial) # Remove empty sequences? - # Allele sequences can shrink, so readjust the sequence length - if not partial: - seq_len = find_seq_len(seqs) - - if partial and base_fname == "hla": - partial_MSA_fname = "hisatgenotype_db/HLA/msf/%s_nuc.msf" % gene - if not os.path.exists(partial_MSA_fname): - print >> sys.stderr, "Warning: %s does not exist" % partial_MSA_fname - continue - partial_names, partial_seqs = read_MSF_file(partial_MSA_fname) - - # DK - debugging purposes - # Partial alleles vs. Full alleles - """ - counts = [0, 0, 0, 0] - for partial_name in partial_names.keys(): - if partial_name in names: - continue - name_group = partial_name.split(':') - for group_i in [3, 2, 1, 0]: - if group_i == 0: - counts[group_i] += 1 - if group_i > len(name_group): - continue - sub_name = ':'.join(name_group[:group_i]) - if sub_name in full_alleles: - print partial_name, sub_name, full_alleles[sub_name][:5] - counts[group_i] += 1 - break - print "DK: counts:", counts - sys.exit(1) - """ - - ref_seq = seqs[names[ref_gene]] - ref_seq_map = create_map(ref_seq) - ref_partial_seq = partial_seqs[partial_names[ref_gene]] - ref_partial_seq_map = create_map(ref_partial_seq) - exons = gene_exons[gene] - exon_len = 0 - ref_exons = [] # converted exons to MSF file (e.g. A_gen.msf) - ref_partial_exons = [] # converted exons to MSF file (e.g. A_nuc.msf) - - complete = True - for exon in exons: - left, right = exon - ref_exons.append([ref_seq_map[left], ref_seq_map[right]]) - next_exon_len = right - left + exon_len - if next_exon_len >= len(ref_partial_seq_map): - print >> sys.stderr, "Warning: partial sequences (%s) seem to be incomplete" % gene - complete = False - break - ref_partial_exons.append([ref_partial_seq_map[exon_len], ref_partial_seq_map[next_exon_len]]) - exon_len += (right - left + 1) - # Make sure two MSF files (e.g. A_gen.msf and A_nuc.msf) share the same MSF lengths in the exonic sequences - ref_exon_len = ref_exons[-1][1] - ref_exons[-1][0] + 1 - ref_partial_exon_len = ref_partial_exons[-1][1] - ref_partial_exons[-1][0] + 1 - assert ref_exon_len == ref_partial_exon_len - - if complete: - partial_seq_len = find_seq_len(partial_seqs) - partial_backbone_seq, partial_backbone_freq = create_consensus_seq(partial_seqs, - partial_seq_len, - min_var_freq, - False) # Remove empty sequences? - for name, seq_id in partial_names.items(): - if name in names: - continue - seq = partial_seqs[seq_id] - new_seq = "" - right = 0 - for e in range(len(exons)): - ref_exon = ref_exons[e] - ref_partial_exon = ref_partial_exons[e] - new_seq += backbone_seq[right:ref_exon[0]] - exon_seq = seq[ref_partial_exon[0]:ref_partial_exon[1] + 1] - nt_exon_seq = exon_seq.replace('.', '') - if len(nt_exon_seq) == 0: - exon_seq = partial_backbone_seq[ref_partial_exon[0]:ref_partial_exon[1] + 1] - new_seq += exon_seq - right = ref_exon[1] + 1 - new_seq += backbone_seq[right:] - names[name] = len(seqs) - seqs.append(new_seq) - - backbone_seq, backbone_freq = create_consensus_seq(seqs, - seq_len, - min_var_freq, - True) # Remove empty sequences? - seq_len = find_seq_len(seqs) - - if min_var_freq <= 0.0: - assert '.' not in backbone_seq and 'E' not in backbone_seq - - # Reverse complement MSF if this gene is on '-' strand - if strand == '-': - # Reverse exons - ref_seq = seqs[names[ref_gene]] - ref_seq = ref_seq.replace('.', '') - ref_seq_len = len(ref_seq) - if base_fname == "hla": - exons = [] - for left, right in reversed(gene_exons[gene]): - left, right = ref_seq_len - right - 1, ref_seq_len - left - 1 - exons.append([left, right]) - gene_exons[gene] = exons - exon_counts = {} - for exon_i, count in gene_exon_counts[gene].items(): - exon_counts[len(gene_exons[gene]) - exon_i - 1] = count - gene_exon_counts[gene] = exon_counts - - for i in range(len(seqs)): - seqs[i] = typing_common.reverse_complement(seqs[i]) - backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True) - - if leftshift: - for seq_i in range(len(seqs)): - seqs[seq_i] = leftshift_deletions(backbone_seq, seqs[seq_i]) - backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True) - seq_len = find_seq_len(seqs) - - print >> sys.stderr, "%s: number of HLA alleles is %d." % (gene, len(names)) - - Vars = {} - for cmp_name, id in names.items(): - if cmp_name == backbone_name: - continue - assert id < len(seqs) - cmp_seq = seqs[id] - if len(cmp_seq) != seq_len: - print >> sys.stderr, "Warning: the length of %s (%d) is different from %d" % \ - (cmp_name, len(cmp_seq), seq_len) - continue - - # DK - debugging purposes - """ - if cmp_name == "A*03:01:07": - print cmp_name - cmp_seq2 = seqs[names["A*32:29"]] - for s in range(0, seq_len, 100): - print s, backbone_seq[s:s+100] - print s, cmp_seq2[s:s+100] - print s, cmp_seq[s:s+100] - # sys.exit(1) - """ - def insertVar(type, info): - pos, backbone_pos, data = info - if type in "MI": - varKey = "%d-%s-%s" % (pos, type, data) - else: - varKey = "%d-%s-%d" % (pos, type, data) - - if varKey not in Vars: - if type == 'M': - assert backbone_pos < backbone_freq - assert data in backbone_freq[backbone_pos] - freq = backbone_freq[backbone_pos][data] - elif type == 'D': - del_len = int(data) - freq = 100.0 - assert backbone_pos + del_len <= backbone_freq - for d in range(del_len): - assert '.' in backbone_freq[backbone_pos + d] - freq2 = backbone_freq[backbone_pos + d]['.'] - if freq2 < freq: - freq = freq2 - else: - assert type == 'I' - ins_len = len(data) - freq = 100.0 - assert backbone_pos + ins_len <= backbone_freq - for i in range(ins_len): - nt = data[i] - assert nt in backbone_freq[backbone_pos + i] - freq2 = backbone_freq[backbone_pos + i][nt] - if freq2 < freq: - freq = freq2 - assert freq <= min_var_freq - - Vars[varKey] = [freq, [cmp_name]] - else: - Vars[varKey][1].append(cmp_name) - - insertion, deletion = [], [] - ndots = 0 - for s in range(seq_len): - assert not (insertion and deletion) - bc = backbone_seq[s] - cc = cmp_seq[s] - if bc != '.' and cc != '.': - if insertion: - insertVar('I', insertion) - insertion = [] - elif deletion: - insertVar('D', deletion) - deletion = [] - if bc != cc: - mismatch = [s - ndots, s, cc] - insertVar('M', mismatch) - elif bc == '.' and cc != '.': - if deletion: - insertVar('D', deletion) - deletion = [] - if insertion: - insertion[2] += cc - else: - insertion = [s - ndots, s, cc] - elif bc != '.' and cc == '.': - if insertion: - insertVar('I', insertion) - insertion = [] - if deletion: - deletion[2] += 1 - else: - deletion = [s - ndots, s, 1] - - if bc == '.': - ndots += 1 - - """ - if backbone_seq[s] != cmp_seq[s]: - print "%s is different %s at %d: %s vs. %s" % \ - (backbone_name, cmp_name, s+1, backbone_seq[s], cmp_seq[s]) - """ - - if insertion: - insertVar('I', insertion) - elif deletion: - insertVar('D', deletion) - - - print >> sys.stderr, "Number of variants is %d." % (len(Vars.keys())) - - # Compare variants - def cmp_varKey(a, b): - a_locus, a_type, a_data = a.split('-') - b_locus, b_type, b_data = b.split('-') - a_locus, b_locus = int(a_locus), int(b_locus) - if a_locus != b_locus: - return a_locus - b_locus - if a_type != b_type: - if a_type == 'I': - return -1 - elif b_type == 'I': - return 1 - elif a_type == 'M': - return -1 - else: - assert b_type == 'M' - return 1 - assert a_data != b_data - if a_type in "MI": - if a_data < b_data: - return -1 - else: - return 1 - else: - assert a_type == 'D' - return int(a_data) - int(b_data) - - Vars_ = {} - for key, values in Vars.items(): - freq, names_ = values - for name in names_: - if not name in Vars_: - Vars_[name] = [key] - else: - Vars_[name].append(key) - for name, vars in Vars_.items(): - Vars_[name] = sorted(vars, cmp=cmp_varKey) - - # Sanity check - - # (1) Reconstruct the other sequences from the backbone sequence and variants and - # (2) Confirm these constructed sequences are the same as those input sequences. - for cmp_name, id in names.items(): - if cmp_name == backbone_name: - continue - - constr_seq = backbone_seq.replace('.', '') - constr_seq = list(constr_seq) - locus_diff = 0 - - if cmp_name not in Vars_: - continue - - for var in Vars_[cmp_name]: - try: - locus, type, data = var.split('-') - locus = int(locus) - except ValueError: - continue - - if type == 'M': - assert len(data) == 1 - constr_seq[locus + locus_diff] = data[0] - elif type == 'I': - assert locus + locus_diff >= 0 - assert locus + locus_diff <= len(constr_seq) - constr_seq = constr_seq[:locus + locus_diff] + list(data) + constr_seq[locus + locus_diff:] - locus_diff += len(data) - else: - assert type == 'D' - assert locus + locus_diff + len(data) <= len(constr_seq) - assert locus + locus_diff >= 0 - del_len = int(data) - constr_seq = constr_seq[:locus + locus_diff] + constr_seq[locus + locus_diff + del_len:] - locus_diff -= del_len - - constr_seq = "".join(constr_seq) - assert id < len(seqs) - cmp_seq = seqs[id].replace('.', '') - if len(constr_seq) != len(cmp_seq): - print >> sys.stderr, "Error: reconstruction fails (%s)! Lengths different: %d vs. %d" % \ - (cmp_name, len(constr_seq), len(cmp_seq)) - assert False - - # Sanity check - for s in range(len(constr_seq)): - if constr_seq[s] != cmp_seq[s]: - print >> sys.stderr, "Differ at %d: %s vs. %s (reconstruction vs. original)" % \ - (s, constr_seq[s], cmp_seq[s]) - print "%s:%s vs. %s:%s" % \ - (constr_seq[s-10:s], constr_seq[s:s+10], cmp_seq[s-10:s], cmp_seq[s:s+10]) - - if constr_seq != cmp_seq.replace('.', ''): - print >> sys.stderr, "Error: reconstruction fails for %s" % (cmp_name) - assert False - - # Write the backbone sequences into a fasta file - print >> backbone_file, ">%s" % (backbone_name) - backbone_seq_ = backbone_seq.replace('.', '') - for s in range(0, len(backbone_seq_), 60): - print >> backbone_file, backbone_seq_[s:s+60] - - # Remap the backbone allele, which is sometimes slighly different from - # fasta version - ref_backbone_id = names[ref_gene] - ref_backbone_seq = seqs[ref_backbone_id] - aligner_cmd = ["hisat2"] - if base_fname == "hla": - aligner_cmd += ["--score-min", "C,0"] - aligner_cmd += ["--no-unal", - "-x", "grch38/genome", - "-f", - "-c", "%s" % ref_backbone_seq.replace('.', '')] - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - best_chr, best_left, best_right, best_AS = "", 0, 0, -sys.maxint - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - allele_id, flag, chr, left, mapQ, cigar_str = cols[:6] - flag = int(flag) - assert flag & 0x10 == 0 - left = int(left) - 1 - right = left - AS = "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = col[5:] - AS = int(AS) - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MND": - right += length - if AS > best_AS: - best_chr, best_left, best_right, best_AS = chr, left, right, AS - - chr, left, right = best_chr, best_left, best_right - align_proc.communicate() - if left == right: - print >> sys.stderr, "Warning: %s (%s) is not remapped" % (gene, ref_gene) - continue - assert left < right - - base_locus = 0 - ref_seq = seqs[names[ref_gene]] - ref_seq_map = create_map(ref_seq) - - del_count = [] - for nt in backbone_seq: - assert nt in "ACGT." - add = 1 if nt == '.' else 0 - if len(del_count) == 0: - del_count.append(add) - else: - del_count.append(del_count[-1] + add) - - if base_fname == "hla": - exon_str = "" - for exon_i in range(len(gene_exons[gene])): - exon_left, exon_right = gene_exons[gene][exon_i] - exon_left, exon_right = ref_seq_map[exon_left], ref_seq_map[exon_right] - exon_left -= del_count[exon_left] - exon_right -= del_count[exon_right] - if exon_str != "": - exon_str += ',' - primary = gene_exon_counts[gene][exon_i] == max(gene_exon_counts[gene].values()) - exon_str += ("%d-%d%s" % (exon_left, exon_right, 'p' if primary else '')) - - # Sanity check for exonic sequence - sanity_check = True - if sanity_check and \ - os.path.exists("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene): - exons_ = [] - for exon in exon_str.split(','): - if exon.endswith('p'): - exon = exon[:-1] - exon_left, exon_right = exon.split('-') - exon_left, exon_right = int(exon_left), int(exon_right) - exons_.append([exon_left, exon_right]) - - backbone_seq_ = backbone_seq.replace('.', '') - if ref_gene in Vars_: - vars_ = Vars_[ref_gene] - else: - vars_ = [] - seq_ = list(backbone_seq_) - has_insertion = False - for var_ in vars_: - var_pos, var_type, var_data = var_.split('-') - var_pos = int(var_pos) - assert var_pos >= 0 and var_pos < len(backbone_seq_) - if var_type == 'M': - seq_[var_pos] = var_data - elif var_type == 'D': - del_len = int(var_data) - assert var_pos + del_len <= len(ref_seq) - seq_[var_pos:var_pos + del_len] = ['.'] * del_len - else: - assert var_type == 'I' - has_insertion = True - - seq_ = ''.join(seq_) - exon_seq_ = "" - for exon_left, exon_right in exons_: - exon_seq_ += seq_[exon_left:exon_right+1] - exon_seq_ = exon_seq_.replace('.', '') - if gene_strand[gene] == '-': - exon_seq_ = typing_common.reverse_complement(exon_seq_) - - cmp_exon_seq_, allele_name_ = "", "" - for line in open("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene): - if line.startswith(">"): - if allele_name_ == ref_gene: - break - allele_name_ = line.strip().split()[1] - cmp_exon_seq_ = "" - else: - cmp_exon_seq_ += line.strip() - """ - print "Has insertions:", has_insertion - print "constructed:", len(exon_seq_) - for p in range(0, len(exon_seq_), 60): - print exon_seq_[p:p+60] - print "true:", len(cmp_exon_seq_) - for p in range(0, len(cmp_exon_seq_), 60): - print cmp_exon_seq_[p:p+60] - """ - if exon_seq_ != cmp_exon_seq_: - print >> sys.stderr, "Waring: exonic sequences do not match (%s)" % gene - else: - exon_str = "%d-%d" % (left, right - 1) - - print >> locus_file, "%s\t%s\t%d\t%d\t%d\t%s\t%s" % \ - (backbone_name, chr, left, right - 1, len(backbone_seq.replace('.', '')), exon_str, gene_strand[gene]) - - # Write - # (1) variants w.r.t the backbone sequences into a SNP file - # (2) pairs of a variant and the corresponding HLA allels into a LINK file - keys = sorted(Vars.keys(), cmp=cmp_varKey) - var2ID = {} - for k in range(len(keys)): - locus, type, data = keys[k].split('-') - locus = int(locus) - if type == 'M': - type_str = "single" - elif type == 'I': - type_str = "insertion" - else: - assert type == 'D' - type_str = "deletion" - - freq, names_ = Vars[keys[k]] - names_ = sorted(names_) - varID = "hv%d" % (num_vars) - tmp_backbone_name = backbone_name - print >> var_file, "%s\t%s\t%s\t%d\t%s" % \ - (varID, type_str, tmp_backbone_name, base_locus + locus, data) - if freq >= min_var_freq: - print >> var_index_file, "%s\t%s\t%s\t%d\t%s" % \ - (varID, type_str, tmp_backbone_name, base_locus + locus, data) - print >> var_freq_file, "%s\t%.2f" % (varID, freq) - print >> link_file, "%s\t%s" % (varID, ' '.join(names_)) - var2ID[keys[k]] = num_vars - num_vars += 1 - - add_seq_len = 0 - # Write haplotypes - excluded_vars = set() - var_leftmost, var_rightmost = sys.maxint, -1 - for k in range(len(keys)): - key = keys[k] - if Vars[key][0] < min_var_freq: - excluded_vars.add(key) - - # Update leftmost and rightmost of Vars - locus, type, data = key.split('-') - left = right = int(locus) - if type == 'D': - right = left + int(data) - 1 - if k == 0: - var_leftmost = left - if var_rightmost < right: - var_rightmost = right - - i = 0 - while i < len(keys): - key_i = keys[i] - locus, type, data = key_i.split('-') - locus = int(locus) - if type == 'D': - locus += (int(data) - 1) - prev_locus = locus - if whole_haplotype: - j = len(keys) - else: - j = i + 1 - while j < len(keys): - key_j = keys[j] - locus2, type2, data2 = key_j.split('-') - locus2 = int(locus2) - if prev_locus + inter_gap < locus2: - break - prev_locus = locus2 - if type == 'D': - prev_locus += (int(data) - 1) - j += 1 - - alleles = set() - for k in range(i, j): - key_k = keys[k] - freq, names_ = Vars[key_k] - if freq < min_var_freq: - continue - add_alleles = set(names_) - alleles |= add_alleles - - haplotypes = set() - cur_vars = set(keys[i:j]) - excluded_vars - for allele in alleles: - allele_vars = set(Vars_[allele]) - excluded_vars - allele_cur_vars = '#'.join(sorted(list(cur_vars & allele_vars), cmp=cmp_varKey)) - haplotypes.add(allele_cur_vars) - - # Split some haplotypes that include large gaps inside - def split_haplotypes(haplotypes): - split_haplotypes = set() - for haplotype in haplotypes: - haplotype = haplotype.split('#') - assert len(haplotype) > 0 - if len(haplotype) == 1: - split_haplotypes.add(haplotype[0]) - continue - prev_s, s = 0, 1 - while s < len(haplotype): - prev_locus, prev_type, prev_data = haplotype[s-1].split('-') - locus, type, data = haplotype[s].split('-') - prev_locus, locus = int(prev_locus), int(locus) - if prev_type == 'D': - prev_locus += (int(prev_data) - 1) - if prev_locus + intra_gap < locus: - split_haplotypes.add('#'.join(haplotype[prev_s:s])) - prev_s = s - s += 1 - if s == len(haplotype): - split_haplotypes.add('#'.join(haplotype[prev_s:s])) - return split_haplotypes - - if not whole_haplotype: - haplotypes = split_haplotypes(haplotypes) - - def cmp_haplotype(a, b): - a = a.split('#') - a1_locus, _, _ = a[0].split('-') - a2_locus, a2_type, a2_data = a[-1].split('-') - a_begin, a_end = int(a1_locus), int(a2_locus) - if a2_type == 'D': - a_end += (int(a2_data) - 1) - b = b.split('#') - b1_locus, _, _ = b[0].split('-') - b2_locus, b2_type, b2_data = b[-1].split('-') - b_begin, b_end = int(b1_locus), int(b2_locus) - if b2_type == 'D': - b_end += (int(b2_data) - 1) - if a_begin != b_begin: - return a_begin - b_begin - return a_end - b_end - - haplotypes = sorted(list(haplotypes), cmp=cmp_haplotype) - - # DK - for debugging purposes - """ - dis = prev_locus - locus - print "\n[%d, %d]: %d haplotypes" % (i, j, len(haplotypes)), dis - if len(cur_vars) in range(0, 1000): - # print "vars:", sorted(list(cur_vars), cmp=cmp_varKey - print "num:", len(haplotypes) - for haplotype in haplotypes: - print haplotype.split('#') - print "\nnum:", len(haplotypes2) - for haplotype in haplotypes2: - print haplotype.split('#') - """ - - # Write haplotypes - sanity_vars = set() - for h_i in range(len(haplotypes)): - h = haplotypes[h_i].split('#') - varIDs = [] - for var in h: - varIDs.append("hv%s" % var2ID[var]) - # DK - for debugging purposes - # varIDs.append(var) - sanity_vars.add(var2ID[var]) - if whole_haplotype: - h_begin, h_end = var_leftmost, var_rightmost - else: - h1_locus, _, _ = h[0].split('-') - h2_locus, h2_type, h2_data = h[-1].split('-') - h_begin, h_end = int(h1_locus), int(h2_locus) - if h2_type == 'D': - h_end += (int(h2_data) - 1) - assert h_begin <= h_end - h_new_begin = h_begin - for h_j in reversed(range(0, h_i)): - hc = haplotypes[h_j].split('#') - hc_begin, hc_type, hc_data = hc[-1].split('-') - hc_begin = int(hc_begin) - hc_end = hc_begin - if hc_type == 'D': - hc_end += (int(hc_data) - 1) - if hc_end + inter_gap < h_begin: - break - if h_new_begin > hc_end: - h_new_begin = hc_end - assert h_new_begin <= h_begin - h_begin = h_new_begin - tmp_backbone_name = backbone_name - print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (num_haplotypes, tmp_backbone_name, base_locus + h_begin, base_locus + h_end, ','.join(varIDs)) - num_haplotypes += 1 - add_seq_len += (h_end - h_begin + 1) - assert len(sanity_vars) == len(cur_vars) - - i = j - - print >> sys.stderr, "Length of additional sequences for haplotypes:", add_seq_len - - # Write all the sequences with dots removed into a file - for name, ID in names.items(): - print >> input_file, ">%s" % (name) - assert ID < len(seqs) - seq = seqs[ID].replace('.', '') - for s in range(0, len(seq), 60): - print >> input_file, seq[s:s+60] - print >> allele_file, name - - - # Write partial allele names - for name in names: - if name not in full_allele_names: - print >> partial_file, name - - backbone_file.close() - locus_file.close() - var_file.close() - var_index_file.close() - var_freq_file.close() - haplotype_file.close() - link_file.close() - input_file.close() - allele_file.close() - partial_file.close() - - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Extract variants from multiple sequence alignments") - parser.add_argument("-b", "--base", - dest="base_fname", - type=str, - default="hla", - help="base filename for backbone sequence, variants, and linking info (Default: hla)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="A comma-separated list of gene names (default: empty, all genes)") - parser.add_argument("--inter-gap", - dest="inter_gap", - type=int, - default=30, - help="Maximum distance for variants to be in the same haplotype (default: 30)") - parser.add_argument("--intra-gap", - dest="intra_gap", - type=int, - default=50, - help="Break a haplotype into several haplotypes (default: 50)") - parser.add_argument("--whole-haplotype", - dest="whole_haplotype", - action="store_true", - help="Include partial alleles (e.g. A_nuc.fasta)") - parser.add_argument("--min-var-freq", - dest="min_var_freq", - type=float, - default=0.0, - help="Exclude variants whose freq is below than this value in percentage (Default: 0.0)") - parser.add_argument("--ext-seq", - dest="ext_seq_len", - type=int, - default=0, - help="Length of extra sequences flanking backbone sequences (Default: 0)") - parser.add_argument("--leftshift", - dest="leftshift", - action="store_true", - help="Shift deletions to the leftmost") - parser.add_argument("--no-partial", - dest="partial", - action="store_false", - help="Exclude partial alleles, exon-only sequences in HLA") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.locus_list == "": - locus_list = [] - else: - locus_list = args.locus_list.split(',') - if args.inter_gap > args.intra_gap: - print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap) - sys.exit(1) - - if args.base_fname.find('/') != -1: - elems = args.base_fname.split('/') - base_fname = elems[-1] - base_dname = '/'.join(elems[:-1]) - else: - base_fname = args.base_fname - base_dname = "" - - extract_vars(base_fname, - base_dname, - locus_list, - args.inter_gap, - args.intra_gap, - args.whole_haplotype, - args.min_var_freq, - args.ext_seq_len, - args.leftshift, - args.partial, - args.verbose) - diff --git a/hisatgenotype_hla_cyp.py b/hisatgenotype_hla_cyp.py deleted file mode 100755 index cd97eea9..00000000 --- a/hisatgenotype_hla_cyp.py +++ /dev/null @@ -1,1671 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2015, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import sys, os, subprocess, re -import inspect, random -import math -from argparse import ArgumentParser, FileType - -""" -""" -def simulate_reads(HLAs, - test_HLA_list, - simulate_interval): - HLA_reads_1, HLA_reads_2 = [], [] - for test_HLA_names in test_HLA_list: - gene = test_HLA_names[0].split('*')[0] - # ref_allele = refHLAs[gene] - # ref_seq = HLAs[gene][ref_allele] - - # Simulate reads from two HLA alleles - def simulate_reads_impl(seq, simulate_interval = 1, frag_len = 250, read_len = 100): - comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} - reads_1, reads_2 = [], [] - for i in range(0, len(seq) - frag_len + 1, simulate_interval): - reads_1.append(seq[i:i+read_len]) - tmp_read_2 = reversed(seq[i+frag_len-read_len:i+frag_len]) - read_2 = "" - for s in tmp_read_2: - if s in comp_table: - read_2 += comp_table[s] - else: - read_2 += s - reads_2.append(read_2) - return reads_1, reads_2 - - for test_HLA_name in test_HLA_names: - HLA_seq = HLAs[gene][test_HLA_name] - tmp_reads_1, tmp_reads_2 = simulate_reads_impl(HLA_seq, simulate_interval) - HLA_reads_1 += tmp_reads_1 - HLA_reads_2 += tmp_reads_2 - - # Write reads into a fasta read file - def write_reads(reads, idx): - read_file = open('hla_input_%d.fa' % idx, 'w') - for read_i in range(len(reads)): - print >> read_file, ">%d" % (read_i + 1) - print >> read_file, reads[read_i] - read_file.close() - write_reads(HLA_reads_1, 1) - write_reads(HLA_reads_2, 2) - - -""" -Align reads, and sort the alignments into a BAM file -""" -def align_reads(ex_path, - base_fname, - aligner, - index_type, - read_fname, - fastq, - threads, - verbose): - if aligner == "hisat2": - hisat2 = os.path.join(ex_path, "hisat2") - aligner_cmd = [hisat2, - "--no-unal", - "--mm"] - if index_type == "linear": - aligner_cmd += ["-k", "10"] - aligner_cmd += ["-x", "%s.%s" % (base_fname, index_type)] - elif aligner == "bowtie2": - aligner_cmd = [aligner, - "--no-unal", - "-k", "10", - "-x", base_fname] - else: - assert False - assert len(read_fname) in [1,2] - aligner_cmd += ["-p", str(threads)] - if not fastq: - aligner_cmd += ["-f"] - if len(read_fname) == 1: - aligner_cmd += ["-U", read_fname[0]] - else: - aligner_cmd += ["-1", "%s" % read_fname[0], - "-2", "%s" % read_fname[1]] - - if verbose: - print >> sys.stderr, ' '.join(aligner_cmd) - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sambam_cmd = ["samtools", - "view", - "-bS", - "-"] - sambam_proc = subprocess.Popen(sambam_cmd, - stdin=align_proc.stdout, - stdout=open("hla_input_unsorted.bam", 'w'), - stderr=open("/dev/null", 'w')) - sambam_proc.communicate() - if index_type == "graph": - bamsort_cmd = ["samtools", - "sort", - "hla_input_unsorted.bam", - "-o", "hla_input.bam"] - bamsort_proc = subprocess.Popen(bamsort_cmd, - stderr=open("/dev/null", 'w')) - bamsort_proc.communicate() - - bamindex_cmd = ["samtools", - "index", - "hla_input.bam"] - bamindex_proc = subprocess.Popen(bamindex_cmd, - stderr=open("/dev/null", 'w')) - bamindex_proc.communicate() - - os.system("rm hla_input_unsorted.bam") - else: - os.system("mv hla_input_unsorted.bam hla_input.bam") - - -""" -""" -def normalize(prob): - total = sum(prob.values()) - for allele, mass in prob.items(): - prob[allele] = mass / total - - -""" -""" -def prob_diff(prob1, prob2): - diff = 0.0 - for allele in prob1.keys(): - if allele in prob2: - diff += abs(prob1[allele] - prob2[allele]) - else: - diff += prob1[allele] - return diff - - -""" -""" -def HLA_prob_cmp(a, b): - if a[1] != b[1]: - if a[1] < b[1]: - return 1 - else: - return -1 - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - - -""" -""" -def single_abundance(HLA_cmpt, - HLA_length): - def normalize2(prob, length): - total = 0 - for allele, mass in prob.items(): - assert allele in length - total += (mass / length[allele]) - for allele, mass in prob.items(): - assert allele in length - prob[allele] = mass / length[allele] / total - - HLA_prob, HLA_prob_next = {}, {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - for allele in alleles: - if allele not in HLA_prob: - HLA_prob[allele] = 0.0 - HLA_prob[allele] += (float(count) / len(alleles)) - - # normalize2(HLA_prob, HLA_length) - normalize(HLA_prob) - def next_prob(HLA_cmpt, HLA_prob, HLA_length): - HLA_prob_next = {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - alleles_prob = 0.0 - for allele in alleles: - assert allele in HLA_prob - alleles_prob += HLA_prob[allele] - for allele in alleles: - if allele not in HLA_prob_next: - HLA_prob_next[allele] = 0.0 - HLA_prob_next[allele] += (float(count) * HLA_prob[allele] / alleles_prob) - # normalize2(HLA_prob_next, HLA_length) - normalize(HLA_prob_next) - return HLA_prob_next - - diff, iter = 1.0, 0 - while diff > 0.0001 and iter < 1000: - HLA_prob_next = next_prob(HLA_cmpt, HLA_prob, HLA_length) - diff = prob_diff(HLA_prob, HLA_prob_next) - HLA_prob = HLA_prob_next - iter += 1 - for allele, prob in HLA_prob.items(): - allele_len = HLA_length[allele] - HLA_prob[allele] /= float(allele_len) - normalize(HLA_prob) - HLA_prob = [[allele, prob] for allele, prob in HLA_prob.items()] - HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp) - return HLA_prob - - -""" -""" -def joint_abundance(HLA_cmpt, - HLA_length): - allele_names = set() - for cmpt in HLA_cmpt.keys(): - allele_names |= set(cmpt.split('-')) - - HLA_prob, HLA_prob_next = {}, {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - for allele1 in alleles: - for allele2 in allele_names: - if allele1 < allele2: - allele_pair = "%s-%s" % (allele1, allele2) - else: - allele_pair = "%s-%s" % (allele2, allele1) - if not allele_pair in HLA_prob: - HLA_prob[allele_pair] = 0.0 - HLA_prob[allele_pair] += (float(count) / len(alleles)) - - if len(HLA_prob) <= 0: - return HLA_prob - - # Choose top allele pairs - def choose_top_alleles(HLA_prob): - HLA_prob_list = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()] - HLA_prob_list = sorted(HLA_prob_list, cmp=HLA_prob_cmp) - HLA_prob = {} - best_prob = HLA_prob_list[0][1] - for i in range(len(HLA_prob_list)): - allele_pair, prob = HLA_prob_list[i] - if prob * 2 <= best_prob: - break - HLA_prob[allele_pair] = prob - normalize(HLA_prob) - return HLA_prob - HLA_prob = choose_top_alleles(HLA_prob) - - def next_prob(HLA_cmpt, HLA_prob): - HLA_prob_next = {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - prob = 0.0 - for allele in alleles: - for allele_pair in HLA_prob.keys(): - if allele in allele_pair: - prob += HLA_prob[allele_pair] - for allele in alleles: - for allele_pair in HLA_prob.keys(): - if not allele in allele_pair: - continue - if allele_pair not in HLA_prob_next: - HLA_prob_next[allele_pair] = 0.0 - HLA_prob_next[allele_pair] += (float(count) * HLA_prob[allele_pair] / prob) - normalize(HLA_prob_next) - return HLA_prob_next - - diff, iter = 1.0, 0 - while diff > 0.0001 and iter < 1000: - HLA_prob_next = next_prob(HLA_cmpt, HLA_prob) - diff = prob_diff(HLA_prob, HLA_prob_next) - HLA_prob = HLA_prob_next - HLA_prob = choose_top_alleles(HLA_prob) - iter += 1 - - HLA_prob = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()] - HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp) - return HLA_prob - - -""" -""" -def HLA_typing(ex_path, - base_fname, - simulation, - reference_type, - hla_list, - partial, - refHLAs, - HLAs, - HLA_names, - HLA_lengths, - refHLA_loci, - Vars, - Var_list, - Links, - exclude_allele_list, - aligners, - num_mismatch, - fastq, - read_fname, - alignment_fname, - threads, - enable_coverage, - best_alleles, - verbose): - - def lower_bound(Var_list, pos): - low, high = 0, len(Var_list) - while low < high: - m = (low + high) / 2 - m_pos = Var_list[m][0] - if m_pos < pos: - low = m + 1 - elif m_pos > pos: - high = m - else: - assert m_pos == pos - while m > 0: - if Var_list[m-1][0] < pos: - break - m -= 1 - return m - return low - - if simulation: - test_passed = {} - for aligner, index_type in aligners: - if index_type == "graph": - print >> sys.stderr, "\n\t\t%s %s on %s" % (aligner, index_type, reference_type) - else: - print >> sys.stderr, "\n\t\t%s %s" % (aligner, index_type) - - if alignment_fname == "": - # Align reads, and sort the alignments into a BAM file - align_reads(ex_path, - base_fname, - aligner, - index_type, - read_fname, - fastq, - threads, - verbose) - - for test_HLA_names in hla_list: - if simulation: - gene = test_HLA_names[0].split('*')[0] - else: - gene = test_HLA_names - - ref_allele = refHLAs[gene] - ref_seq = HLAs[gene][ref_allele] - ref_exons = refHLA_loci[gene][-1] - - # Read alignments - alignview_cmd = ["samtools", - "view"] - if alignment_fname == "": - alignview_cmd += ["hla_input.bam"] - else: - if not os.path.exists(alignment_fname + ".bai"): - os.system("samtools index %s" % alignment_fname) - alignview_cmd += [alignment_fname] - base_locus = 0 - if index_type == "graph": - if reference_type == "gene": - alignview_cmd += ["%s" % ref_allele] - else: - assert reference_type in ["chromosome", "genome"] - _, chr, left, right, _ = refHLA_loci[gene] - base_locus = left - alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)] - - bamview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sort_read_cmd = ["sort", "-k", "1", "-n"] - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - else: - alignview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - # Count alleles - HLA_counts, HLA_cmpt = {}, {} - coverage = [0 for i in range(len(ref_seq) + 1)] - num_reads, total_read_len = 0, 0 - prev_read_id = None - prev_exon = False - if index_type == "graph": - # Cigar regular expression - cigar_re = re.compile('\d+\w') - for line in alignview_proc.stdout: - cols = line.strip().split() - read_id, flag, chr, pos, mapQ, cigar_str = cols[:6] - read_seq, qual = cols[9], cols[10] - num_reads += 1 - total_read_len += len(read_seq) - flag, pos = int(flag), int(pos) - pos -= (base_locus + 1) - if pos < 0: - continue - - if flag & 0x4 != 0: - continue - - NM, Zs, MD = "", "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("Zs"): - Zs = col[5:] - elif col.startswith("MD"): - MD = col[5:] - elif col.startswith("NM"): - NM = int(col[5:]) - - if NM > num_mismatch: - continue - - # daehwan - for debugging purposes - debug = False - if read_id in ["2339"] and False: - debug = True - print "read_id: %s)" % read_id, pos, cigar_str, "NM:", NM, MD, Zs - print " ", read_seq - - vars = [] - if Zs: - vars = Zs.split(',') - - assert MD != "" - MD_str_pos, MD_len = 0, 0 - read_pos, left_pos = 0, pos - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - cmp_list = [] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op == 'M': - # Update coverage - if enable_coverage: - if right_pos + length < len(coverage): - coverage[right_pos] += 1 - coverage[right_pos + length] -= 1 - elif right_pos < len(coverage): - coverage[right_pos] += 1 - coverage[-1] -= 1 - - first = True - MD_len_used = 0 - while True: - if not first or MD_len == 0: - if MD[MD_str_pos].isdigit(): - num = int(MD[MD_str_pos]) - MD_str_pos += 1 - while MD_str_pos < len(MD): - if MD[MD_str_pos].isdigit(): - num = num * 10 + int(MD[MD_str_pos]) - MD_str_pos += 1 - else: - break - MD_len += num - # Insertion or full match followed - if MD_len >= length: - MD_len -= length - cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used]) - break - first = False - read_base = read_seq[read_pos + MD_len] - MD_ref_base = MD[MD_str_pos] - MD_str_pos += 1 - assert MD_ref_base in "ACGT" - cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used]) - cmp_list.append(["mismatch", right_pos + MD_len, 1]) - MD_len_used = MD_len + 1 - MD_len += 1 - # Full match - if MD_len == length: - MD_len = 0 - break - elif cigar_op == 'I': - cmp_list.append(["insertion", right_pos, length]) - elif cigar_op == 'D': - if MD[MD_str_pos] == '0': - MD_str_pos += 1 - assert MD[MD_str_pos] == '^' - MD_str_pos += 1 - while MD_str_pos < len(MD): - if not MD[MD_str_pos] in "ACGT": - break - MD_str_pos += 1 - cmp_list.append(["deletion", right_pos, length]) - elif cigar_op == 'S': - cmp_list.append(["soft", right_pos, length]) - else: - assert cigar_op == 'N' - cmp_list.append(["intron", right_pos, length]) - - if cigar_op in "MND": - right_pos += length - - if cigar_op in "MIS": - read_pos += length - - exon = False - for exon in ref_exons: - exon_left, exon_right = exon - if right_pos <= exon_left or pos > exon_right: - continue - else: - exon = True - break - - if right_pos > len(ref_seq): - continue - - def add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, exon = True): - max_count = max(HLA_count_per_read.values()) - cur_cmpt = set() - for allele, count in HLA_count_per_read.items(): - if count < max_count: - continue - if allele in exclude_allele_list: - continue - cur_cmpt.add(allele) - if not allele in HLA_counts: - HLA_counts[allele] = 1 - else: - HLA_counts[allele] += 1 - - if len(cur_cmpt) == 0: - return - - # daehwan - for debugging purposes - alleles = ["", ""] - # alleles = ["B*40:304", "B*40:02:01"] - allele1_found, allele2_found = False, False - for allele, count in HLA_count_per_read.items(): - if count < max_count: - continue - if allele == alleles[0]: - allele1_found = True - elif allele == alleles[1]: - allele2_found = True - if allele1_found != allele2_found: - print alleles[0], HLA_count_per_read[alleles[0]] - print alleles[1], HLA_count_per_read[alleles[1]] - if allele1_found: - print ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, HLA_count_per_read[alleles[1]])) - else: - print ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, HLA_count_per_read[alleles[0]])) - print read_seq - - cur_cmpt = sorted(list(cur_cmpt)) - cur_cmpt = '-'.join(cur_cmpt) - add = 1 - if partial and not exon: - add *= 0.2 - if not cur_cmpt in HLA_cmpt: - HLA_cmpt[cur_cmpt] = add - else: - HLA_cmpt[cur_cmpt] += add - - if read_id != prev_read_id: - if prev_read_id != None: - add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, prev_exon) - - HLA_count_per_read = {} - for HLA_name in HLA_names[gene]: - if HLA_name.find("BACKBONE") != -1: - continue - HLA_count_per_read[HLA_name] = 0 - - def add_count(var_id, add): - assert var_id in Links - alleles = Links[var_id] - for allele in alleles: - if allele.find("BACKBONE") != -1: - continue - HLA_count_per_read[allele] += add - # daehwan - for debugging purposes - if debug: - if allele in ["DQA1*05:05:01:01", "DQA1*05:05:01:02"]: - print allele, add, var_id - - # Decide which allele(s) a read most likely came from - # also sanity check - read length, cigar string, and MD string - for var_id, data in Vars[gene].items(): - var_type, var_pos, var_data = data - if var_type != "deletion": - continue - if left_pos >= var_pos and right_pos <= var_pos + int(var_data): - add_count(var_id, -1) - ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", "" - cigar_match_len, MD_match_len = 0, 0 - for cmp in cmp_list: - type = cmp[0] - length = cmp[2] - if type == "match": - var_idx = lower_bound(Var_list[gene], ref_pos) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if ref_pos + length <= var_pos: - break - if ref_pos <= var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "insertion": - if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data): - add_count(var_id, -1) - # daehwan - for debugging purposes - if debug: - print cmp, var_id, Links[var_id] - elif var_type == "deletion": - del_len = int(var_data) - if ref_pos < var_pos and ref_pos + length > var_pos + del_len: - # daehwan - for debugging purposes - if debug: - print cmp, var_id, Links[var_id], -1, Vars[gene][var_id] - # Check if this might be one of the two tandem repeats (the same left coordinate) - cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2] - test1_seq1 = ref_seq[cmp_left:cmp_right] - test1_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos + del_len:cmp_right + del_len] - # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02) - cmp_left -= read_pos - cmp_right += (len(read_seq) - read_pos - cmp[2]) - test2_seq1 = ref_seq[cmp_left+int(var_data):cmp_right] - test2_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos+int(var_data):cmp_right] - if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2: - add_count(var_id, -1) - else: - if debug: - print cmp, var_id, Links[var_id], -1 - add_count(var_id, -1) - var_idx += 1 - - read_pos += length - ref_pos += length - cigar_match_len += length - MD_match_len += length - elif type == "mismatch": - read_base = read_seq[read_pos] - var_idx = lower_bound(Var_list[gene], ref_pos) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if ref_pos < var_pos: - break - if ref_pos == var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "single": - if var_data == read_base: - # daehwan - for debugging purposes - if debug: - print cmp, var_id, 1, var_data, read_base, Links[var_id] - - # daehwan - for debugging purposes - if False: - read_qual = ord(qual[read_pos]) - add_count(var_id, (read_qual - 60) / 60.0) - else: - add_count(var_id, 1) - # daehwan - check out if this routine is appropriate - # else: - # add_count(var_id, -1) - var_idx += 1 - - cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos])) - MD_match_len = 0 - cigar_match_len += 1 - read_pos += 1 - ref_pos += 1 - elif type == "insertion": - ins_seq = read_seq[read_pos:read_pos+length] - var_idx = lower_bound(Var_list[gene], ref_pos) - # daehwan - for debugging purposes - if debug: - print left_pos, cigar_str, MD, vars - print ref_pos, ins_seq, Var_list[gene][var_idx], Vars[gene][Var_list[gene][var_idx][1]] - # sys.exit(1) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if ref_pos < var_pos: - break - if ref_pos == var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "insertion": - if var_data == ins_seq: - # daehwan - for debugging purposes - if debug: - print cmp, var_id, 1, Links[var_id] - add_count(var_id, 1) - var_idx += 1 - - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - read_pos += length - cmp_cigar_str += ("%dI" % length) - elif type == "deletion": - del_len = length - # Deletions can be shifted bidirectionally - temp_ref_pos = ref_pos - while temp_ref_pos > 0: - last_bp = ref_seq[temp_ref_pos + del_len - 1] - prev_bp = ref_seq[temp_ref_pos - 1] - if last_bp != prev_bp: - break - temp_ref_pos -= 1 - var_idx = lower_bound(Var_list[gene], temp_ref_pos) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if temp_ref_pos < var_pos: - first_bp = ref_seq[temp_ref_pos] - next_bp = ref_seq[temp_ref_pos + del_len] - if first_bp == next_bp: - temp_ref_pos += 1 - continue - else: - break - if temp_ref_pos == var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "deletion": - var_len = int(var_data) - if var_len == length: - if debug: - print cmp, var_id, 1, Links[var_id] - print ref_seq[var_pos - 10:var_pos], ref_seq[var_pos:var_pos+int(var_data)], ref_seq[var_pos+int(var_data):var_pos+int(var_data)+10] - add_count(var_id, 1) - var_idx += 1 - - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - cmp_MD += ("%d" % MD_match_len) - MD_match_len = 0 - cmp_cigar_str += ("%dD" % length) - cmp_MD += ("^%s" % ref_seq[ref_pos:ref_pos+length]) - ref_pos += length - elif type == "soft": - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - read_pos += length - cmp_cigar_str += ("%dS" % length) - else: - assert type == "intron" - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - cmp_cigar_str += ("%dN" % length) - ref_pos += length - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cmp_MD += ("%d" % MD_match_len) - if read_pos != len(read_seq) or \ - cmp_cigar_str != cigar_str or \ - cmp_MD != MD: - print >> sys.stderr, "Error:", cigar_str, MD - print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD - print >> sys.stderr, "\tcmp list:", cmp_list - assert False - - prev_read_id = read_id - prev_exon = exon - - if num_reads <= 0: - continue - - if prev_read_id != None: - add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read) - - # Coverage - # it is not used by the default - if enable_coverage: - assert num_reads > 0 - read_len = int(total_read_len / float(num_reads)) - coverage_sum = 0 - for i in range(len(coverage)): - if i > 0: - coverage[i] += coverage[i-1] - coverage_sum += coverage[i] - coverage_avg = coverage_sum / float(len(coverage)) - assert len(ref_seq) < len(coverage) - for i in range(len(ref_seq)): - coverage_threshold = 1.0 * coverage_avg - if i < read_len: - coverage_threshold *= ((i+1) / float(read_len)) - elif i + read_len > len(ref_seq): - coverage_threshold *= ((len(ref_seq) - i) / float(read_len)) - if coverage[i] >= coverage_threshold: - continue - pseudo_num_reads = (coverage_threshold - coverage[i]) / read_len - var_idx = lower_bound(Var_list[gene], i + 1) - if var_idx >= len(Var_list[gene]): - var_idx = len(Var_list[gene]) - 1 - cur_cmpt = set() - while var_idx >= 0: - var_pos, var_id = Var_list[gene][var_idx] - var_type, _, var_data = Vars[gene][var_id] - if var_type == "deletion": - del_len = int(var_data) - if i < var_pos: - break - if i + read_len < var_pos + int(var_data): - assert var_id in Links - cur_cmpt = cur_cmpt.union(set(Links[var_id])) - var_idx -= 1 - if cur_cmpt: - cur_cmpt = '-'.join(list(cur_cmpt)) - if not cur_cmpt in HLA_cmpt: - HLA_cmpt[cur_cmpt] = 0 - HLA_cmpt[cur_cmpt] += pseudo_num_reads - else: - assert index_type == "linear" - def add_alleles(alleles): - if not allele in HLA_counts: - HLA_counts[allele] = 1 - else: - HLA_counts[allele] += 1 - - cur_cmpt = sorted(list(alleles)) - cur_cmpt = '-'.join(cur_cmpt) - if not cur_cmpt in HLA_cmpt: - HLA_cmpt[cur_cmpt] = 1 - else: - HLA_cmpt[cur_cmpt] += 1 - - prev_read_id, prev_AS = None, None - alleles = set() - for line in alignview_proc.stdout: - cols = line[:-1].split() - read_id, flag, allele = cols[:3] - flag = int(flag) - if flag & 0x4 != 0: - continue - if not allele.startswith(gene): - continue - if allele.find("BACKBONE") != -1: - continue - - AS = None - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - assert AS != None - if read_id != prev_read_id: - if alleles: - if aligner == "hisat2" or \ - (aligner == "bowtie2" and len(alleles) < 10): - add_alleles(alleles) - alleles = set() - prev_AS = None - if prev_AS != None and AS < prev_AS: - continue - prev_read_id = read_id - prev_AS = AS - alleles.add(allele) - - if alleles: - add_alleles(alleles) - - HLA_counts = [[allele, count] for allele, count in HLA_counts.items()] - def HLA_count_cmp(a, b): - if a[1] != b[1]: - return b[1] - a[1] - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - HLA_counts = sorted(HLA_counts, cmp=HLA_count_cmp) - for count_i in range(len(HLA_counts)): - count = HLA_counts[count_i] - if simulation: - found = False - for test_HLA_name in test_HLA_names: - if count[0] == test_HLA_name: - print >> sys.stderr, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_HLA_name, count[1]) - found = True - """ - if count_i > 0 and HLA_counts[0][1] > count[1]: - print >> sys.stderr, "Warning: %s ranked first (count: %d)" % (HLA_counts[0][0], HLA_counts[0][1]) - assert False - else: - test_passed += 1 - """ - if count_i < 5 and not found: - print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - else: - print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - if count_i >= 9: - break - print >> sys.stderr - - HLA_prob = single_abundance(HLA_cmpt, HLA_lengths[gene]) - - success = [False for i in range(len(test_HLA_names))] - found_list = [False for i in range(len(test_HLA_names))] - for prob_i in range(len(HLA_prob)): - prob = HLA_prob[prob_i] - found = False - if simulation: - for name_i in range(len(test_HLA_names)): - test_HLA_name = test_HLA_names[name_i] - if prob[0] == test_HLA_name: - rank_i = prob_i - while rank_i > 0: - if prob == HLA_prob[rank_i - 1][1]: - rank_i -= 1 - else: - break - print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_HLA_name, prob[1] * 100.0) - if rank_i < len(success): - success[rank_i] = True - found_list[name_i] = True - found = True - if not False in found_list: - break - if not found: - print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, prob[0], prob[1] * 100.0) - if best_alleles and prob_i < 2: - print >> sys.stdout, "SingleModel %s (abundance: %.2f%%)" % (prob[0], prob[1] * 100.0) - if not simulation and prob_i >= 9: - break - print >> sys.stderr - - if len(test_HLA_names) == 2 or not simulation: - HLA_prob = joint_abundance(HLA_cmpt, HLA_lengths[gene]) - if len(HLA_prob) <= 0: - continue - success = [False] - for prob_i in range(len(HLA_prob)): - allele_pair, prob = HLA_prob[prob_i] - allele1, allele2 = allele_pair.split('-') - if best_alleles and prob_i < 1: - print >> sys.stdout, "PairModel %s (abundance: %.2f%%)" % (allele_pair, prob * 100.0) - if simulation: - if allele1 in test_HLA_names and allele2 in test_HLA_names: - rank_i = prob_i - while rank_i > 0: - if HLA_prob[rank_i-1][1] == prob: - rank_i -= 1 - else: - break - print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, allele_pair, prob * 100.0) - if rank_i == 0: - success[0] = True - break - print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, allele_pair, prob * 100.0) - if not simulation and prob_i >= 9: - break - print >> sys.stderr - - # Li's method - """ - li_hla = os.path.join(ex_path, "li_hla/hla") - if os.path.exists(li_hla): - li_hla_cmd = [li_hla, - "hla", - "hla_input.bam", - "-b", "%s*BACKBONE" % gene] - li_hla_proc = subprocess.Popen(li_hla_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - # read in the result of Li's hla - for line in li_hla_proc.stdout: - allele1, allele2, score = line.strip().split() - score = float(score) - if simulation: - if allele1 in test_HLA_names and allele2 in test_HLA_names: - print >> sys.stderr, "\t\t\t*** 1 ranked %s-%s (score: %.2f)" % (allele1, allele2, score) - success[0] = True - else: - print >> sys.stderr, "\t\t\tLiModel fails" - if best_alleles: - print >> sys.stdout, "LiModel %s-%s (score: %.2f)" % (allele1, allele2, score) - li_hla_proc.communicate() - """ - - if simulation and not False in success: - aligner_type = "%s %s" % (aligner, index_type) - if not aligner_type in test_passed: - test_passed[aligner_type] = 1 - else: - test_passed[aligner_type] += 1 - - if simulation: - return test_passed - - -def read_HLA_alleles(fname, HLAs): - for line in open(fname): - if line.startswith(">"): - HLA_name = line.strip().split()[0][1:] - HLA_gene = HLA_name.split('*')[0] - if not HLA_gene in HLAs: - HLAs[HLA_gene] = {} - if not HLA_name in HLAs[HLA_gene]: - HLAs[HLA_gene][HLA_name] = "" - else: - HLAs[HLA_gene][HLA_name] += line.strip() - return HLAs - -""" -""" -def genotyping(base_fname, - reference_type, - hla_list, - partial, - aligners, - read_fname, - alignment_fname, - threads, - simulate_interval, - enable_coverage, - best_alleles, - exclude_allele_list, - default_allele_list, - num_mismatch, - verbose, - daehwan_debug): - # Current script directory - curr_script = os.path.realpath(inspect.getsourcefile(genotyping)) - ex_path = os.path.dirname(curr_script) - - # Clone a git repository, IMGTHLA - if not os.path.exists("IMGTHLA"): - os.system("git clone https://github.com/jrob119/IMGTHLA.git") - - # Clone hisat2 genotype database, hisat_genotype_db - """ - if not os.path.exists("hisat_genotype_db"): - os.system("git clone https://github.com/infphilo/hisat_genotype_db.git") - """ - - simulation = (read_fname == [] and alignment_fname == "") - - def check_files(fnames): - for fname in fnames: - if not os.path.exists(fname): - return False - return True - - # Download HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not check_files(HISAT2_fnames): - os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz") - hisat2_inspect = os.path.join(ex_path, "hisat2-inspect") - os.system("%s grch38/genome > genome.fa" % hisat2_inspect) - os.system("samtools faidx genome.fa") - - # Check if the pre-existing files (hla*) are compatible with the current parameter setting - if os.path.exists("%s.ref" % base_fname): - left = 0 - HLA_genes = set() - BACKBONE = False - for line in open("%s.ref" % base_fname): - HLA_name = line.strip().split()[0] - if HLA_name.find("BACKBONE") != -1: - BACKBONE = True - HLA_gene = HLA_name.split('*')[0] - HLA_genes.add(HLA_gene) - delete_hla_files = False - if reference_type == "gene": - if not BACKBONE: - delete_hla_files = True - elif reference_type in ["chromosome", "genome"]: - if BACKBONE: - delete_hla_files = True - else: - assert False - if not set(hla_list).issubset(HLA_genes): - delete_hla_files = True - if base_fname == "hla": - if delete_hla_files: - os.system("rm %s*" % base_fname) - - # Extract HLA variants, backbone sequence, and other sequeces - HLA_fnames = [base_fname+"_backbone.fa", - base_fname+"_sequences.fa", - base_fname+".ref", - base_fname+".snp", - base_fname+".haplotype", - base_fname+".link", - base_fname+"_alleles_excluded.txt"] - - # Check if excluded alleles in current files match - excluded_alleles_match = False - if(os.path.exists(HLA_fnames[6])): - afile = open(HLA_fnames[6],'r') - afile.readline() - lines = afile.read().split() - excluded_alleles_match = (set(exclude_allele_list) == set(lines)) - afile.close() - elif len(exclude_allele_list) == 0: - excluded_alleles_match = True - try: - temp_name = HLA_fnames[6] - HLA_fnames.remove(HLA_fnames[6]) - os.remove(temp_name) - except OSError: - pass - - if not excluded_alleles_match: - print("Creating Allele Exclusion File.\n") - afile = open(HLA_fnames[6],'w') - afile.write("Alleles excluded:\n") - afile.write("\n".join(exclude_allele_list)) - afile.close() - - if (not check_files(HLA_fnames)) or (not excluded_alleles_match) : - extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py") - extract_cmd = [extract_hla_script, - "--base", base_fname, - "--reference-type", reference_type] - - if base_fname == "hla": - extract_cmd += ["--hla-list", ','.join(hla_list)] - - if len(exclude_allele_list) > 0: - print exclude_allele_list - extract_cmd += ["--exclude-allele-list", ",".join(exclude_allele_list)] - - if len(base_fname) > 3: - extract_cmd += ["--base", base_fname] - - if partial: - extract_cmd += ["--partial"] - extract_cmd += ["--inter-gap", "30", - "--intra-gap", "50"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - - if not check_files(HLA_fnames): - print >> sys.stderr, "Error: extract_HLA_vars failed!" - sys.exit(1) - - for aligner, index_type in aligners: - # Build HISAT2 graph indexes based on the above information - if aligner == "hisat2" and index_type == "graph": - HLA_hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base_fname, i+1) for i in range(8)] - if not check_files(HLA_hisat2_graph_index_fnames) or (not excluded_alleles_match): - hisat2_build = os.path.join(ex_path, "hisat2-build") - build_cmd = [hisat2_build, - "-p", str(threads), - "--snp", HLA_fnames[3], - "--haplotype", HLA_fnames[4] , - HLA_fnames[0], - "%s.graph" % base_fname] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(HLA_hisat2_graph_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed! Perhaps, you may have forgotten to build hisat2 executables?" - sys.exit(1) - - # Build HISAT2 linear indexes based on the above information - elif aligner == "hisat2" and index_type == "linear": - HLA_hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base_fname, i+1) for i in range(8)] - if reference_type == "gene" and (not check_files(HLA_hisat2_linear_index_fnames) or (not excluded_alleles_match)): - hisat2_build = os.path.join(ex_path, "hisat2-build") - build_cmd = [hisat2_build, - "%s,%s"%(HLA_fnames[0],HLA_fnames[1]), - "%s.linear" % base_fname] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(HLA_hisat2_linear_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - - # Build Bowtie2 indexes based on the above information - else: - assert aligner == "bowtie2" and index_type == "linear" - HLA_bowtie2_index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)] - HLA_bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)] - if reference_type == "gene" and (not check_files(HLA_bowtie2_index_fnames) or (not excluded_alleles_match)): - build_cmd = ["bowtie2-build", - "%s,%s"%(HLA_fnames[0],HLA_fnames[1]), - base_fname] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w')) - proc.communicate() - if not check_files(HLA_bowtie2_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - - # Read partial alleles from hla.data (temporary) - partial_alleles = set() - if base_fname == "hla": - for line in open("IMGTHLA/hla.dat"): - if not line.startswith("DE"): - continue - allele_name = line.split()[1][4:-1] - gene = allele_name.split('*')[0] - if line.find("partial") != -1: - partial_alleles.add(allele_name) - - if len(default_allele_list) != 0: - #print os.getcwd() - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - #current_path = os.getcwd() - try: - os.mkdir("./Default-HLA") - except: - pass - #os.chdir(current_path + "/Default-HLA") - - extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py") - extract_cmd = [extract_hla_script, - "--reference-type", reference_type, - "--hla-list", ','.join(hla_list), - "--base", "./Default-HLA/hla"] - - if partial: - extract_cmd += ["--partial"] - extract_cmd += ["--inter-gap", "30", - "--intra-gap", "50"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!" - sys.exit(1) - - # Read HLA alleles (names and sequences) - refHLAs, refHLA_loci = {}, {} - for line in open("%s.ref" % base_fname): - HLA_name, chr, left, right, length, exon_str = line.strip().split() - HLA_gene = HLA_name.split('*')[0] - assert not HLA_gene in refHLAs - refHLAs[HLA_gene] = HLA_name - left, right = int(left), int(right) - exons = [] - for exon in exon_str.split(','): - exon_left, exon_right = exon.split('-') - exons.append([int(exon_left), int(exon_right)]) - refHLA_loci[HLA_gene] = [HLA_name, chr, left, right, exons] - - HLAs = {} - if reference_type == "gene": - read_HLA_alleles(HLA_fnames[0], HLAs) - read_HLA_alleles(HLA_fnames[1], HLAs) - - # HLA gene alleles - HLA_names = {} - for HLA_gene, data in HLAs.items(): - HLA_names[HLA_gene] = list(data.keys()) - - # HLA gene allele lengths - HLA_lengths = {} - for HLA_gene, HLA_alleles in HLAs.items(): - HLA_lengths[HLA_gene] = {} - for allele_name, seq in HLA_alleles.items(): - HLA_lengths[HLA_gene][allele_name] = len(seq) - - # Construct excluded alleles (Via default backbone data) - custom_allele_check = False - if len(default_allele_list) > 0: - custom_allele_check = True - HLAs_default = {} - read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default) - read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default) - - for HLA_gene, HLA_alleles in HLAs_default.items(): - for allele_name, seq in HLA_alleles.items(): - if allele_name in default_allele_list: - HLA_lengths[HLA_gene][allele_name] = len(seq) - - # Read HLA variants, and link information - Vars, Var_list = {}, {} - for line in open("%s.snp" % base_fname): - var_id, var_type, allele, pos, data = line.strip().split('\t') - pos = int(pos) - if reference_type != "gene": - allele, dist = None, 0 - for tmp_gene, values in refHLA_loci.items(): - allele_name, chr, left, right, exons = values - if allele == None: - allele = allele_name - dist = abs(pos - left) - else: - if dist > abs(pos - left): - allele = allele_name - dist = abs(pos - left) - - gene = allele.split('*')[0] - if not gene in Vars: - Vars[gene] = {} - assert not gene in Var_list - Var_list[gene] = [] - - assert not var_id in Vars[gene] - left = 0 - if reference_type != "gene": - _, _, left, _, _ = refHLA_loci[gene] - Vars[gene][var_id] = [var_type, pos - left, data] - Var_list[gene].append([pos - left, var_id]) - - for gene, in_var_list in Var_list.items(): - Var_list[gene] = sorted(in_var_list) - - Links = {} - for line in open("%s.link" % base_fname): - var_id, alleles = line.strip().split('\t') - alleles = alleles.split() - assert not var_id in Links - Links[var_id] = alleles - - # Test HLA typing - test_list = [] - if simulation: - basic_test, pair_test = True, False - if daehwan_debug: - if "basic_test" in daehwan_debug: - basic_test, pair_test = True, False - else: - basic_test, pair_test = False, True - - test_passed = {} - test_list = [] - if base_fname == "hla": - genes = list(set(hla_list) & set(HLA_names.keys())) - else: - genes = HLA_names.keys() - - if basic_test: - for gene in genes: - HLA_gene_alleles = HLA_names[gene] - for HLA_name in HLA_gene_alleles: - if HLA_name.find("BACKBONE") != -1: - continue - test_list.append([[HLA_name]]) - if pair_test: - test_size = 500 - allele_count = 2 - for test_i in range(test_size): - test_pairs = [] - for gene in genes: - HLA_gene_alleles = [] - for allele in HLA_names[gene]: - if allele.find("BACKBONE") != -1: - continue - HLA_gene_alleles.append(allele) - - # DK - temporary - if len(HLA_gene_alleles) < 2: - continue - - nums = [i for i in range(len(HLA_gene_alleles))] - random.shuffle(nums) - test_pairs.append(sorted([HLA_gene_alleles[nums[i]] for i in range(allele_count)])) - test_list.append(test_pairs) - - for test_i in range(len(test_list)): - if "test_id" in daehwan_debug: - daehwan_test_ids = daehwan_debug["test_id"].split('-') - if str(test_i + 1) not in daehwan_test_ids: - continue - - print >> sys.stderr, "Test %d" % (test_i + 1) - test_HLA_list = test_list[test_i] - - # daehwan - for debugging purposes - # test_HLA_list = [["A*11:50Q", "A*11:01:01:01", "A*01:01:01:01"]] - for test_HLA_names in test_HLA_list: - for test_HLA_name in test_HLA_names: - if custom_allele_check: - gene = test_HLA_name.split('*')[0] - test_HLA_seq = HLAs_default[gene][test_HLA_name] - seq_type = "partial" if test_HLA_name in partial_alleles else "full" - print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type) - continue - gene = test_HLA_name.split('*')[0] - test_HLA_seq = HLAs[gene][test_HLA_name] - seq_type = "partial" if test_HLA_name in partial_alleles else "full" - print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type) - if custom_allele_check: - simulate_reads(HLAs_default, test_HLA_list, simulate_interval) - else: - simulate_reads(HLAs, test_HLA_list, simulate_interval) - - if "test_id" in daehwan_debug: - read_fname = ["hla_input_1.fa"] - else: - read_fname = ["hla_input_1.fa", "hla_input_2.fa"] - - fastq = False - - tmp_test_passed = HLA_typing(ex_path, - base_fname, - simulation, - reference_type, - test_HLA_list, - partial, - refHLAs, - HLAs, - HLA_names, - HLA_lengths, - refHLA_loci, - Vars, - Var_list, - Links, - exclude_allele_list, - aligners, - num_mismatch, - fastq, - read_fname, - alignment_fname, - threads, - enable_coverage, - best_alleles, - verbose) - - for aligner_type, passed in tmp_test_passed.items(): - if aligner_type in test_passed: - test_passed[aligner_type] += passed - else: - test_passed[aligner_type] = passed - - print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1))) - - - for aligner_type, passed in test_passed.items(): - print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list)) - - else: # With real reads or BAMs - if base_fname == "hla": - gene_list = hla_list - else: - gene_list = Vars.keys() - print >> sys.stderr, "\t", ' '.join(gene_list) - - fastq = True - HLA_typing(ex_path, - base_fname, - simulation, - reference_type, - gene_list, - partial, - refHLAs, - HLAs, - HLA_names, - HLA_lengths, - refHLA_loci, - Vars, - Var_list, - Links, - exclude_allele_list, - aligners, - num_mismatch, - fastq, - read_fname, - alignment_fname, - threads, - enable_coverage, - best_alleles, - verbose) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='genotyping') - parser.add_argument("--base", - dest="base_fname", - type=str, - default="hla", - help="base filename for backbone HLA sequence, HLA variants, and HLA linking info") - parser.add_argument("--default-list", - dest = "default_allele_list", - type=str, - default="", - help="A comma-separated list of HLA alleles to be tested. Alleles are retrieved from default backbone data (all alleles included in backbone).") - parser.add_argument("--reference-type", - dest="reference_type", - type=str, - default="gene", - help="Reference type: gene, chromosome, and genome (default: gene)") - parser.add_argument("--hla-list", - dest="hla_list", - type=str, - default="A,B,C,DQA1,DQB1,DRB1", - help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)") - parser.add_argument('--partial', - dest='partial', - action='store_true', - help='Include partial alleles (e.g. A_nuc.fasta)') - parser.add_argument("--aligner-list", - dest="aligners", - type=str, - default="hisat2.graph,hisat2.linear,bowtie2.linear", - help="A comma-separated list of aligners (default: hisat2.graph,hisat2.linear,bowtie2.linear)") - parser.add_argument("--reads", - dest="read_fname", - type=str, - default="", - help="Fastq read file name") - parser.add_argument("--alignment", - dest="alignment_fname", - type=str, - default="", - help="BAM file name") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--simulate-interval", - dest="simulate_interval", - type=int, - default=1, - help="Reads simulated at every these base pairs (default: 1)") - parser.add_argument("--coverage", - dest="coverage", - action='store_true', - help="Experimental purpose (assign reads based on coverage)") - parser.add_argument("--best-alleles", - dest="best_alleles", - action='store_true', - help="") - parser.add_argument("--exclude-allele-list", - dest="exclude_allele_list", - type=str, - default="", - help="A comma-separated list of alleles to be excluded. Enter a number N to randomly select N alleles for exclusion and N non-excluded alleles for testing (2N tested in total).") - parser.add_argument("--num-mismatch", - dest="num_mismatch", - type=int, - default=0, - help="Maximum number of mismatches per read alignment to be considered (default: 0)") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument("--debug", - dest="debug", - type=str, - default="", - help="e.g., test_id:10,read_id:10000,basic_test") - parser.add_argument("--novel_allele_detection", - dest="novel_allele_detection", - action='store_true', - help="Change test to detection of new alleles. Report sensitivity and specificity rate at the end.") - - - args = parser.parse_args() - if not args.reference_type in ["gene", "chromosome", "genome"]: - print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) - sys.exit(1) - args.hla_list = args.hla_list.split(',') - if args.aligners == "": - print >> sys.stderr, "Error: --aligners must be non-empty." - sys.exit(1) - args.aligners = args.aligners.split(',') - for i in range(len(args.aligners)): - args.aligners[i] = args.aligners[i].split('.') - if args.read_fname: - args.read_fname = args.read_fname.split(',') - else: - args.read_fname = [] - if args.alignment_fname != "" and \ - not os.path.exists(args.alignment_fname): - print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname - sys.exit(1) - - if len(args.default_allele_list) > 0: - args.default_allele_list = args.default_allele_list.split(',') - - if len(args.exclude_allele_list) > 0: - if args.exclude_allele_list.strip().isdigit(): - num_alleles = int(args.exclude_allele_list) - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - try: - os.mkdir("./Default-HLA") - except: - pass - - extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py") - extract_cmd = [extract_hla_script, - "--reference-type", reference_type, - "--hla-list", ','.join(hla_list), - "--base", "./Default-HLA/hla"] - if partial: - extract_cmd += ["--partial"] - extract_cmd += ["--inter-gap", "30", - "--intra-gap", "50"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!" - sys.exit(1) - - HLAs_default = {} - #read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default) - read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default) - - allele_names = list(HLAs_default['A'].keys()) - random.shuffle(allele_names) - args.exclude_allele_list = allele_names[0:num_alleles] - args.default_allele_list = allele_names[num_alleles:2*num_alleles] - - args.default_allele_list = args.default_allele_list + args.exclude_allele_list - else: - args.exclude_allele_list = args.exclude_allele_list.split(',') - - debug = {} - if args.debug != "": - for item in args.debug.split(','): - if ':' in item: - key, value = item.split(':') - debug[key] = value - else: - debug[item] = 1 - - random.seed(1) - genotyping(args.base_fname, - args.reference_type, - args.hla_list, - args.partial, - args.aligners, - args.read_fname, - args.alignment_fname, - args.threads, - args.simulate_interval, - args.coverage, - args.best_alleles, - args.exclude_allele_list, - args.default_allele_list, - args.num_mismatch, - args.verbose, - debug) - - diff --git a/hisatgenotype_locus.py b/hisatgenotype_locus.py deleted file mode 100755 index 4d958058..00000000 --- a/hisatgenotype_locus.py +++ /dev/null @@ -1,2631 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re -import inspect, random -import math -from datetime import datetime, date, time -from argparse import ArgumentParser, FileType -from copy import deepcopy -import hisatgenotype_typing_common as typing_common, hisatgenotype_assembly_graph as assembly_graph - - -""" - var: ['single', 3300, 'G'] - exons: [[301, 373], [504, 822], [1084, 1417], [2019, 2301], [2404, 2520], [2965, 2997], [3140, 3187], [3357, 3361]] -""" -def var_in_exon(var, exons): - exonic = False - var_type, var_left, var_data = var - var_right = var_left - if var_type == "deletion": - var_right = var_left + int(var_data) - 1 - for exon_left, exon_right in exons: - if var_left >= exon_left and var_right <= exon_right: - return True - return False - - -""" -Report variant IDs whose var is within exonic regions -""" -def get_exonic_vars(Vars, exons): - vars = set() - for var_id, var in Vars.items(): - var_type, var_left, var_data = var - var_right = var_left - if var_type == "deletion": - var_right = var_left + int(var_data) - 1 - for exon_left, exon_right in exons: - if var_left >= exon_left and var_right <= exon_right: - vars.add(var_id) - - return vars - - -""" -Get representative alleles among those that share the same exonic sequences -""" -def get_rep_alleles(Links, exon_vars, in_alleles = None): - allele_vars = {} - for var, alleles in Links.items(): - if var not in exon_vars: - continue - for allele in alleles: - if in_alleles != None and allele not in in_alleles: - continue - if allele not in allele_vars: - allele_vars[allele] = set() - allele_vars[allele].add(var) - - allele_groups = {} - for allele, vars in allele_vars.items(): - vars = '-'.join(vars) - if vars not in allele_groups: - allele_groups[vars] = [] - allele_groups[vars].append(allele) - - allele_reps = {} # allele representatives - allele_rep_groups = {} # allele groups by allele representatives - for allele_members in allele_groups.values(): - assert len(allele_members) > 0 - allele_rep = allele_members[0] - allele_rep_groups[allele_rep] = allele_members - for allele_member in allele_members: - assert allele_member not in allele_reps - allele_reps[allele_member] = allele_rep - - return allele_reps, allele_rep_groups - - -""" -""" -def error_correct(ref_seq, - read_seq, - read_pos, - mpileup, - Vars, - Var_list, - cmp_list, - debug = False): - if debug: - print >> sys.stderr, cmp_list - print >> sys.stderr, read_seq - - num_correction = 0 - i = 0 - while i < len(cmp_list): - type, left, length = cmp_list[i][:3] - assert length > 0 - if left >= len(ref_seq): - break - if type == "match": - middle_cmp_list = [] - last_j = 0 - for j in range(length): - if read_pos + j >= len(read_seq) or \ - left + j >= len(ref_seq): - continue - - read_bp, ref_bp = read_seq[read_pos + j], ref_seq[left + j] - assert left + j < len(mpileup) - nt_set = mpileup[left + j][0] - if len(nt_set) > 0 and read_bp not in nt_set: - read_bp = 'N' if len(nt_set) > 1 else nt_set[0] - read_seq = read_seq[:read_pos + j] + read_bp + read_seq[read_pos + j + 1:] - assert read_bp != ref_bp - new_cmp = ["mismatch", left + j, 1, "unknown"] - num_correction += 1 - if read_bp != 'N': - var_idx = typing_common.lower_bound(Var_list, left + j) - while var_idx < len(Var_list): - var_pos, var_id = Var_list[var_idx] - if var_pos > left + j: - break - if var_pos == left + j: - var_type, _, var_data = Vars[var_id] - if var_type == "single" and read_bp == var_data: - new_cmp[3] = var_id - break - var_idx += 1 - if j > last_j: - middle_cmp_list.append(["match", left + last_j, j- last_j]) - middle_cmp_list.append(new_cmp) - last_j = j + 1 - if last_j < length: - middle_cmp_list.append(["match", left + last_j, length - last_j]) - - assert len(middle_cmp_list) > 0 - cmp_list = cmp_list[:i] + middle_cmp_list + cmp_list[i+1:] - i += (len(middle_cmp_list) - 1) - else: - assert type == "mismatch" - read_bp, ref_bp = read_seq[read_pos], ref_seq[left] - assert left < len(mpileup) - nt_set = mpileup[left][0] - - if debug: - print >> sys.stderr, left, read_bp, ref_bp, mpileup[left] - - if len(nt_set) > 0 and read_bp not in nt_set: - read_bp = 'N' if len(nt_set) > 1 else nt_set[0] - read_seq = read_seq[:read_pos] + read_bp + read_seq[read_pos+1:] - if read_bp == 'N': - cmp_list[i][3] = "unknown" - elif read_bp == ref_bp: - cmp_list[i] = ["match", left, 1] - num_correction += 1 - else: - cmp_list[i][3] = "unknown" - var_idx = typing_common.lower_bound(Var_list, left) - while var_idx < len(Var_list): - var_pos, var_id = Var_list[var_idx] - if var_pos > left: - break - if var_pos == left: - var_type, _, var_data = Vars[var_id] - if var_type == "single" and read_bp == var_data: - cmp_list[i][3] = var_id - break - var_idx += 1 - - if debug: - print >> sys.stderr, left, read_bp, ref_bp, mpileup[left] - print >> sys.stderr, cmp_list[i] - - read_pos += length - i += 1 - - # Combine matches - i = 0 - while i < len(cmp_list): - type, left, length = cmp_list[i][:3] - if type == "match" and i + 1 < len(cmp_list): - type2, left2, length2 = cmp_list[i+1][:3] - if type2 == "match": - cmp_list[i] = [type, left, length + length2] - cmp_list = cmp_list[:i+1] + cmp_list[i+2:] - continue - i += 1 - - if debug: - print >> sys.stderr, cmp_list - print >> sys.stderr, read_seq - - return cmp_list, read_seq, num_correction - - -""" -""" -def typing(simulation, - base_fname, - locus_list, - genotype_genome, - partial, - partial_alleles, - refGenes, - Genes, - Gene_names, - Gene_lengths, - refGene_loci, - Vars, - Var_list, - Links, - aligners, - num_editdist, - assembly, - output_base, - error_correction, - keep_alignment, - allow_discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - fastq, - read_fname, - alignment_fname, - num_frag_list, - read_len, - fragment_len, - threads, - best_alleles, - verbose, - assembly_verbose): - if simulation: - test_passed = {} - report_file = open(output_base + ".report", 'w') - for aligner, index_type in aligners: - for f_ in [sys.stderr, report_file]: - if index_type == "graph": - print >> f_, "\n\t\t%s %s" % (aligner, index_type) - else: - print >> f_, "\n\t\t%s %s" % (aligner, index_type) - - remove_alignment_file = False - if alignment_fname == "": - # Align reads, and sort the alignments into a BAM file - remove_alignment_file = True - if simulation: - alignment_fname = "%s_output.bam" % base_fname - else: - alignment_fname = read_fname[0].split('/')[-1] - alignment_fname = "%s.bam" % '.'.join(alignment_fname.split('.')[:2]) - - typing_common.align_reads(aligner, - simulation, - genotype_genome if genotype_genome != "" else (base_fname + "." + index_type), - index_type, - base_fname, - read_fname, - fastq, - threads, - alignment_fname, - verbose) - - for test_Gene_names in locus_list: - if base_fname == "genome": - if simulation: - region_chr, region_left, region_right = test_Gene_names[0] - else: - region_chr, region_left, region_right = test_Gene_names - gene = "%s:%d-%d" % (region_chr, region_left, region_right) - else: - if simulation: - gene = test_Gene_names[0].split('*')[0] - else: - gene = test_Gene_names - - ref_allele = refGenes[gene] - ref_seq = Genes[gene][ref_allele] - ref_locus = refGene_loci[gene] - ref_exons, ref_primary_exons = ref_locus[-2], ref_locus[-1] - novel_var_count = 0 - gene_vars, gene_var_list = deepcopy(Vars[gene]), deepcopy(Var_list[gene]) - cur_maxright = -1 - gene_var_maxrights = {} - for var_pos, var_id in gene_var_list: - var_type, var_pos, var_data = gene_vars[var_id] - if var_type == "deletion": - var_pos = var_pos + int(var_data) - 1 - cur_maxright = max(cur_maxright, var_pos) - gene_var_maxrights[var_id] = cur_maxright - - var_count = {} - def add_novel_var(gene_vars, - gene_var_list, - novel_var_count, - var_type, - var_pos, - var_data): - var_idx = typing_common.lower_bound(gene_var_list, var_pos) - while var_idx < len(gene_var_list): - pos_, id_ = gene_var_list[var_idx] - if pos_ > var_pos: - break - if pos_ == var_pos: - type_, _, data_ = gene_vars[id_] - assert type_ != var_type or data_ != var_data - if type_ != var_type: - if var_type == "insertion": - break - elif var_type == "single" and type_ == "deletion": - break - else: - if var_data < data_: - break - var_idx += 1 - var_id = "nv%d" % novel_var_count - assert var_id not in gene_vars - gene_vars[var_id] = [var_type, var_pos, var_data] - gene_var_list.insert(var_idx, [var_pos, var_id]) - return var_id, novel_var_count + 1 - - if not os.path.exists(alignment_fname + ".bai"): - os.system("samtools index %s" % alignment_fname) - # Read alignments - alignview_cmd = ["samtools", - "view", - alignment_fname] - base_locus = 0 - if genotype_genome != "": - _, chr, left, right = ref_locus[:4] - alignview_cmd += ["%s:%d-%d" % (chr, left+1, right+1)] - base_locus = left - - if index_type == "graph": - alignview_cmd += [ref_allele] - mpileup = typing_common.get_mpileup(alignview_cmd, - ref_seq, - base_locus, - gene_vars, - allow_discordant) - - if base_fname == "codis": - pair_interdist = typing_common.get_pair_interdist(alignview_cmd, - simulation, - verbose) - else: - pair_interdist = None - - bamview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - else: - alignview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - # List of nodes that represent alleles - allele_vars = {} - for _, var_id in gene_var_list: - if var_id not in Links: - continue - allele_list = Links[var_id] - for allele_id in allele_list: - if allele_id not in Genes[gene]: - continue - if allele_id not in allele_vars: - allele_vars[allele_id] = [var_id] - else: - allele_vars[allele_id].append(var_id) - - # Extract variants that are within exons - exon_vars = get_exonic_vars(gene_vars, ref_exons) - primary_exon_vars = get_exonic_vars(gene_vars, ref_primary_exons) - - # Store nodes that represent alleles - allele_nodes = {} - def create_allele_node(allele_name): - if allele_name in allele_nodes: - return allele_nodes[allele_name] - if allele_name in allele_vars: - var_ids = allele_vars[allele_name] - else: - var_ids = [] - seq = list(ref_seq) # sequence that node represents - var = ["" for i in range(len(ref_seq))] # how sequence is related to backbone - for var_id in var_ids: - assert var_id in gene_vars - var_type, var_pos, var_data = gene_vars[var_id] - assert var_pos >= 0 and var_pos < len(ref_seq) - if var_type == "single": - seq[var_pos] = var_data - var[var_pos] = var_id - elif var_type == "deletion": - del_len = int(var_data) - assert var_pos + del_len <= len(ref_seq) - seq[var_pos:var_pos + del_len] = ['D'] * del_len - var[var_pos:var_pos + del_len] = [var_id] * del_len - else: - # DK - to be implemented for insertions - assert var_type == "insertion" - - qual = ' ' * len(seq) - allele_node = assembly_graph.Node(allele_name, - 0, - seq, - qual, - var, - ref_seq, - gene_vars, - mpileup, - simulation) - allele_nodes[allele_name] = allele_node - return allele_node - - true_allele_nodes = {} - if simulation: - for allele_name in test_Gene_names: - true_allele_nodes[allele_name] = create_allele_node(allele_name) - - display_allele_nodes = {} - for display_allele in display_alleles: - display_allele_nodes[display_allele] = create_allele_node(display_allele) - - # Assembly graph - asm_graph = assembly_graph.Graph(ref_seq, - gene_vars, - ref_exons, - ref_primary_exons, - partial_alleles, - true_allele_nodes, - {}, # predicted_allele_nodes, which is empty for now - display_allele_nodes, - simulation) - - # Choose allele representives from those that share the same exonic sequences - allele_reps, allele_rep_groups = get_rep_alleles(Links, exon_vars) - allele_rep_set = set(allele_reps.values()) - - # Choose allele representives from those that share the primary exonic sequences - primary_exon_allele_reps, primary_exon_allele_rep_groups = get_rep_alleles(Links, primary_exon_vars, allele_rep_set) - primary_exon_allele_rep_set = set(primary_exon_allele_reps.values()) - - # Sanity check - for exon_allele in primary_exon_allele_reps.keys(): - # DK - debugging purposes - if exon_allele not in allele_rep_set: - print exon_allele, allele_reps[exon_allele], exon_allele in primary_exon_allele_reps.keys() - - assert exon_allele in allele_rep_set - - # For checking alternative alignments near the ends of alignments - Alts_left, Alts_right = typing_common.get_alternatives(ref_seq, - allele_vars, - gene_vars, - gene_var_list, - verbose >= 2) - - def haplotype_alts_list(haplotype_alts, left = True): - haplotype_list = [] - for haplotype in haplotype_alts.keys(): - if left: - pos = int(haplotype.split('-')[-1]) - else: - pos = int(haplotype.split('-')[0]) - haplotype_list.append([pos, haplotype]) - return sorted(haplotype_list, cmp = lambda a, b: a[0] - b[0]) - - Alts_left_list, Alts_right_list = haplotype_alts_list(Alts_left, True), haplotype_alts_list(Alts_right, False) - - # Count alleles - Gene_primary_exons_counts, Gene_primary_exons_cmpt = {}, {} - Gene_exons_counts, Gene_exons_cmpt = {}, {} - Gene_counts, Gene_cmpt = {}, {} - num_reads, num_pairs = 0, 0 - - # For debugging purposes - debug_allele_names = set(test_Gene_names) if simulation and verbose >= 2 else set() - - # Read information - prev_read_id = None - prev_right_pos = 0 - prev_lines = [] - left_read_ids, right_read_ids = set(), set() - if index_type == "graph": - # nodes for reads - read_nodes = [] - read_vars_list = [] - - # - def add_count(count_per_read, ht, add): - if base_fname == "genome" and len(count_per_read) == 1: - for allele in count_per_read.keys(): - count_per_read[allele] = add - return - - orig_ht = ht - ht = ht.split('-') - - assert len(ht) >= 2 - left, right = int(ht[0]), int(ht[-1]) - assert left <= right - - ht = ht[1:-1] - alleles = set(Genes[gene].keys()) - set([ref_allele]) - for i in range(len(ht)): - var_id = ht[i] - if var_id.startswith("nv") or \ - var_id not in Links: - continue - alleles &= set(Links[var_id]) - ht = set(ht) - - tmp_alleles = set() - var_idx = typing_common.lower_bound(gene_var_list, right + 1) - var_idx = min(var_idx, len(gene_var_list) - 1) - while var_idx >= 0: - _, var_id = gene_var_list[var_idx] - if var_id.startswith("nv") or \ - var_id in ht or \ - var_id not in Links: - var_idx -= 1 - continue - if var_id in gene_var_maxrights and gene_var_maxrights[var_id] < left: - break - var_type, var_left, var_data = gene_vars[var_id] - var_right = var_left - if var_type == "deletion": - var_right = var_left + int(var_data) - 1 - if (var_left >= left and var_left <= right) or \ - (var_right >= left and var_right <= right): - tmp_alleles |= set(Links[var_id]) - var_idx -= 1 - alleles -= tmp_alleles - alleles &= set(count_per_read.keys()) - - for allele in alleles: - count_per_read[allele] += add - - return len(alleles) - - # Identify best pairs - def choose_pairs(left_positive_hts, right_positive_hts): - if len(left_positive_hts) > 0 and \ - len(right_positive_hts) > 0 and \ - max(len(left_positive_hts), len(right_positive_hts)) >= 2: - expected_inter_dist = pair_interdist - """ - if simulation: - expected_inter_dist = fragment_len - read_len * 2 - """ - - best_diff = sys.maxint - picked = [] - for left_ht_str in left_positive_hts: - left_ht = left_ht_str.split('-') - l_left, l_right = int(left_ht[0]), int(left_ht[-1]) - for right_ht_str in right_positive_hts: - right_ht = right_ht_str.split('-') - r_left, r_right = int(right_ht[0]), int(right_ht[-1]) - if l_right < r_right: - inter_dist = r_left - l_right - 1 - else: - inter_dist = l_left - r_right - 1 - - cur_diff = abs(expected_inter_dist - inter_dist) - if best_diff > cur_diff: - best_diff = cur_diff - picked = [[left_ht_str, right_ht_str]] - elif best_diff == cur_diff: - picked.append([left_ht_str, right_ht_str]) - - assert len(picked) > 0 - - left_positive_hts, right_positive_hts = set(), set() - for left_ht_str, right_ht_str in picked: - left_positive_hts.add(left_ht_str) - right_positive_hts.add(right_ht_str) - - return left_positive_hts, right_positive_hts - - def get_exon_haplotypes(ht, exons): - if len(exons) <= 0: - return [] - - debug_ht = deepcopy(ht) - ht = ht.split('-') - assert len(ht) >= 2 - ht[0], ht[-1] = int(ht[0]), int(ht[-1]) - exon_hts = [] - for e_left, e_right in exons: - assert len(ht) >= 2 - ht_left, ht_right = ht[0], ht[-1] - if e_left > ht_right or e_right < ht_left: - continue - - new_ht = deepcopy(ht) - if ht_left < e_left: - split = False - for i in range(1, len(new_ht) - 1): - var_id = new_ht[i] - type, left, data = gene_vars[var_id] - if (type != "deletion" and left >= e_left) or \ - (type == "deletion" and left - 1 >= e_left): - ht_left = e_left - new_ht = [ht_left] + new_ht[i:] - split = True - break - if type == "deletion": - right = left + int(data) - if right >= e_left: - ht_left = right - new_ht = [right] + new_ht[i+1:] - split = True - break - if not split: - ht_left = e_left - new_ht = [ht_left, ht_right] - assert ht_left >= e_left - if ht_right > e_right: - split = False - for i in reversed(range(1, len(new_ht) - 1)): - var_id = new_ht[i] - type, right, data = gene_vars[var_id] - if type == "deletion": - right = right + int(data) - 1 - if (type != "deletion" and right <= e_right) or \ - (type == "deletion" and right + 1 <= e_right): - ht_right = e_right - new_ht = new_ht[:i+1] + [ht_right] - split = True - break - if type == "deletion": - left = right - int(data) - if left <= e_right: - ht_right = left - new_ht = new_ht[:i] + [ht_right] - split = True - break - if not split: - ht_right = e_right - new_ht = [ht_left, ht_right] - - if len(new_ht) == 2: - new_ht = "%d-%d" % (new_ht[0], new_ht[-1]) - else: - assert len(new_ht) > 2 - new_ht = "%d-%s-%d" % (new_ht[0], '-'.join(new_ht[1:-1]), new_ht[-1]) - assert ht_left <= ht_right - exon_hts.append(new_ht) - - return exon_hts - - # Positive evidence for left and right reads - left_positive_hts, right_positive_hts = set(), set() - - # Cigar regular expression - cigar_re = re.compile('\d+\w') - for line in alignview_proc.stdout: - line = line.strip() - cols = line.split() - read_id, flag, chr, pos, mapQ, cigar_str = cols[:6] - node_read_id = orig_read_id = read_id - if simulation: - read_id = read_id.split('|')[0] - read_seq, read_qual = cols[9], cols[10] - flag, pos = int(flag), int(pos) - pos -= (base_locus + 1) - if pos < 0: - continue - - # Unalined? - if flag & 0x4 != 0: - if simulation and verbose >= 2: - print "Unaligned" - print "\t", line - continue - - # Concordantly mapped? - if flag & 0x2 != 0: - concordant = True - else: - concordant = False - - NM, Zs, MD, NH = "", "", "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("Zs"): - Zs = col[5:] - elif col.startswith("MD"): - MD = col[5:] - elif col.startswith("NM"): - NM = int(col[5:]) - elif col.startswith("NH"): - NH = int(col[5:]) - - if NM > num_editdist: - continue - - # Only consider unique alignment - if NH > 1: - continue - - # Concordantly aligned mate pairs - if not allow_discordant and not concordant: - continue - - # Left read? - is_left_read = flag & 0x40 != 0 - if is_left_read: - if read_id in left_read_ids: - continue - left_read_ids.add(read_id) - if not simulation: - node_read_id += '|L' - else: # Right read? - assert flag & 0x80 != 0 - if read_id in right_read_ids: - continue - right_read_ids.add(read_id) - if not simulation: - node_read_id += '|R' - - if Zs: - Zs_str = Zs - Zs = Zs.split(',') - - assert MD != "" - MD_str_pos, MD_len = 0, 0 - Zs_pos, Zs_i = 0, 0 - for _i in range(len(Zs)): - Zs[_i] = Zs[_i].split('|') - Zs[_i][0] = int(Zs[_i][0]) - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - read_pos, left_pos = 0, pos - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - cmp_list = [] - num_error_correction = 0 - likely_misalignment = False - - # Extract variants w.r.t backbone from CIGAR string - softclip = [0, 0] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op == 'M': - first = True - MD_len_used = 0 - cmp_list_i = len(cmp_list) - while True: - if not first or MD_len == 0: - if MD[MD_str_pos].isdigit(): - num = int(MD[MD_str_pos]) - MD_str_pos += 1 - while MD_str_pos < len(MD): - if MD[MD_str_pos].isdigit(): - num = num * 10 + int(MD[MD_str_pos]) - MD_str_pos += 1 - else: - break - MD_len += num - # Insertion or full match followed - if MD_len >= length: - MD_len -= length - if length > MD_len_used: - cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used]) - break - first = False - read_base = read_seq[read_pos + MD_len] - MD_ref_base = MD[MD_str_pos] - MD_str_pos += 1 - assert MD_ref_base in "ACGT" - if MD_len > MD_len_used: - cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used]) - - _var_id = "unknown" - if read_pos + MD_len == Zs_pos and Zs_i < len(Zs): - assert Zs[Zs_i][1] == 'S' - _var_id = Zs[Zs_i][2] - Zs_i += 1 - Zs_pos += 1 - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - else: - # Search for a known (yet not indexed) variant or a novel variant - ref_pos = right_pos + MD_len - var_idx = typing_common.lower_bound(gene_var_list, ref_pos) - while var_idx < len(gene_var_list): - var_pos, var_id = gene_var_list[var_idx] - if var_pos > ref_pos: - break - if var_pos == ref_pos: - var_type, _, var_data = gene_vars[var_id] - if var_type == "single" and var_data == read_base: - _var_id = var_id - break - var_idx += 1 - - cmp_list.append(["mismatch", right_pos + MD_len, 1, _var_id]) - MD_len_used = MD_len + 1 - MD_len += 1 - # Full match - if MD_len == length: - MD_len = 0 - break - - # Correction for sequencing errors and update for cmp_list - if error_correction: - assert cmp_list_i < len(cmp_list) - new_cmp_list, read_seq, _num_error_correction = error_correct(ref_seq, - read_seq, - read_pos, - mpileup, - gene_vars, - gene_var_list, - cmp_list[cmp_list_i:], - node_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583|L") - cmp_list = cmp_list[:cmp_list_i] + new_cmp_list - num_error_correction += _num_error_correction - - elif cigar_op == 'I': - _var_id = "unknown" - if read_pos == Zs_pos and Zs_i < len(Zs): - assert Zs[Zs_i][1] == 'I' - _var_id = Zs[Zs_i][2] - Zs_i += 1 - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - else: - # Search for a known (yet not indexed) variant or a novel variant - var_idx = typing_common.lower_bound(gene_var_list, right_pos) - while var_idx < len(gene_var_list): - var_pos, var_id = gene_var_list[var_idx] - if var_pos > right_pos: - break - if var_pos == right_pos: - var_type, _, var_data = gene_vars[var_id] - if var_type == "insertion" and len(var_data) == length: - _var_id = var_id - break - var_idx += 1 - cmp_list.append(["insertion", right_pos, length, _var_id]) - if 'N' in read_seq[read_pos:read_pos+length]: - likely_misalignment = True - - elif cigar_op == 'D': - if MD[MD_str_pos] == '0': - MD_str_pos += 1 - assert MD[MD_str_pos] == '^' - MD_str_pos += 1 - while MD_str_pos < len(MD): - if not MD[MD_str_pos] in "ACGT": - break - MD_str_pos += 1 - _var_id = "unknown" - if read_pos == Zs_pos and \ - Zs_i < len(Zs) and \ - Zs[Zs_i][1] == 'D': - _var_id = Zs[Zs_i][2] - Zs_i += 1 - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - else: - # Search for a known (yet not indexed) variant or a novel variant - var_idx = typing_common.lower_bound(gene_var_list, right_pos) - while var_idx < len(gene_var_list): - var_pos, var_id = gene_var_list[var_idx] - if var_pos > right_pos: - break - if var_pos == right_pos: - var_type, _, var_data = gene_vars[var_id] - if var_type == "deletion" and int(var_data) == length: - _var_id = var_id - break - var_idx += 1 - - cmp_list.append(["deletion", right_pos, length, _var_id]) - - # Check if this deletion is artificial alignment - if right_pos < len(mpileup): - del_count, nt_count = 0, 0 - for nt, value in mpileup[right_pos][1].items(): - count = value[0] - if nt == 'D': - del_count += count - else: - nt_count += count - - # DK - debugging purposes - if base_fname == "hla": - if del_count * 6 < nt_count: # and nt_count >= 15: - likely_misalignment = True - - elif cigar_op == 'S': - if i == 0: - softclip[0] = length - Zs_pos += length - else: - assert i + 1 == len(cigars) - softclip[1] = length - else: - assert cigar_op == 'N' - assert False - cmp_list.append(["intron", right_pos, length]) - - if cigar_op in "MND": - right_pos += length - - if cigar_op in "MIS": - read_pos += length - - # Remove softclip in cigar and modify read_seq and read_qual accordingly - if sum(softclip) > 0: - if softclip[0] > 0: - cigars = cigars[1:] - read_seq = read_seq[softclip[0]:] - read_qual = read_qual[softclip[0]:] - if softclip[1] > 0: - cigars = cigars[:-1] - read_seq = read_seq[:-softclip[1]] - read_qual = read_qual[:-softclip[1]] - - cigar_str = "" - for type, length in cigars: - cigar_str += str(length) - cigar_str += type - - if sum(softclip) > 0: - continue - - if right_pos > len(ref_seq): - continue - - if num_error_correction > max(1, num_editdist): - continue - - if likely_misalignment: - continue - - # Add novel variants - read_pos = 0 - for cmp_i in range(len(cmp_list)): - type_, pos_, length_ = cmp_list[cmp_i][:3] - if type_ != "match": - var_id_ = cmp_list[cmp_i][3] - if var_id_ == "unknown": - add = True - if type_ == "mismatch": - data_ = read_seq[read_pos] - if data_ == 'N': - add = False - elif type_ == "deletion": - data_ = str(length_) - else: - assert type_ == "insertion" - data_ = read_seq[read_pos:read_pos + length_] - if add: - var_id_, novel_var_count = add_novel_var(gene_vars, - gene_var_list, - novel_var_count, - type_ if type_ != "mismatch" else "single", - pos_, - data_) - cmp_list[cmp_i][3] = var_id_ - if var_id_ != "unknown": - if var_id_ not in var_count: - var_count[var_id_] = 1 - else: - var_count[var_id_] += 1 - - if type_ != "deletion": - read_pos += length_ - - # Count the number of reads aligned uniquely with some constraints - num_reads += 1 - - def add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, include_alleles = set()): - if len(Gene_count_per_read) <= 0: - return "" - max_count = max(Gene_count_per_read.values()) - cur_cmpt = set() - for allele, count in Gene_count_per_read.items(): - if count < max_count: - continue - if len(include_alleles) > 0 and allele not in include_alleles: - continue - - cur_cmpt.add(allele) - if allele not in Gene_counts: - Gene_counts[allele] = 1 - else: - Gene_counts[allele] += 1 - - if len(cur_cmpt) == 0: - return "" - - if verbose >= 2: - alleles = ["", ""] - allele1_found, allele2_found = False, False - if alleles[0] != "": - for allele, count in Gene_count_per_read.items(): - if count < max_count: - continue - if allele == alleles[0]: - allele1_found = True - elif allele == alleles[1]: - allele2_found = True - if allele1_found != allele2_found: - print >> sys.stderr, alleles[0], Gene_count_per_read[alleles[0]] - print >> sys.stderr, alleles[1], Gene_count_per_read[alleles[1]] - if allele1_found: - print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, Gene_count_per_read[alleles[1]])) - else: - print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, Gene_count_per_read[alleles[0]])) - - cur_cmpt = sorted(list(cur_cmpt)) - cur_cmpt = '-'.join(cur_cmpt) - if not cur_cmpt in Gene_cmpt: - Gene_cmpt[cur_cmpt] = 1 - else: - Gene_cmpt[cur_cmpt] += 1 - - return cur_cmpt - - if read_id != prev_read_id: - if prev_read_id != None: - num_pairs += 1 - # DK - needs more test - # Several alleles go over 100 bps - """ - if base_fname == "codis" and gene == "D18S51": - left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts) - """ - - for positive_ht in left_positive_hts | right_positive_hts: - primary_exon_hts = get_exon_haplotypes(positive_ht, ref_primary_exons) - for exon_ht in primary_exon_hts: - add_count(Gene_primary_exons_count_per_read, exon_ht, 1) - exon_hts = get_exon_haplotypes(positive_ht, ref_exons) - for exon_ht in exon_hts: - add_count(Gene_exons_count_per_read, exon_ht, 1) - add_count(Gene_count_per_read, positive_ht, 1) - - # DK - debugging purposes - if prev_read_id.startswith("NS500497:33:HY32TBGXX:3:13511:0:56517876") and False: - print prev_read_id, left_positive_hts, right_positive_hts - max_count = max(Gene_primary_exons_count_per_read.values()) - for allele, count in Gene_primary_exons_count_per_read.items(): - if allele not in primary_exon_allele_rep_set: - continue - if count < max_count: - continue - print allele, count - - # DK - debugging purposes - """ - debug_allele_id = "TH01*10" - assert debug_allele_id in Gene_gen_count_per_read - debug_max_read_count = max(Gene_gen_count_per_read.values()) - debug_read_count = Gene_gen_count_per_read[debug_allele_id] - if debug_read_count < debug_max_read_count: - print prev_read_id, debug_read_count, debug_max_read_count, Gene_gen_count_per_read - print "\t", left_positive_hts, right_positive_hts - None - if prev_read_id == "HSQ1008:175:C0JVFACXX:5:1109:17665:21583": - for line in prev_lines: - print line - print "left_positive_hts :", left_positive_hts - print "right_positive_hts:", right_positive_hts - print "exon:", debug_read_count, "max:", debug_max_read_count - print "gen:", Gene_gen_count_per_read[debug_allele_id], "max:", max(Gene_gen_count_per_read.values()) - - for allele_id, count in Gene_count_per_read.items(): - if count == debug_max_read_count: - None - # print "allele max:", allele_id, count - # sys.exit(1) - None - """ - - cur_cmpt, cur_cmpt_gen = "", "" - if base_fname == "hla": - cur_primary_exons_cmpt = add_stat(Gene_primary_exons_cmpt, Gene_primary_exons_counts, Gene_primary_exons_count_per_read, primary_exon_allele_rep_set) - - # DK - debugging purposes - # for cmpt, count in Gene_primary_exons_count_per_read.items(): - if cur_primary_exons_cmpt.find("A*24:145") != -1 and cur_primary_exons_cmpt.find("A*24:02:01") == -1: - print prev_read_id - print cur_primary_exons_cmpt - - - - cur_exons_cmpt = add_stat(Gene_exons_cmpt, Gene_exons_counts, Gene_exons_count_per_read, allele_rep_set) - cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read) - else: - cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read) - for read_id_, read_id_i, read_node in read_nodes: - asm_graph.add_node(read_id_, - read_id_i, - read_node, - simulation) - read_nodes, read_var_list = [], [] - if simulation and \ - verbose >= 2 and \ - base_fname in ["hla", "codis"]: - cur_cmpt = cur_cmpt.split('-') if cur_cmpt != "" else set() - cur_cmpt_gen = cur_cmpt_gen.split('-') if cur_cmpt_gen != "" else set() - show_debug = (partial and cur_cmpt != "" and not set(cur_cmpt) & set(test_Gene_names)) or \ - (not partial and cur_cmpt_gen != "" and not set(cur_cmpt_gen) & set(test_Gene_names)) - - if show_debug: - print "%s are chosen instead of %s" % (cur_cmpt if partial else cur_cmpt_gen, '-'.join(test_Gene_names)) - for prev_line in prev_lines: - print "\t", prev_line - - prev_lines = [] - - left_positive_hts, right_positive_hts = set(), set() - Gene_primary_exons_count_per_read, Gene_exons_count_per_read, Gene_count_per_read = {}, {}, {} - for allele in Gene_names[gene]: - if allele.find("BACKBONE") != -1: - continue - if base_fname == "genome" and allele.find("GRCh38") != -1: - continue - if allele in primary_exon_allele_rep_set: - Gene_primary_exons_count_per_read[allele] = 0 - if allele in allele_rep_set: - Gene_exons_count_per_read[allele] = 0 - Gene_count_per_read[allele] = 0 - - prev_lines.append(line) - - # Remove mismatches due to unknown or novel variants - cmp_list2 = [] - for cmp in cmp_list: - cmp = deepcopy(cmp) - type, pos, length = cmp[:3] - if type == "match": - if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match": - cmp_list2[-1][2] += length - else: - cmp_list2.append(cmp) - elif type == "mismatch" and \ - (cmp[3] == "unknown" or cmp[3].startswith("nv")): - if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match": - cmp_list2[-1][2] += 1 - else: - cmp_list2.append(["match", pos, 1]) - else: - cmp_list2.append(cmp) - - cmp_list_left, cmp_list_right, cmp_left_alts, cmp_right_alts = \ - typing_common.identify_ambigious_diffs(ref_seq, - gene_vars, - Alts_left, - Alts_right, - Alts_left_list, - Alts_right_list, - cmp_list2, - verbose, - orig_read_id.startswith("HSQ1009:126:D0UUYACXX:4:2212:9787:80992#")) # debug? - - mid_ht = [] - for cmp in cmp_list2[cmp_list_left:cmp_list_right+1]: - type = cmp[0] - if type not in ["mismatch", "deletion", "insertion"]: - continue - var_id = cmp[3] - mid_ht.append(var_id) - - for l in range(len(cmp_left_alts)): - left_ht = cmp_left_alts[l].split('-') - left_ht += mid_ht - for r in range(len(cmp_right_alts)): - right_ht = cmp_right_alts[r].split('-') - ht = left_ht + right_ht - if len(ht) <= 0: - continue - ht_str = '-'.join(ht) - if is_left_read: - left_positive_hts.add(ht_str) - else: - right_positive_hts.add(ht_str) - - # DK - debugging purposes - DK_debug = False - if orig_read_id.startswith("30|R!"): - DK_debug = True - print line - print cmp_list - print "positive hts:", left_positive_hts, right_positive_hts - print "cmp_list [%d, %d]" % (cmp_list_left, cmp_list_right) - - if assembly: - # Construct multiple candidate realignments for CODIS - cmp_llist = [] - hts = left_positive_hts if is_left_read else right_positive_hts - assert len(hts) > 0 - for ht in hts: - cmp_list = [] - read_pos = 0 - vars_ = ht.split('-') - left_ = int(vars_[0]) - vars_ = vars_[1:] - for var_i in range(len(vars_)): - var_id = vars_[var_i] - # ref_seq, read_seq - if var_i == len(vars_) - 1: - right_ = int(var_id) - else: - var_type, var_pos, var_data = gene_vars[var_id] - right_ = var_pos - 1 - - for pos in range(left_, right_ + 1): - if read_seq[read_pos] != ref_seq[pos]: - if left_ < pos: - cmp_list.append(["match", left_, pos - left_]) - cmp_list.append(["mismatch", pos, 1, "unknown"]) - left_ = pos + 1 - read_pos += 1 - if left_ <= right_: - cmp_list.append(["match", left_, right_ - left_ + 1]) - - if var_i == len(vars_) - 1: - left_ = right_ + 1 - break - - if var_type == "single": - cmp_list.append(["mismatch", var_pos, 1, var_id]) - left_ = var_pos + 1 - read_pos += 1 - elif var_type == "deletion": - del_len = int(var_data) - cmp_list.append(["deletion", var_pos, del_len, var_id]) - left_ = var_pos + del_len - else: - assert var_type == "insertion" - cmp_list.append(["insertion", var_pos, len(var_data), var_id]) - left_ = var_pos - read_pos += len(var_data) - - assert len(cmp_list) > 0 - cmp_llist.append(cmp_list) - - for cmp_list_i in range(len(cmp_llist)): - # Node - cmp_list = cmp_llist[cmp_list_i] - read_node_pos, read_node_seq, read_node_qual, read_node_var = -1, [], [], [] - read_vars = [] - ref_pos, read_pos = cmp_list[0][1], 0 - cmp_i = 0 - while cmp_i < len(cmp_list): - cmp = cmp_list[cmp_i] - type, length = cmp[0], cmp[2] - if type in ["match", "mismatch"]: - if read_node_pos < 0: - read_node_pos = ref_pos - if type == "match": - read_node_seq += list(read_seq[read_pos:read_pos+length]) - read_node_qual += list(read_qual[read_pos:read_pos+length]) - read_node_var += ([''] * length) - read_pos += length - elif type == "mismatch": - var_id = cmp[3] - read_base, qual = read_seq[read_pos], read_qual[read_pos] - read_node_seq += [read_base] - read_node_qual += [qual] - read_node_var.append(var_id) - read_pos += 1 - elif type == "deletion": - var_id = cmp[3] - del_len = length - read_node_seq += (['D'] * del_len) - read_node_qual += ([''] * del_len) - if len(read_node_seq) > len(read_node_var): - assert len(read_node_seq) == len(read_node_var) + del_len - read_node_var += ([var_id] * del_len) - elif type == "insertion": - var_id = cmp[3] - ins_len = length - ins_seq = read_seq[read_pos:read_pos+ins_len] - read_node_seq += ["I%s" % nt for nt in ins_seq] - read_node_qual += list(read_qual[read_pos:read_pos+ins_len]) - read_node_var += ([var_id] * ins_len) - read_pos += length - else: - assert type == "intron" - cmp_i += 1 - - read_nodes.append([node_read_id, - cmp_list_i, - assembly_graph.Node(node_read_id, - read_node_pos, - read_node_seq, - read_node_qual, - read_node_var, - ref_seq, - gene_vars, - mpileup, - simulation)]) - - prev_read_id = read_id - prev_right_pos = right_pos - - if prev_read_id != None: - num_pairs += 1 - if base_fname == "codis" and gene == "D18S51": - left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts) - for positive_ht in left_positive_hts | right_positive_hts: - primary_exon_hts = get_exon_haplotypes(positive_ht, ref_primary_exons) - for exon_ht in primary_exon_hts: - add_count(Gene_primary_exons_count_per_read, exon_ht, 1) - exon_hts = get_exon_haplotypes(positive_ht, ref_exons) - for exon_ht in exon_hts: - add_count(Gene_exons_count_per_read, exon_ht, 1) - add_count(Gene_count_per_read, positive_ht, 1) - - if base_fname == "hla": - add_stat(Gene_primary_exons_cmpt, Gene_primary_exons_counts, Gene_primary_exons_count_per_read, primary_exon_allele_rep_set) - add_stat(Gene_exons_cmpt, Gene_exons_counts, Gene_exons_count_per_read, allele_rep_set) - add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read) - for read_id_, read_id_i, read_node in read_nodes: - asm_graph.add_node(read_id_, - read_id_i, - read_node, - simulation) - read_nodes, read_var_list = [], [] - - if num_reads <= 0: - continue - - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t%d reads and %d pairs are aligned" % (num_reads, num_pairs) - - else: - assert index_type == "linear" - def add_alleles(alleles): - if not allele in Gene_counts: - Gene_counts[allele] = 1 - else: - Gene_counts[allele] += 1 - - cur_cmpt = sorted(list(alleles)) - cur_cmpt = '-'.join(cur_cmpt) - if not cur_cmpt in Gene_cmpt: - Gene_cmpt[cur_cmpt] = 1 - else: - Gene_cmpt[cur_cmpt] += 1 - - prev_read_id, prev_AS = None, None - alleles = set() - for line in alignview_proc.stdout: - cols = line[:-1].split() - read_id, flag, allele = cols[:3] - flag = int(flag) - if flag & 0x4 != 0: - continue - if not allele.startswith(gene): - continue - if allele.find("BACKBONE") != -1: - continue - - AS = None - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - assert AS != None - if read_id != prev_read_id: - if alleles: - if aligner == "hisat2" or \ - (aligner == "bowtie2" and len(alleles) < 10): - add_alleles(alleles) - alleles = set() - prev_AS = None - if prev_AS != None and AS < prev_AS: - continue - prev_read_id = read_id - prev_AS = AS - alleles.add(allele) - - if alleles: - add_alleles(alleles) - - Gene_counts = [[allele, count] for allele, count in Gene_counts.items()] - def Gene_count_cmp(a, b): - if a[1] != b[1]: - return b[1] - a[1] - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - Gene_counts = sorted(Gene_counts, cmp=Gene_count_cmp) - for count_i in range(len(Gene_counts)): - count = Gene_counts[count_i] - if simulation: - found = False - for test_Gene_name in test_Gene_names: - if count[0] == test_Gene_name: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_Gene_name, count[1]) - found = True - if count_i < 5 and not found: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - else: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - if count_i >= 9: - break - for f_ in [sys.stderr, report_file]: - print >> f_ - - # Calculate the abundance of representative alleles on exonic sequences - if base_fname == "hla": - perform_typing_primary_exon = False - # Incorporate representive alleles for primary exons (experimental feature) - if perform_typing_primary_exon: - Gene_prob = primary_exon_prob = typing_common.single_abundance(Gene_primary_exons_cmpt) - primary_exon_alleles = set() - primary_exon_prob_sum = 0.0 - for prob_i in range(len(primary_exon_prob)): - allele, prob = primary_exon_prob[prob_i][:2] - if len(primary_exon_allele_rep_groups[allele]) <= 1: - continue - primary_exon_prob_sum += prob - primary_exon_alleles |= set(primary_exon_allele_rep_groups[allele]) - - # Incorporate representative alleles for exons - if len(primary_exon_alleles) > 0: - Gene_exons_cmpt2 = {} - for cmpt, value in Gene_exons_cmpt.items(): - cmpt2 = [] - for allele in cmpt.split('-'): - if allele in primary_exon_alleles: - cmpt2.append(allele) - if len(cmpt2) == 0: - continue - cmpt2 = '-'.join(cmpt2) - if cmpt2 not in Gene_exons_cmpt2: - Gene_exons_cmpt2[cmpt2] = value - else: - Gene_exons_cmpt2[cmpt2] += value - exon_prob = typing_common.single_abundance(Gene_exons_cmpt2, - remove_low_abundance_alleles) - exon_prob2 = {} - for allele, prob in primary_exon_prob: - if allele not in primary_exon_alleles: - exon_prob2[allele] = prob - for allele, prob in exon_prob: - exon_prob2[allele] = prob * primary_exon_prob_sum - exon_prob = [[allele, prob] for allele, prob in exon_prob2.items()] - Gene_prob = exon_prob = sorted(exon_prob, cmp=typing_common.Gene_prob_cmp) - else: - # Incorporate representative alleles for exons - Gene_prob = exon_prob = typing_common.single_abundance(Gene_exons_cmpt, - remove_low_abundance_alleles) - - exon_alleles = set() - exon_prob_sum = 0.0 - for prob_i in range(len(exon_prob)): - allele, prob = exon_prob[prob_i][:2] - if prob_i >= 10 and prob < 0.03: - break - if len(allele_rep_groups[allele]) <= 1: - continue - - exon_prob_sum += prob - exon_alleles |= set(allele_rep_groups[allele]) - - # Incorporate full-length alleles, non-representative alleles - if len(exon_alleles) > 0: - Gene_cmpt2 = {} - for cmpt, value in Gene_cmpt.items(): - cmpt2 = [] - for allele in cmpt.split('-'): - if allele in exon_alleles: - cmpt2.append(allele) - if len(cmpt2) == 0: - continue - cmpt2 = '-'.join(cmpt2) - if cmpt2 not in Gene_cmpt2: - Gene_cmpt2[cmpt2] = value - else: - Gene_cmpt2[cmpt2] += value - Gene_cmpt = Gene_cmpt2 - Gene_prob = typing_common.single_abundance(Gene_cmpt, - True, - Gene_lengths[gene]) - - Gene_combined_prob = {} - for allele, prob in exon_prob: - if allele not in exon_alleles: - Gene_combined_prob[allele] = prob - - for allele, prob in Gene_prob: - Gene_combined_prob[allele] = prob * exon_prob_sum - - Gene_prob = [[allele, prob] for allele, prob in Gene_combined_prob.items()] - Gene_prob = sorted(Gene_prob, cmp=typing_common.Gene_prob_cmp) - else: - if len(Gene_cmpt.keys()) <= 1: - Gene_prob = [] - if len(Gene_cmpt.keys()) == 1: - Gene_prob = [[Gene_cmpt.keys()[0], 1.0]] - else: - Gene_prob = typing_common.single_abundance(Gene_cmpt) - - if index_type == "graph" and assembly: - allele_node_order = [] - predicted_allele_nodes = {} - for allele_name, prob in Gene_prob: - if prob < 0.1: # abundance of 10% - break - predicted_allele_nodes[allele_name] = create_allele_node(allele_name) - allele_node_order.append([allele_name, prob]) - if len(predicted_allele_nodes) >= 2: - break - asm_graph.predicted_allele_nodes = predicted_allele_nodes - asm_graph.allele_node_order = allele_node_order - asm_graph.calculate_coverage() - - # Start drawing assembly graph - asm_graph.begin_draw("%s.%s.%s" % (output_base, base_fname, gene)) - - # Draw assembly graph - begin_y = asm_graph.draw(0, "a. Read alignment") - begin_y += 200 - - # Apply De Bruijn graph - asm_graph.guided_DeBruijn(assembly_verbose) - - # Draw assembly graph - begin_y = asm_graph.draw(begin_y, "b. Asssembly") - begin_y += 200 - - # Draw assembly graph - asm_graph.nodes = asm_graph.nodes2 - asm_graph.to_node, asm_graph.from_node = {}, {} - begin_y = asm_graph.draw(begin_y, "c. Assembly with known alleles") - - # End drawing assembly graph - asm_graph.end_draw() - - # Compare two alleles - if simulation and len(test_Gene_names) == 2: - allele_name1, allele_name2 = test_Gene_names - print >> sys.stderr, allele_name1, "vs.", allele_name2 - asm_graph.print_node_comparison(asm_graph.true_allele_nodes) - - def compare_alleles(vars1, vars2, print_output = True): - skip = True - var_i, var_j = 0, 0 - exon_i = 0 - allele_seq, mismatches = list(ref_seq), 0 - while var_i < len(vars1) and var_j < len(vars2): - cmp_var_id, node_var_id = vars1[var_i], vars2[var_j] - cmp_var, node_var = gene_vars[cmp_var_id], gene_vars[node_var_id] - - min_pos = min(cmp_var[1], node_var[1]) - cmp_var_in_exon, node_var_in_exon = False, False - while exon_i < len(ref_exons): - exon_left, exon_right = ref_exons[exon_i] - if min_pos <= exon_right: - if cmp_var[1] >= exon_left and cmp_var[1] <= exon_right: - cmp_var_in_exon = True - else: - cmp_var_in_exon = False - if node_var[1] >= exon_left and node_var[1] <= exon_right: - node_var_in_exon = True - else: - node_var_in_exon = False - break - exon_i += 1 - - if cmp_var_id == node_var_id: - skip = False - if print_output: - if cmp_var_in_exon: - print >> sys.stderr, "\033[94mexon%d\033[00m" % (exon_i + 1), - print >> sys.stderr, cmp_var_id, cmp_var, "\t\t\t", mpileup[cmp_var[1]] - var_i += 1; var_j += 1 - - var_type, var_pos, var_data = cmp_var - if var_type == "single": - allele_seq[var_pos] = var_data - elif var_type == "deletion": - allele_seq[var_pos:var_pos+int(var_data)] = '.' * int(var_data) - else: - assert var_type == "insertion" - continue - if cmp_var[1] <= node_var[1]: - if not skip: - if (var_i > 0 and var_i + 1 < len(vars1)) or cmp_var[0] != "deletion": - if print_output: - if cmp_var_in_exon: - for f_ in [sys.stderr, report_file]: - print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1), - for f_ in [sys.stderr, report_file]: - print >> f_, "***", cmp_var_id, cmp_var, "==", "\t\t\t", mpileup[cmp_var[1]] - mismatches += 1 - var_i += 1 - else: - if print_output: - if node_var_in_exon: - for f_ in [sys.stderr, report_file]: - print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1), - for f_ in [sys.stderr, report_file]: - print >> f_, "*** ==", node_var_id, node_var, "\t\t\t", mpileup[node_var[1]] - mismatches += 1 - var_j += 1 - - allele_exons = ref_exons[:] - allele_seq = ''.join(allele_seq) - del_counts = [] - for del_i in range(len(allele_seq)): - del_count = 0 if del_i == 0 else del_counts[-1] - if allele_seq[del_i] == '.': - del_count += 1 - del_counts.append(del_count) - for exon_i in range(len(allele_exons)): - exon_left, exon_right = allele_exons[exon_i] - exon_left -= del_counts[exon_left] - exon_right -= del_counts[exon_right] - allele_exons[exon_i] = [exon_left, exon_right] - - allele_seq = allele_seq.replace('.', '') - return allele_seq, allele_exons, mismatches - - tmp_nodes = asm_graph.nodes - print >> sys.stderr, "Number of tmp nodes:", len(tmp_nodes) - count = 0 - for id, node in tmp_nodes.items(): - count += 1 - if count > 10: - break - node_vars = node.get_var_ids() - node.print_info(); print >> sys.stderr - if node.id in asm_graph.to_node: - for id2, at in asm_graph.to_node[node.id]: - print >> sys.stderr, "\tat %d ==> %s" % (at, id2) - - if simulation: - cmp_Gene_names = test_Gene_names - else: - cmp_Gene_names = [allele_name for allele_name, _ in allele_node_order] - - alleles, cmp_vars, max_common = [], [], -sys.maxint - for cmp_Gene_name in cmp_Gene_names: - tmp_vars = allele_nodes[cmp_Gene_name].get_var_ids(node.left, node.right) - tmp_common = len(set(node_vars) & set(tmp_vars)) - tmp_common -= len(set(node_vars) | set(tmp_vars)) - if max_common < tmp_common: - max_common = tmp_common - alleles = [[cmp_Gene_name, tmp_vars]] - elif max_common == tmp_common: - alleles.append([cmp_Gene_name, tmp_vars]) - - for allele_name, cmp_vars in alleles: - for f_ in [sys.stderr, report_file]: - print >> f_, "vs.", allele_name - allele_seq, allele_exons, allele_mm = compare_alleles(cmp_vars, node_vars) - print >> f_, "\t\tallele sequence (%d bps):" % len(allele_seq), allele_seq - print >> f_, "\t\texons (zero-based offset):", allele_exons - - print >> sys.stderr - print >> sys.stderr - - - # Identify alleles that perfectly or closesly match assembled alleles - for node_name, node in asm_graph.nodes.items(): - vars = set(node.get_var_ids()) - - max_allele_names, max_common = [], -sys.maxint - for allele_name, vars2 in allele_vars.items(): - vars2 = set(vars2) - tmp_common = len(vars & vars2) - len(vars | vars2) - if tmp_common > max_common: - max_common = tmp_common - max_allele_names = [allele_name] - elif tmp_common == max_common: - max_allele_names.append(allele_name) - - for f_ in [sys.stderr, report_file]: - print >> f_, "Genomic:", node_name - node_vars = node.get_var_ids() - min_mismatches = sys.maxint - for max_allele_name in max_allele_names: - cmp_vars = allele_vars[max_allele_name] - cmp_vars = sorted(cmp_vars, cmp=lambda a, b: int(a[2:]) - int(b[2:])) - print_output = False - _, _, tmp_mismatches = compare_alleles(cmp_vars, node_vars, print_output) - print >> f_, "\t\t%s:" % max_allele_name, max_common, tmp_mismatches - if tmp_mismatches < min_mismatches: - min_mismatches = tmp_mismatches - if min_mismatches > 0: - print >> f_, "Novel allele" - else: - print >> f_, "Known allele" - - """ - allele_exon_vars = {} - for allele_name, vars in allele_vars.items(): - allele_exon_vars[allele_name] = set(vars) & exon_vars - - for node_name, node in asm_graph.nodes.items(): - vars = [] - for left, right in ref_exons: - vars += node.get_var_ids(left, right) - vars = set(vars) & exon_vars - - max_allele_names, max_common = [], -sys.maxint - for allele_name, vars2 in allele_exon_vars.items(): - tmp_common = len(vars & vars2) - len(vars | vars2) - if tmp_common > max_common: - max_common = tmp_common - max_allele_names = [allele_name] - elif tmp_common == max_common: - max_allele_names.append(allele_name) - - for f_ in [sys.stderr, report_file]: - print >> f_, "Exonic:", node_name - for max_allele_name in max_allele_names: - print >> f_, "\t\t%s:" % max_allele_name, max_common - """ - - if simulation: - success = [False for i in range(len(test_Gene_names))] - found_list = [False for i in range(len(test_Gene_names))] - for prob_i in range(len(Gene_prob)): - prob = Gene_prob[prob_i] - if prob[1] < 0.01: - break - found = False - _allele_rep = prob[0] - """ - if partial and exonic_only: - _fields = _allele_rep.split(':') - if len(_fields) == 4: - _allele_rep = ':'.join(_fields[:-1]) - """ - if simulation: - for name_i in range(len(test_Gene_names)): - test_Gene_name = test_Gene_names[name_i] - if prob[0] == test_Gene_name: - rank_i = prob_i - while rank_i > 0: - if prob == Gene_prob[rank_i - 1][1]: - rank_i -= 1 - else: - break - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_Gene_name, prob[1] * 100.0) - if rank_i < len(success): - success[rank_i] = True - found_list[name_i] = True - found = True - # DK - for debugging purposes - if not False in found_list and prob_i >= 10: - break - if not found: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, _allele_rep, prob[1] * 100.0) - - if best_alleles and prob_i < 2: - for f_ in [sys.stderr, report_file]: - print >> f_, "SingleModel %s (abundance: %.2f%%)" % (_allele_rep, prob[1] * 100.0) - - # DK - debugging purposes - """ - # ref_allele_node_ = create_allele_node("A*03:01:01:01") - ref_allele_node_ = create_allele_node("DQA1*01:02:01:01") - cmp_node_ = create_allele_node(_allele_rep) - count_ = 0 - for i_ in range(len(ref_allele_node_.seq)): - if assembly_graph.get_major_nt(ref_allele_node_.seq[i_]) != assembly_graph.get_major_nt(cmp_node_.seq[i_]): - count_ += 1 - print "\t\t\t\t\tDK:", count_, len(ref_allele_node_.seq) - vars1, vars2 = allele_vars["DQA1*01:02:01:01"], allele_vars[_allele_rep] - print "\t\t\t\t\tDK:", set(vars1) - set(vars2), set(vars2) - set(vars1) - """ - - if not simulation and prob_i >= 9: - break - if prob_i >= 19: - break - print >> sys.stderr - - if simulation and not False in success: - aligner_type = "%s %s" % (aligner, index_type) - if not aligner_type in test_passed: - test_passed[aligner_type] = 1 - else: - test_passed[aligner_type] += 1 - - if not keep_alignment and remove_alignment_file: - os.system("rm %s*" % (alignment_fname)) - - report_file.close() - if simulation: - return test_passed - - -""" -""" -def read_backbone_alleles(genotype_genome, refGene_loci, Genes): - for gene_name in refGene_loci: - allele_name, chr, left, right = refGene_loci[gene_name][:4] - seq_extract_cmd = ["samtools", - "faidx", - "%s.fa" % genotype_genome, - "%s:%d-%d" % (chr, left+1, right+1)] - - length = right - left + 1 - proc = subprocess.Popen(seq_extract_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) - seq = "" - for line in proc.stdout: - line = line.strip() - if line.startswith('>'): - continue - seq += line - assert len(seq) == length - assert gene_name not in Genes - Genes[gene_name] = {} - Genes[gene_name][allele_name] = seq - - -""" -""" -def read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes): - for gene_name in Genes: - # Assert there is only one allele per gene, which is a backbone allele - assert len(Genes[gene_name]) == 1 - backbone_allele_name, backbone_seq = Genes[gene_name].items()[0] - gene_vars, gene_var_list = Vars[gene_name], Var_list[gene_name] - allele_vars = {} - for _, var_id in gene_var_list: - if var_id not in Links: - continue - for allele_name in Links[var_id]: - if allele_name not in allele_vars: - allele_vars[allele_name] = [] - allele_vars[allele_name].append(var_id) - - for allele_name, vars in allele_vars.items(): - seq = "" - prev_pos = 0 - for var_id in vars: - type, pos, data = gene_vars[var_id] - assert prev_pos <= pos - if pos > prev_pos: - seq += backbone_seq[prev_pos:pos] - if type == "single": - prev_pos = pos + 1 - seq += data - elif type == "deletion": - prev_pos = pos + int(data) - else: - assert type == "insertion" - seq += data - prev_pos = pos - if prev_pos < len(backbone_seq): - seq += backbone_seq[prev_pos:] - Genes[gene_name][allele_name] = seq - - if len(Genes[gene_name]) <= 1: - Genes[gene_name]["%s*GRCh38" % gene_name] = backbone_seq - - -""" -""" -def read_Gene_alleles(fname, Genes): - for line in open(fname): - if line.startswith(">"): - allele_name = line.strip().split()[0][1:] - gene_name = allele_name.split('*')[0] - if not gene_name in Genes: - Genes[gene_name] = {} - if not allele_name in Genes[gene_name]: - Genes[gene_name][allele_name] = "" - else: - Genes[gene_name][allele_name] += line.strip() - return Genes - - -""" -""" -def read_Gene_vars(fname): - Vars, Var_list = {}, {} - for line in open(fname): - var_id, var_type, allele, pos, data = line.strip().split('\t') - pos = int(pos) - gene = allele.split('*')[0] - if not gene in Vars: - Vars[gene] = {} - assert not gene in Var_list - Var_list[gene] = [] - - assert not var_id in Vars[gene] - Vars[gene][var_id] = [var_type, pos, data] - Var_list[gene].append([pos, var_id]) - - for gene, in_var_list in Var_list.items(): - Var_list[gene] = sorted(in_var_list) - - return Vars, Var_list - - -""" -""" -def read_Gene_vars_genotype_genome(fname, refGene_loci): - loci = {} - for gene, values in refGene_loci.items(): - allele_name, chr, left, right = values[:4] - if chr not in loci: - loci[chr] = [] - loci[chr].append([allele_name, left, right]) - - Vars, Var_list = {}, {} - for line in open(fname): - var_id, var_type, var_chr, pos, data = line.strip().split('\t') - if var_chr not in loci: - continue - pos = int(pos) - found = False - for allele_name, left, right in loci[var_chr]: - if pos >= left and pos <= right: - found = True - break - if not found: - continue - - gene = allele_name.split('*')[0] - if not gene in Vars: - Vars[gene] = {} - assert not gene in Var_list - Var_list[gene] = [] - - assert not var_id in Vars[gene] - Vars[gene][var_id] = [var_type, pos - left, data] - Var_list[gene].append([pos - left, var_id]) - - for gene, in_var_list in Var_list.items(): - Var_list[gene] = sorted(in_var_list) - - return Vars, Var_list - - -""" -""" -def read_Gene_links(fname): - Links = {} - for line in open(fname): - var_id, alleles = line.strip().split('\t') - alleles = alleles.split() - assert not var_id in Links - Links[var_id] = alleles - - return Links - - -""" -""" -def genotyping_locus(base_fname, - locus_list, - genotype_genome, - only_locus_list, - partial, - aligners, - read_fname, - fastq, - alignment_fname, - threads, - simulate_interval, - read_len, - fragment_len, - best_alleles, - num_editdist, - perbase_errorrate, - perbase_snprate, - skip_fragment_regions, - assembly, - output_base, - error_correction, - keep_alignment, - discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - verbose, - assembly_verbose, - debug_instr): - simulation = (read_fname == [] and alignment_fname == "") - if genotype_genome == "": - if not os.path.exists("hisatgenotype_db"): - typing_common.clone_hisatgenotype_database() - - # Download human genome and HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index() - - # Check if the pre-existing files (hla*) are compatible with the current parameter setting - if genotype_genome != "": - if os.path.exists("%s.locus" % base_fname): - left = 0 - Gene_genes = [] - BACKBONE = False - for line in open("%s.locus" % base_fname): - Gene_name = line.strip().split()[0] - if Gene_name.find("BACKBONE") != -1: - BACKBONE = True - Gene_gene = Gene_name.split('*')[0] - Gene_genes.append(Gene_gene) - delete_hla_files = False - if not BACKBONE: - delete_hla_files = True - if len(locus_list) == 0: - locus_list = Gene_genes - if not set(locus_list).issubset(set(Gene_genes)): - delete_hla_files = True - if delete_hla_files: - os.system("rm %s*" % base_fname) - - # Extract variants, backbone sequence, and other sequeces - if genotype_genome != "": - genome_fnames = [genotype_genome + ".fa", - genotype_genome + ".fa.fai", - genotype_genome + ".locus", - genotype_genome + ".snp", - genotype_genome + ".index.snp", - genotype_genome + ".haplotype", - genotype_genome + ".link", - genotype_genome + ".clnsig", - genotype_genome + ".coord", - genotype_genome + ".allele", - genotype_genome + ".partial"] - for i in range(8): - genome_fnames.append(genotype_genome + ".%d.ht2" % (i+1)) - - if not typing_common.check_files(genome_fnames): - print >> sys.stderr, "Error: some of the following files are not available:", ' '.join(genome_fnames) - sys.exit(1) - else: - typing_common.extract_database_if_not_exists(base_fname, - only_locus_list, - 30, # inter_gap - 50, # intra_gap - partial, - verbose >= 1) - for aligner, index_type in aligners: - typing_common.build_index_if_not_exists(base_fname, - aligner, - index_type, - threads, - verbose >= 1) - - # Read alleles - alleles = set() - if genotype_genome != "": - for line in open("%s.allele" % genotype_genome): - family, allele_name = line.strip().split('\t') - if family == base_fname: - alleles.add(allele_name) - else: - for line in open("%s.allele" % base_fname): - alleles.add(line.strip()) - - # Read partial alleles - partial_alleles = set() - if genotype_genome != "": - for line in open("%s.partial" % genotype_genome): - family, allele_name = line.strip().split('\t') - if family == base_fname: - partial_alleles.add(allele_name) - - else: - for line in open("%s.partial" % base_fname): - partial_alleles.add(line.strip()) - - # Read alleles (names and sequences) - refGenes, refGene_loci = {}, {} - if base_fname == "genome": - for chr, left, right in locus_list: - region_name = "%s:%d-%d" % (chr, left, right) - refGenes[region_name] = region_name - refGene_loci[region_name] = [region_name, chr, left, right, []] - else: - for line in open("%s.locus" % (genotype_genome if genotype_genome != "" else base_fname)): - fields = line.strip().split() - if genotype_genome != "" and base_fname != fields[0].lower(): - continue - if genotype_genome != "": - _, Gene_name, chr, left, right, exon_str, strand = fields - else: - Gene_name, chr, left, right, _, exon_str, strand = fields - Gene_gene = Gene_name.split('*')[0] - assert not Gene_gene in refGenes - refGenes[Gene_gene] = Gene_name - left, right = int(left), int(right) - exons, primary_exons = [], [] - for exon in exon_str.split(','): - primary = exon.endswith('p') - if primary: - exon = exon[:-1] - exon_left, exon_right = exon.split('-') - exon_left, exon_right = int(exon_left), int(exon_right) - exons.append([exon_left, exon_right]) - if primary: - primary_exons.append([exon_left, exon_right]) - refGene_loci[Gene_gene] = [Gene_name, chr, left, right, exons, primary_exons] - Genes = {} - if len(locus_list) == 0: - locus_list = refGene_loci.keys() - - # Read variants, and link information - if genotype_genome: - Vars, Var_list = read_Gene_vars_genotype_genome("%s.snp" % genotype_genome, refGene_loci) - Links = read_Gene_links("%s.link" % genotype_genome) - else: - Vars, Var_list = read_Gene_vars("%s.snp" % base_fname) - Links = read_Gene_links("%s.link" % base_fname) - - # Some loci may have only one allele such as AMELX and AMELY - for gene_name in refGene_loci.keys(): - if gene_name in Vars: - continue - Vars[gene_name], Var_list[gene_name], Links[gene_name] = {}, [], {} - - # Read allele sequences - if genotype_genome != "": - read_backbone_alleles(genotype_genome, refGene_loci, Genes) - read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes) - else: - read_Gene_alleles(base_fname + "_backbone.fa", Genes) - read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes) - - # alleles corresponding to backbones - for allele in alleles: - locus = allele.split('*')[0] - assert locus in Genes - if allele not in Genes[locus]: - Genes[locus][allele] = Genes[locus]["%s*BACKBONE" % locus] - - # Sanity Check - if os.path.exists(base_fname + "_backbone.fa") and \ - os.path.exists(base_fname + "_sequences.fa"): - Genes2 = {} - read_Gene_alleles(base_fname + "_backbone.fa", Genes2) - read_Gene_alleles(base_fname + "_sequences.fa", Genes2) - for gene_name, alleles in Genes.items(): - assert gene_name in Genes2 - for allele_name, allele_seq in alleles.items(): - assert allele_name in Genes2[gene_name] - allele_seq2 = Genes2[gene_name][allele_name] - assert allele_seq == allele_seq2 - - # alleles names - Gene_names = {} - for Gene_gene, data in Genes.items(): - Gene_names[Gene_gene] = list(data.keys()) - - # allele lengths - Gene_lengths = {} - for Gene_gene, Gene_alleles in Genes.items(): - Gene_lengths[Gene_gene] = {} - for allele_name, seq in Gene_alleles.items(): - Gene_lengths[Gene_gene][allele_name] = len(seq) - - # Test typing - test_list = [] - if simulation: - basic_test, pair_test = True, False - if debug_instr and "pair" in debug_instr: - basic_test, pair_test = False, True - - test_passed = {} - test_list = [] - genes = list(set(locus_list) & set(Gene_names.keys())) - if basic_test: - for gene in genes: - Gene_gene_alleles = Gene_names[gene] - for allele in Gene_gene_alleles: - if allele.find("BACKBONE") != -1: - continue - test_list.append([[allele]]) - random.shuffle(test_list) - if pair_test: - test_size = 200 - allele_count = 2 - for test_i in range(test_size): - test_pairs = [] - for gene in genes: - Gene_gene_alleles = [] - - for allele in Gene_names[gene]: - if allele.find("BACKBONE") != -1: - continue - - if "full" in debug: - if allele in partial_alleles: - continue - - Gene_gene_alleles.append(allele) - nums = [i for i in range(len(Gene_gene_alleles))] - random.shuffle(nums) - test_pairs.append(sorted([Gene_gene_alleles[nums[i]] for i in range(allele_count)])) - test_list.append(test_pairs) - - if "test_list" in debug_instr: - test_list = [[debug_instr["test_list"].split('-')]] - - for test_i in range(len(test_list)): - if "test_id" in debug_instr: - test_ids = debug_instr["test_id"].split('-') - if str(test_i + 1) not in test_ids: - continue - - print >> sys.stderr, "Test %d" % (test_i + 1), str(datetime.now()) - test_locus_list = test_list[test_i] - num_frag_list = typing_common.simulate_reads(Genes, - base_fname, - test_locus_list, - Vars, - Links, - simulate_interval, - read_len, - fragment_len, - perbase_errorrate, - perbase_snprate, - skip_fragment_regions) - - assert len(num_frag_list) == len(test_locus_list) - for i_ in range(len(test_locus_list)): - test_Gene_names = test_locus_list[i_] - num_frag_list_i = num_frag_list[i_] - assert len(num_frag_list_i) == len(test_Gene_names) - for j_ in range(len(test_Gene_names)): - test_Gene_name = test_Gene_names[j_] - gene = test_Gene_name.split('*')[0] - test_Gene_seq = Genes[gene][test_Gene_name] - seq_type = "partial" if test_Gene_name in partial_alleles else "full" - print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_Gene_name, len(test_Gene_seq), seq_type, num_frag_list_i[j_]) - - if "single-end" in debug_instr: - read_fname = ["%s_input_1.fa" % base_fname] - else: - read_fname = ["%s_input_1.fa" % base_fname, "%s_input_2.fa" % base_fname] - - fastq = False - tmp_test_passed = typing(simulation, - base_fname, - test_locus_list, - genotype_genome, - partial, - partial_alleles, - refGenes, - Genes, - Gene_names, - Gene_lengths, - refGene_loci, - Vars, - Var_list, - Links, - aligners, - num_editdist, - assembly, - output_base, - error_correction, - keep_alignment, - discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - fastq, - read_fname, - alignment_fname, - num_frag_list, - read_len, - fragment_len, - threads, - best_alleles, - verbose, - assembly_verbose) - - for aligner_type, passed in tmp_test_passed.items(): - if aligner_type in test_passed: - test_passed[aligner_type] += passed - else: - test_passed[aligner_type] = passed - - print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1))) - - - for aligner_type, passed in test_passed.items(): - print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list)) - - else: # With real reads or BAMs - if base_fname == "genome": - print >> sys.stderr, "\t", locus_list - else: - print >> sys.stderr, "\t", ' '.join(locus_list) - typing(simulation, - base_fname, - locus_list, - genotype_genome, - partial, - partial_alleles, - refGenes, - Genes, - Gene_names, - Gene_lengths, - refGene_loci, - Vars, - Var_list, - Links, - aligners, - num_editdist, - assembly, - output_base, - error_correction, - keep_alignment, - discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - fastq, - read_fname, - alignment_fname, - [], - read_len, - fragment_len, - threads, - best_alleles, - verbose, - assembly_verbose) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='hisatgenotype_locus') - parser.add_argument("--base", "--base-fname", - dest="base_fname", - type=str, - default="hla", - help="base filename for backbone sequence, variants, and linking info (default: hla)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="A comma-separated list of genes (default: empty, all genes)") - parser.add_argument("--genotype-genome", - dest="genotype_genome", - type=str, - default="", - help="Base name for genotype genome, which the program will use instead of region-based small indexes (default: empty)") - parser.add_argument("-f", "--fasta", - dest='fastq', - action='store_false', - help='FASTA format') - parser.add_argument("-U", - dest="read_fname_U", - type=str, - default="", - help="filename for single-end reads") - parser.add_argument("-1", - dest="read_fname_1", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("-2", - dest="read_fname_2", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("--alignment", - dest="alignment_fname", - type=str, - default="", - help="BAM file name") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument('--no-partial', - dest='partial', - action='store_false', - help='Include partial alleles (e.g. A_nuc.fasta)') - parser.add_argument("--aligner-list", - dest="aligners", - type=str, - default="hisat2.graph", - help="A comma-separated list of aligners such as hisat2.graph,hisat2.linear,bowtie2.linear (default: hisat2.graph)") - parser.add_argument("--simulate-interval", - dest="simulate_interval", - type=int, - default=10, - help="Reads simulated at every these base pairs (default: 10)") - parser.add_argument("--read-len", - dest="read_len", - type=int, - default=100, - help="Length of simulated reads (default: 100)") - parser.add_argument("--fragment-len", - dest="fragment_len", - type=int, - default=350, - help="Length of fragments (default: 350)") - parser.add_argument("--best-alleles", - dest="best_alleles", - action='store_true', - help="") - parser.add_argument("--random-seed", - dest="random_seed", - type=int, - default=1, - help="A seeding number for randomness (default: 1)") - parser.add_argument("--num-editdist", - dest="num_editdist", - type=int, - default=2, - help="Maximum number of mismatches per read alignment to be considered (default: 2)") - parser.add_argument("--perbase-errorrate", - dest="perbase_errorrate", - type=float, - default=0.0, - help="Per basepair error rate in percentage when simulating reads (default: 0.0)") - parser.add_argument("--perbase-snprate", - dest="perbase_snprate", - type=float, - default=0.0, - help="Per basepair SNP rate in percentage when simulating reads (default: 0.0)") - parser.add_argument("--skip-fragment-regions", - dest="skip_fragment_regions", - type=str, - default="", - help="A comma-separated list of regions from which no reads originate, e.g., 500-600,1200-1400 (default: None).") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument('--verbose-level', - dest='verbose_level', - type=int, - default=0, - help='also print some statistics to stderr (default: 0)') - parser.add_argument("--debug", - dest="debug", - type=str, - default="", - help="e.g., test_id:10,read_id:10000,basic_test") - parser.add_argument("--output-base", "--assembly-base", - dest="output_base", - type=str, - default="assembly_graph", - help="base file name (default: assembly_graph)") - parser.add_argument("--assembly", - dest="assembly", - action="store_true", - help="Perform assembly") - parser.add_argument("--no-error-correction", - dest="error_correction", - action="store_false", - help="Correct sequencing errors") - parser.add_argument("--keep-alignment", - dest="keep_alignment", - action="store_true", - help="Keep alignment file") - parser.add_argument("--only-locus-list", - dest="only_locus_list", - type=str, - default="", - help="A comma-separated list of genes (default: empty, all genes)") - parser.add_argument("--discordant", - dest="discordant", - action="store_true", - help="Allow discordantly mapped pairs or singletons") - parser.add_argument("--type-primary-exons", - dest="type_primary_exons", - action="store_true", - help="Look at primary exons first") - parser.add_argument("--keep-low-abundance-alleles", - dest="remove_low_abundance_alleles", - action="store_false", - help="Do not remove alleles with low abundance while performing typing") - parser.add_argument("--assembly-verbose", - dest="assembly_verbose", - action="store_true", - help="Output intermediate assembly information") - parser.add_argument("--display-alleles", - dest="display_alleles", - type=str, - default="", - help="A comma-separated list of alleles to display in HTML (default: empty)") - - args = parser.parse_args() - if args.locus_list == "": - locus_list = [] - else: - locus_list = args.locus_list.split(',') - if args.base_fname == "genome": - assert ':' in args.locus_list - for i in range(len(locus_list)): - assert ':' in locus_list[i] and '-' in locus_list[i] - chr, coord = locus_list[i].split(':') - left, right = coord.split('-') - locus_list[i] = [chr, int(left), int(right)] - - if args.only_locus_list == "": - only_locus_list = [] - else: - locus_list = only_locus_list = args.only_locus_list.split(',') - if args.aligners == "": - print >> sys.stderr, "Error: --aligners must be non-empty." - sys.exit(1) - args.aligners = args.aligners.split(',') - for i in range(len(args.aligners)): - args.aligners[i] = args.aligners[i].split('.') - if args.read_fname_U != "": - args.read_fname = [args.read_fname_U] - elif args.read_fname_1 != "" or args.read_fname_2 != "": - if args.read_fname_1 == "" or args.read_fname_2 == "": - print >> sys.stderr, "Error: please specify both -1 and -2." - sys.exit(1) - args.read_fname = [args.read_fname_1, args.read_fname_2] - else: - args.read_fname = [] - if args.alignment_fname != "" and \ - not os.path.exists(args.alignment_fname): - print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname - sys.exit(1) - - if args.verbose and args.verbose_level == 0: - args.verbose_level = 1 - - debug = {} - if args.debug != "": - for item in args.debug.split(','): - if ':' in item: - fields = item.split(':') - assert len(fields) >= 2 - key, value = fields[0], ':'.join(fields[1:]) - debug[key] = value - else: - debug[item] = 1 - - if not args.partial: - print >> sys.stderr, "Warning: --no-partial should be used for debugging purpose only." - - if args.read_len * 2 > args.fragment_len: - print >> sys.stderr, "Warning: fragment might be too short (%d)" % (args.fragment_len) - - skip_fragment_regions = [] - if args.skip_fragment_regions != "": - prev_left, prev_right = -1, -1 - for region in args.skip_fragment_regions.split(','): - left, right = region.split('-') - left, right = int(left), int(right) - assert left < right - assert prev_right < left - prev_left, prev_right = left, right - skip_fragment_regions.append([left, right]) - - if args.display_alleles == "": - display_alleles = [] - else: - display_alleles = args.display_alleles.split(',') - - random.seed(args.random_seed) - genotyping_locus(args.base_fname, - locus_list, - args.genotype_genome, - only_locus_list, - args.partial, - args.aligners, - args.read_fname, - args.fastq, - args.alignment_fname, - args.threads, - args.simulate_interval, - args.read_len, - args.fragment_len, - args.best_alleles, - args.num_editdist, - args.perbase_errorrate, - args.perbase_snprate, - skip_fragment_regions, - args.assembly, - args.output_base, - args.error_correction, - args.keep_alignment, - args.discordant, - args.type_primary_exons, - args.remove_low_abundance_alleles, - display_alleles, - args.verbose_level, - args.assembly_verbose, - debug) - diff --git a/hisatgenotype_modules/__init__.py b/hisatgenotype_modules/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/hisatgenotype_modules/hisatgenotype_assembly_graph.py b/hisatgenotype_modules/hisatgenotype_assembly_graph.py deleted file mode 100755 index 16794f40..00000000 --- a/hisatgenotype_modules/hisatgenotype_assembly_graph.py +++ /dev/null @@ -1,1902 +0,0 @@ -#!/usr/bin/env python - -import sys -import math, random -from datetime import datetime, date, time -from collections import deque -from copy import deepcopy - - -# -def get_major_nt(nt_dic): - nt = '' - max_count = 0 - for tmp_nt, tmp_value in nt_dic.items(): - tmp_count, tmp_var_id = tmp_value - if len(tmp_nt) == 1: - assert tmp_nt in "ACGTDN" - else: - assert len(tmp_nt) == 2 and tmp_nt[0] == 'I' and tmp_nt[1] in "ACGT" - if tmp_count > max_count: - max_count = tmp_count - nt = tmp_nt - if len(nt) == 1: - assert nt in "ACGTDN" - else: - assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT" - return nt - - -# -def match_score(nt_dic1, nt_dic2): - sum_1 = sum([count for count, _ in nt_dic1.values()]) - sum_2 = sum([count for count, _ in nt_dic2.values()]) - total1, total2 = sum_1 * 2.0, sum_2 * 2.0 - best = 0.0 - for nt in "ACGT": - if nt not in nt_dic1 or nt not in nt_dic2: - continue - tmp_best = nt_dic1[nt][0] / total1 + nt_dic2[nt][0] / total2 - if tmp_best > best: - best = tmp_best - return best - - -# -def get_ungapped_seq(seq): - ungapped_seq = [] - for i in range(len(seq)): - nt_dic = seq[i] - nt = get_major_nt(nt_dic) - if nt == 'D': - continue - ungapped_seq.append(nt_dic) - return ungapped_seq - - -# -def get_ungapped_seq_pos(seq, pos): - tot_del_len, tot_ins_len = 0, 0 - for i in range(len(seq)): - nt_dic = seq[i] - nt = get_major_nt(nt_dic) - if nt == 'D': - tot_del_len += 1 - elif nt[0] == 'I': - tot_ins_len += 1 - if i - tot_ins_len == pos: - return pos - tot_del_len - return -1 - - -# Get mate node id -# HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|L to HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|R or vice versa -def get_mate_node_id(node_id): - node_id2, end = node_id.split('|') - if end == 'L': - end = 'R' - else: - end = 'L' - node_id2 = '|'.join([node_id2, end]) - return node_id2 - - - -class Node: - # Initialize - def __init__(self, - id, - left, - seq, - qual, - var, - ref_seq, - ref_vars, - mpileup, - simulation): - self.next = [] # list of next nodes - - if simulation: - id = id.split('_')[0] - self.id = id # Node ID - self.left = left # starting position - - # sequence that node represents - # with information about how the sequence is related to backbone - assert len(seq) == len(var) - assert len(seq) == len(qual) - self.seq = [] - self.ins_len = 0 - for s in range(len(seq)): - nt = seq[s] - if len(nt) == 1: - assert nt in "ACGTDN" - else: - assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT" - self.ins_len += 1 - var_id = var[s] - self.seq.append({nt : [1, var_id]}) - self.qual = [] - for q in qual: - if q != '': - self.qual.append(max(0, ord(q) / 10 - 3)) - else: - self.qual.append(0) - - self.right = self.left + len(seq) - 1 - self.ins_len - - self.read_ids = set([id]) - self.mate_ids = set([id.split('|')[0]]) - - self.calculate_avg_cov() - - self.ref_seq = ref_seq - self.ref_vars = ref_vars - - self.mpileup = mpileup - - - # Check how compatible allele is in regard to read or pair - def compatible_with_rnode(self, rnode): - assert False - assert rnode.left + len(rnode.seq) <= len(self.seq) - score = 0 - for i in range(len(rnode.seq)): - allele_bp = self.seq[rnode.left + i] - read_bp = rnode.seq[i] - if allele_bp == read_bp: - score += 1 - - return float(score) / len(rnode.seq) - - - # Check how nodes overlap with each other without considering deletions - def overlap_with(self, other, vars, skipN = False, debug = False): - assert self.left <= other.left - if self.right < other.left: - return -1, -1 - seq = get_ungapped_seq(self.seq) - other_seq = get_ungapped_seq(other.seq) - add_mm = len(self.mate_ids & other.mate_ids) - i_left = get_ungapped_seq_pos(self.seq, other.left - self.left) - for i in range(i_left - 5, i_left + 6): - max_mm = 0.012 * (len(seq) - i) # 1 mismatch per 83 bases - tmp_mm = 0.0 - for j in range(len(other_seq)): - if i + j >= len(seq): - break - nt_dic, other_nt_dic = seq[i+j], other_seq[j] - nt, other_nt = get_major_nt(nt_dic), get_major_nt(other_nt_dic) - mismatch = 0.0 - if skipN and (nt == 'N' or other_nt == 'N'): - mismatch = 0.0 - elif nt != other_nt: - mismatch = 1.0 - match_score(seq[i+j], other_seq[j]) - - # Higher penalty for mismatches in variants - nt_var, other_nt_var = nt_dic[nt][1], other_nt_dic[other_nt][1] - if nt_var != other_nt_var: - mismatch = 5.0 - adjust = min(1.0, nt_dic[nt][0] / self.get_avg_cov()) * \ - min(1.0, other_nt_dic[other_nt][0] / other.get_avg_cov()) - mismatch *= adjust - if mismatch < 1.0: - mismatch = 1.0 - - assert mismatch >= 0.0 - tmp_mm += mismatch - if tmp_mm > max_mm: - break - - if debug: - print "at %d (%d) with overlap of %d and mismatch of %.2f" % (i, self.left + i, j, tmp_mm) - - if tmp_mm <= max_mm: - return i, min(len(seq) - i, len(other_seq)), tmp_mm - - return -1, -1, sys.maxint - - - # Combine two nodes with considering deletions - def combine_with(self, other): - # DK - debugging purposes - if self.left > other.left: - self.print_info() - other.print_info() - return - - assert self.left <= other.left - - # Merge two sequences - assert len(other.seq) > 0 and 'D' not in other.seq[0].keys() - j = 0 - # Merge the overlapped parts - if self.right >= other.left: - overlap, ins_len = False, 0 - for i in range(len(self.seq)): - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt.startswith('I'): - ins_len += 1 - if i == other.left - self.left + ins_len: - overlap = True - break - assert overlap - new_seq = self.seq[:i] - while i < len(self.seq) and j < len(other.seq): - nt_dic, nt_dic2 = self.seq[i], other.seq[j] - for nt, value in nt_dic2.items(): - count, var_id = value - if nt in nt_dic: - nt_dic[nt][0] += count - # if nt != 'D': - # assert nt_dic[nt][1] == var_id - else: - nt_dic[nt] = [count, var_id] - new_seq.append(nt_dic) - i += 1 - j += 1 - # this node contains the other node - if i < len(self.seq): - new_seq += self.seq[i:] - # Fill in the gap between the two nodes if exists - else: - new_seq = self.seq[:] - sum_1 = sum([count for count, _ in self.seq[-1].values()]) - sum_2 = sum([count for count, _ in other.seq[0].values()]) - flank_cov = (sum_1 + sum_2) / 2.0 - for k in range(other.left - self.right - 1): - ref_nt_dic = self.mpileup[k + 1 + self.right][1] - nt_dic = {} - # Fill in the gap with Ns for now - if len(ref_nt_dic) == 0 or True: - nt_dic = {'N' : [1, ""]} - else: - weight = flank_cov / max(1.0, sum([count for count, _ in ref_nt_dic.values()])) - for nt, value in ref_nt_dic.items(): - count, var_id = value - nt_dic[nt] = [count * weight, var_id] - new_seq.append(nt_dic) - - # Append the rest of the other sequence to it - if j < len(other.seq): - new_seq += deepcopy(other.seq[j:]) - self.read_ids |= other.read_ids - self.mate_ids |= other.mate_ids - - self.seq = new_seq - self.ins_len = 0 - for i in range(len(self.seq)): - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt[0] == 'I': - self.ins_len += 1 - self.right = self.left + len(self.seq) - 1 - self.ins_len - - # Update coverage - self.calculate_avg_cov() - - - # Return the length of the ungapped sequence - def ungapped_length(self): - return len(get_ungapped_seq(self.seq)) - - - # Contains Ns? - def contain_Ns(self): - for i in range(len(self.seq)): - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt == 'N': - return True - return False - - - # Get variant ids - def get_var_ids(self, left = 0, right = sys.maxint): - vars = [] - left = max(left, self.left) - right = min(right, self.right) - ins_len = 0 - for pos in range(left, right + 1): - var_i = pos - self.left + ins_len - while var_i < len(self.seq): - nt_dic = self.seq[var_i] - nt = get_major_nt(nt_dic) - if nt.startswith('I'): - var_i += 1 - ins_len += 1 - else: - break - for _, var in nt_dic.values(): - if var == "" or \ - var == "unknown": - continue - assert var in self.ref_vars - if len(vars) > 0 and var == vars[-1]: - continue - type, pos, data = self.ref_vars[var] - if (type == "single" and data == nt) or \ - (type == "deletion" and nt == 'D') or \ - (type == "insertion" and len(nt) == 2 and nt[1] == data): - vars.append(var) - - return vars - - - # Get variant ids - # left and right are gene-level coordinates - def get_vars(self, left = 0, right = sys.maxint): - vars = [] - left = max(left, self.left) - right = min(right, self.right) - skip_pos = -1 - ins_len = 0 - for pos in range(left, right + 1): - if pos <= skip_pos: - continue - var_i = pos - self.left + ins_len - while var_i < len(self.seq): - nt_dic = self.seq[var_i] - nt = get_major_nt(nt_dic) - if nt.startswith('I'): - var_i += 1 - ins_len += 1 - var = nt_dic[nt][1] - if len(vars) > 0 and var != vars[-1][0]: - vars.append([var, pos]) - else: - break - if nt == self.ref_seq[pos]: - continue - if nt == 'N': - vars.append(["gap", pos]) - continue - added = False - for _, var in nt_dic.values(): - if var == "" or \ - var == "unknown": - continue - if len(vars) > 0 and var == vars[-1][0]: - continue - assert var in self.ref_vars - type, var_pos, data = self.ref_vars[var] - if data == nt or (type == "deletion" and nt == 'D'): - assert pos + ins_len >= var_pos - if type == "deletion" and pos > var_pos: - continue - if type == "deletion": - skip_pos = pos + int(data) - 1 - added = True - vars.append([var, pos]) - if not added and "unknown" in [var_id for _, var_id in nt_dic.values()]: - vars.append(["unknown", pos]) - - return vars - - - # Get average coverage - def get_avg_cov(self): - return self.avg - - - # Calculate average coverage - def calculate_avg_cov(self): - self.avg = 0.0 - for nt_dic in self.seq: - for count, _ in nt_dic.values(): - self.avg += count - self.avg /= len(self.seq) - return self.avg - - - # Display node information - def print_info(self, output=sys.stderr): - seq, var_str = "", "" - prev_var = "" - ins_len = 0 - for i in range(len(self.seq)): - if (self.left + i - ins_len) % 100 == 0: - seq += ("|%d|" % (self.left + i - ins_len)) - elif (self.left + i - ins_len) % 20 == 0: - seq += '|' - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt[0] == 'I': - seq += "\033[93m" - elif nt != self.ref_seq[self.left + i - ins_len]: - var_id = nt_dic[nt][1] - if var_id == "unknown" or var_id.startswith("nv"): - seq += "\033[91m" # red - else: - seq += "\033[94m" # blue - if nt[0] == 'I': - seq += nt[1] - else: - seq += nt - if nt[0] == 'I' or nt != self.ref_seq[self.left + i - ins_len]: - seq += "\033[00m" - - var = [] - for _, var_id in nt_dic.values(): - if var_id == "": - continue - var.append(var_id) - var = '-'.join(var) - if var != "" and var != prev_var: - var_str += "\t%d: %s %s" % (self.left + i - ins_len, var, str(nt_dic)) - prev_var = var - if nt[0] == 'I': - ins_len += 1 - - print >> output, "Node ID:", self.id - print >> output, "Pos: [%d, %d], Avg. coverage: %.1f" % (self.left, self.right, self.get_avg_cov()) - print >> output, "\t", seq - print >> output, "\t", var_str - print >> output, "mates:", len(self.mate_ids) # sorted(self.mate_ids) - print >> output, "reads:", len(self.read_ids) # sorted(self.read_ids) - print >> output - - -class Graph: - def __init__(self, - backbone, - gene_vars, - exons, - primary_exons, - partial_allele_ids, - true_allele_nodes = {}, - predicted_allele_nodes = {}, - display_allele_nodes = {}, - simulation = False): - self.backbone = backbone # backbone sequence - self.gene_vars = gene_vars - self.exons = exons - self.primary_exons = primary_exons - self.partial_allele_ids = partial_allele_ids - self.true_allele_nodes = true_allele_nodes - self.predicted_allele_nodes = predicted_allele_nodes - self.allele_node_order = [] - self.display_allele_nodes = display_allele_nodes - self.simulation = simulation - - self.read_nodes = self.nodes = {} - self.other_nodes = {} - self.edges = {} - self.to_node, self.from_node = {}, {} - - self.left_margin = 350 - self.right_margin = 20 - self.top_margin = 20 - self.bottom_margin = 20 - - self.scalex, self.scaley = 5, 2 - self.width = len(self.backbone) * self.scalex + self.left_margin + self.right_margin - self.unscaled_height = 6000 - self.height = self.unscaled_height * self.scaley - self.coverage = {} - - - # Add node, which is an alignment w.r.t. the reference - def add_node(self, id, id_i, node, simulation = False): - if simulation: - id = id.split('_')[0] - - if id_i == 0: - if id in self.nodes: - print >> sys.stderr, "Warning) multi-mapped read:", id - # assert False - return - assert id not in self.nodes - self.nodes[id] = node - else: - if id not in self.other_nodes: - self.other_nodes[id] = [] - self.other_nodes[id].append(node) - - - # Remove nodes that are inside other nodes or with low coverage - def remove_nodes(self, nodes): - delete_ids = set() - node_list = [[id, node.left, node.right] for id, node in nodes.items()] - def node_cmp(a, b): - if a[2] != b[2]: - return a[2] - b[2] - else: - return a[1] - b[1] - node_list = sorted(node_list, cmp=node_cmp) - for n in range(len(node_list)): - id, left, right = node_list[n] - node = nodes[id] - i = n - 1 - while i >= 0: - id2, left2, right2 = node_list[i] - if right2 < left: - break - node2 = nodes[id2] - if left <= left2 and right2 <= right: - at, overlap, mm = node.overlap_with(node2, self.gene_vars) - - # DK - debugging purposes - """ - print node.id, "vs.", node2.id - print "at %d: overlap of %d with %d mismatches (mult: %.2f)" % \ - (at, overlap, mm, mult) - """ - if mm < 1.0: - mult = overlap / float(max(right - left, right2 - left2)) - if node2.get_avg_cov() * mult * 10 < node.get_avg_cov(): - delete_ids.add(id2) - elif left == left2 and right == right2: - delete_ids.add(id) - elif overlap > 0: - if node2.get_avg_cov() * 10 < node.get_avg_cov(): - delete_ids.add(id2) - elif node.get_avg_cov() * 10 < node2.get_avg_cov(): - delete_ids.add(id) - i -= 1 - - for delete_id in delete_ids: - del nodes[delete_id] - - - # - # - def guided_DeBruijn(self, - print_msg = False): - assert len(self.nodes) > 0 - k = 60 # k-mer - - DRB1_debug = False - - node_seq = {} - def add_node_seq(node_seq, id): - nodes = [self.nodes[id]] - if id in self.other_nodes: - nodes += self.other_nodes[id] - for node_i in range(len(nodes)): - node = nodes[node_i] - s, seq = 0, [] - while s < len(node.seq): - nt_dic = node.seq[s] # {'C': [1, '']} - nt = get_major_nt(nt_dic) - if nt in "ACGTND": - seq.append(nt) - else: - assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT" - s += 1 - - if len(seq) < k: - continue - - def leftshift(seq, ref_seq): - seq_len = len(seq) - assert seq_len > 0 and seq[0] != 'D' - - bp_i = 0 - while bp_i < seq_len: - bp = seq[bp_i] - if bp != 'D': - bp_i += 1 - continue - bp_j = bp_i + 1 - while bp_j < seq_len: - bp2 = seq[bp_j] - if bp2 != 'D': - break - else: - bp_j += 1 - - if bp_j >= seq_len: - bp_i = bp_j - break - - prev_i, prev_j = bp_i, bp_j - while bp_i > 0 and seq[bp_i-1] in "ACGT" and ref_seq[bp_j-1] in "ACGT": - if seq[bp_i-1] != ref_seq[bp_j-1]: - break - seq[bp_j-1] = seq[bp_i-1] - seq[bp_i-1] = 'D' - bp_i -= 1 - bp_j -= 1 - bp_i = bp_j - while bp_i < seq_len: - if seq[bp_i] in "ACGT": - break - bp_i += 1 - - if DRB1_debug: - leftshift(seq, self.backbone[node.left:node.left + len(seq)]) - node_seq["%s.%d" % (id, node_i)] = seq - - for id in self.nodes.keys(): - add_node_seq(node_seq, id) - - # AAA.1 => AAA, 1 - def get_id_and_sub(id): - id_split = id.split('.') - return '.'.join(id_split[:-1]), int(id_split[-1]) - - try_hard = False - while True: - delete_ids = set() - nodes = [] - for id, node in self.nodes.items(): - nodes_ = [node] - if id in self.other_nodes: - nodes_ += self.other_nodes[id] - for node_i in range(len(nodes_)): - node = nodes_[node_i] - id_ = "%s.%d" % (id, node_i) - if id_ not in node_seq: - continue - seq = node_seq[id_] - - if len(seq) < k or \ - 'N' in seq: - continue - kmer, seq = seq[:k], seq[k:] - nodes.append([id_, node.left, node.right, kmer, seq]) - - def node_cmp(a, b): - if a[1] != b[1]: - return a[1] - b[1] - else: - return a[2] - b[2] - nodes = sorted(nodes, cmp=node_cmp) - - # Generate numerical read IDs - id_to_num = {} - num_to_id = [] - for id in [node[0] for node in nodes]: - id_to_num[id] = len(id_to_num) - num_to_id.append(id) - - # Construct De Bruijn graph with 60-mer - self.debruijn = debruijn = [[] for i in range(len(self.backbone) - k + 1)] - min_n = 0 - for pos in range(len(debruijn)): - for n in range(min_n, len(nodes)): - id, node_pos, node_right, kmer, seq = nodes[n] - if node_pos < pos: - min_n = n + 1 - continue - elif node_pos > pos: - break - - assert len(kmer) == k - - # Add a new node or update the De Bruijn graph - curr_vertices = debruijn[pos] - found = False - kmer_seq = ''.join(kmer) - for v in range(len(curr_vertices)): - cmp_nt, cmp_k_m1_mer = curr_vertices[v][:2] - if kmer_seq == cmp_k_m1_mer + cmp_nt: - curr_vertices[v][3].append(n) - found = True - break - - if not found: - predecessors = [] - if pos > 0: - prev_vertices = debruijn[pos - 1] - for v in range(len(prev_vertices)): - cmp_nt, cmp_k_m1_mer = prev_vertices[v][:2] - if kmer_seq[:-1] == cmp_k_m1_mer[1:] + cmp_nt: - predecessors.append(v) - debruijn[pos].append([kmer_seq[-1], # base - ''.join(kmer_seq[:-1]), # (k-1)-mer - predecessors, # predecessors - [n]]) # numeric read IDs - - # Update k-mer - if len(seq) > 0: - kmer, seq = kmer[1:] + seq[:1], seq[1:] - nodes[n] = [id, node_pos + 1, node_right, kmer, seq] - - # Average number of kmers - total_kmers = 0 - for pos in range(len(debruijn)): - vertices = debruijn[pos] - for _, _, _, num_ids in vertices: - total_kmers += len(num_ids) - avg_kmers = float(total_kmers) / len(debruijn) - - # Filter out reads - for pos in range(len(debruijn)): - vertices = debruijn[pos] - num_vertices = 0 - num_kmers = 0 - for v in range(len(vertices)): - _, _, predecessors, num_ids = vertices[v] - if not (set(num_ids) <= delete_ids): - num_vertices += 1 - if DRB1_debug: - num_kmers = len(set(num_ids) - delete_ids) - if num_vertices <= 1: - if DRB1_debug: - if pos > 300 and pos + 300 < len(debruijn): - if num_vertices == 1 and num_kmers * 8 < avg_kmers: - for _, _, _, num_ids in vertices: - delete_ids |= set(num_ids) - continue - - vertice_count = [0] * len(vertices) - for v in range(len(vertices)): - _, _, predecessors, num_ids = vertices[v] - for num_id in num_ids: - if num_id in delete_ids: - continue - read_id = get_id_and_sub(num_to_id[num_id])[0] - if read_id in self.other_nodes: - continue - mate_read_id = get_mate_node_id(read_id) - if mate_read_id in self.nodes: - vertice_count[v] += 1 - - # First look at and remove reads that are multi-aligned locally - first_pair = None - for v in range(len(vertices)): - read_ids = set([get_id_and_sub(num_to_id[num_id])[0] for num_id in vertices[v][3]]) - for v2 in range(v + 1, len(vertices)): - read_ids2 = set([get_id_and_sub(num_to_id[num_id])[0] for num_id in vertices[v2][3]]) - if read_ids & read_ids2: - first_pair = [v, v2, read_ids & read_ids2] - break - - debug_msg = False - if debug_msg: - print >> sys.stderr, "at", pos, vertices - print >> sys.stderr, "count:", vertice_count - - if try_hard: - vertice_with_id = [[vertice_count[v], v] for v in range(len(vertice_count))] - vertice_with_id = sorted(vertice_with_id, key=lambda a: a[0]) - for v in range(len(vertice_count) - 2): - v = vertice_with_id[v][1] - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - if debug_msg: - print >> sys.stderr, v, "is removed with", num_ids - else: - if first_pair: - v, v2, multi_read_ids = first_pair - v_ = v if vertice_count[v] < vertice_count[v2] else v2 - for num_id in vertices[v_][3]: - id = get_id_and_sub(num_to_id[num_id])[0] - if id in multi_read_ids: - delete_ids.add(num_id) - else: - assert len(vertices) >= 2 - relative_avg = (sum(vertice_count) - vertice_count[v]) / float(len(vertice_count) - 1) - if len(vertices) == 2: - for v in range(len(vertices)): - # Eliminate reads that have conflicts with other reads due to a deletion - if vertice_count[v] * 2 < relative_avg: - nt, kmer, _, num_ids = vertices[1-v] - if nt == 'D': - num_id = num_ids[0] - id_sub = num_to_id[num_id] - id, sub = get_id_and_sub(id_sub) - if sub == 0: - left = pos - self.nodes[id].left - else: - left = pos - self.other_nodes[id][sub - 1].left - seq = node_seq[id_sub] - seq_right = ''.join(seq[left+k:]) - seq_right = seq_right.replace('D', '') - success = True - for num_id2 in vertices[v][3]: - id_sub2 = num_to_id[num_id2] - id2, sub2 = get_id_and_sub(id_sub2) - if sub2 == 0: - left2 = pos - self.nodes[id2].left - else: - left2 = pos - self.other_nodes[id2][sub2 - 1].left - seq2 = node_seq[id_sub2] - seq2_right = ''.join(seq2[left2+k:]) - if seq_right.find(seq2_right) != 0: - success = False - break - if success: - delete_ids |= set(vertices[v][3]) - - # DK - working on ... - if DRB1_debug: - if vertice_count[v] * 8 < relative_avg: - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - if debug_msg: - print >> sys.stderr, v, "is removed with", num_ids - elif vertice_count[v] * 8 < avg_kmers: - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - else: - second2last = sorted(vertice_count)[1] - for v in range(len(vertices)): - # if vertice_count[v] * 3 < relative_avg: - if vertice_count[v] < second2last: - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - if debug_msg: - print >> sys.stderr, v, "is removed with", num_ids - - if debug_msg: - print >> sys.stderr - print >> sys.stderr - - # delete nodes - ids_to_be_updated = set() - for num_id in delete_ids: - id_sub = num_to_id[num_id] - id, sub = get_id_and_sub(id_sub) - ids_to_be_updated.add(id) - if sub == 0: - self.nodes[id] = None - else: - self.other_nodes[id][sub-1] = None - - for id in self.nodes.keys(): - other_nodes = [] - if id in self.other_nodes: - for other_node in self.other_nodes[id]: - if other_node != None: - other_nodes.append(other_node) - if self.nodes[id] == None: - if len(other_nodes) == 0: - del self.nodes[id] - else: - self.nodes[id] = other_nodes[0] - del other_nodes[0] - if id in self.other_nodes: - if len(other_nodes) == 0: - del self.other_nodes[id] - else: - self.other_nodes[id] = other_nodes - - for id in ids_to_be_updated: - if id in self.nodes: - add_node_seq(node_seq, id) - - if len(delete_ids) == 0: - if try_hard: - break - else: - try_hard = True - - # Print De Bruijn graph - for i in range(len(debruijn)): - curr_vertices = debruijn[i] - if len(curr_vertices) == 0: - continue - consensus_seq = [{} for j in range(k)] - for v in range(len(curr_vertices)): - nt, k_m1_mer = curr_vertices[v][:2] - kmer = k_m1_mer + nt - assert len(kmer) == k - for j in range(k): - nt = kmer[j] - if nt not in consensus_seq[j]: - consensus_seq[j][nt] = 1 - else: - consensus_seq[j][nt] += 1 - - if print_msg: print >> sys.stderr, i - for v in range(len(curr_vertices)): - nt, k_m1_mer, predecessors, num_ids = curr_vertices[v] - kmer = k_m1_mer + nt - kmer_seq = "" - for j in range(k): - nt = kmer[j] - if len(consensus_seq[j]) >= 2: - kmer_seq += "\033[94m" - kmer_seq += nt - if len(consensus_seq[j]) >= 2: - kmer_seq += "\033[00m" - - if print_msg: print >> sys.stderr, "\t%d:" % v, kmer_seq, len(num_ids), predecessors, num_ids - - id_to_num = {} - for num in range(len(num_to_id)): - id_sub = num_to_id[num] - id = get_id_and_sub(id_sub)[0] - num_to_id[num] = id - if id not in id_to_num: - id_to_num[id] = set() - id_to_num[id].add(num) - - # Generate compressed nodes - paths = [] - path_queue, done = deque(), set() - for i in range(len(debruijn)): - if len(debruijn[i]) == 0: - continue - for i2 in range(len(debruijn[i])): - path_queue.append("%d-%d" % (i, i2)) - break - - while len(path_queue) > 0: - i_str = path_queue.popleft() - if i_str in done: - continue - - i, i2 = i_str.split('-') - i, i2 = int(i), int(i2) - num_ids = debruijn[i][i2][3] - j = i + 1 - while j < len(debruijn): - merge, branch = len(debruijn[j-1]) > len(debruijn[j]), len(debruijn[j-1]) < len(debruijn[j]) - new_i2 = -1 - tmp_num_ids = [] - found = False - for j2 in range(len(debruijn[j])): - _, _, predecessors, add_read_ids = debruijn[j][j2] - if len(predecessors) == 0: - branch = True - path_queue.append("%d-%d" % (j, j2)) - elif i2 in predecessors: - found = True - # merge into one node - if len(predecessors) > 1: - merge = True - if new_i2 >= 0: - branch = True - new_i2 = j2 - tmp_num_ids += add_read_ids - - if merge or branch: - for j2 in range(len(debruijn[j])): - _, _, predecessors, add_num_ids = debruijn[j][j2] - if i2 in predecessors: - path_queue.append("%d-%d" % (j, j2)) - break - if not found: - break - - num_ids += tmp_num_ids - i2 = new_i2 - j += 1 - - done.add(i_str) - - num_ids = set(num_ids) - paths.append([i, j, num_ids]) - - if j < len(debruijn) and len(debruijn[j]) == 0: - j += 1 - while j < len(debruijn) and len(debruijn[j]) == 0: - j += 1 - if j < len(debruijn): - for j2 in range(len(debruijn[j])): - path_queue.append("%d-%d" % (j, j2)) - - - def get_mate_num_ids(num_ids): - mate_num_ids = set() - for num_id in num_ids: - read_id = num_to_id[num_id] - mate_read_id = get_mate_node_id(read_id) - if mate_read_id in id_to_num: - mate_num_id = id_to_num[mate_read_id] - mate_num_ids |= mate_num_id - - return mate_num_ids - - - # Generate a compressed assembly graph - def path_cmp(a, b): - if a[0] != b[0]: - return a[0] - b[0] - else: - return a[1] - b[1] - paths = sorted(paths, cmp=path_cmp) - - for p in range(len(paths)): - if print_msg: print >> sys.stderr, "path:", p, paths[p] - - excl_num_ids = set() # exclusive num ids - equiv_list = [] - p = 0 - while p < len(paths): - left, right, num_ids = paths[p] - p2 = p + 1 - while p2 < len(paths): - next_left, next_right, next_num_ids = paths[p2] - if next_left >= right: - break - p2 += 1 - - equiv_list.append([]) - for i in range(p, p2): - left, right, num_ids = paths[i] - equiv_list[-1].append([[i], num_ids, num_ids | get_mate_num_ids(num_ids), []]) - if p + 1 < p2: - assert p + 2 == p2 - excl_num_ids |= num_ids - - p = p2 - - new_equiv_list = [] - for classes in equiv_list: - if len(classes) > 1: - new_equiv_list.append(classes) - continue - assert len(classes) == 1 - num_ids = classes[0][1] - excl_num_ids - if len(num_ids) <= 0: - continue - classes[0][1] = num_ids - classes[0][2] = num_ids | get_mate_num_ids(num_ids) - new_equiv_list.append(classes) - equiv_list = new_equiv_list - - known_alleles = False - while True: - for i in range(len(equiv_list)): - classes = equiv_list[i] - for j in range(len(classes)): - ids, num_ids, all_ids, alleles = classes[j] - if print_msg: print >> sys.stderr, i, j, ids, len(num_ids), sorted(list(num_ids))[:20], alleles - - if print_msg: print >> sys.stderr - - if known_alleles: - for i in range(len(equiv_list)): - classes = equiv_list[i] - for j in range(len(classes)): - num_ids = sorted(list(classes[j][1])) - node_id = "(%d-%d)%s" % (i, j, num_to_id[num_ids[0]]) - node = self.nodes2[node_id] - node_vars = node.get_var_ids() - max_alleles, max_common = set(), -sys.maxint - for anode in self.predicted_allele_nodes.values(): - allele_vars = anode.get_var_ids(node.left, node.right) - tmp_common = len(set(node_vars) & set(allele_vars)) - len(set(node_vars) | set(allele_vars)) - if tmp_common > max_common: - max_common = tmp_common - max_alleles = set([anode.id]) - elif tmp_common == max_common: - max_alleles.add(anode.id) - classes[j][3] = max_alleles - - - best_common_mat, best_stat, best_i, best_i2 = [], -sys.maxint, -1, -1 - for i in range(len(equiv_list) - 1): - classes = equiv_list[i] - for i2 in range(i + 1, len(equiv_list)): - classes2 = equiv_list[i2] - common_mat = [] - for j in range(len(classes)): - common_mat.append([]) - if known_alleles: - ids = classes[j][3] - else: - ids = classes[j][2] - for j2 in range(len(classes2)): - if known_alleles: - ids2 = classes2[j2][3] - else: - ids2 = classes2[j2][2] - common_mat[-1].append(len(ids & ids2)) - - # Calculate stat - common_stat = 0 - if len(classes) == 1 or len(classes2) == 1: - for row in common_mat: - common_stat += sum(row) - else: - for row in common_mat: - sorted_row = sorted(row, reverse=True) - common_stat += (sorted_row[0] - sorted_row[1]) - if common_mat[0][0] + common_mat[1][1] == \ - common_mat[1][0] + common_mat[0][1]: - common_stat = -1 - - if common_stat > best_stat: - best_common_mat, best_stat, best_i, best_i2 = common_mat, common_stat, i, i2 - - if print_msg: - print >> sys.stderr, "best:", best_i, best_i2, best_stat, best_common_mat - print >> sys.stderr - print >> sys.stderr - - if known_alleles and best_stat < 0: - self.remove_nodes(self.nodes2) - break - if best_stat < 0: - known_alleles = True - new_nodes = {} - for i in range(len(equiv_list)): - classes = equiv_list[i] - for j in range(len(classes)): - ids, num_ids, all_ids, alleles = classes[j] - num_ids = sorted(list(num_ids)) - - if print_msg: print >> sys.stderr, i, j, num_ids - - assert (num_ids) > 0 - read_id = num_to_id[num_ids[0]] - node = deepcopy(self.nodes[read_id]) - for num_id2 in num_ids[1:]: - read_id2 = num_to_id[num_id2] - node2 = self.nodes[read_id2] - node.combine_with(node2) - - new_read_id = "(%d-%d)%s" % (i, j, read_id) - node.id = new_read_id - new_read_id not in new_nodes - new_nodes[new_read_id] = node - - self.nodes = new_nodes - self.nodes2 = deepcopy(self.nodes) - self.remove_nodes(self.nodes) - continue - - mat = best_common_mat - classes, classes2 = equiv_list[best_i], equiv_list[best_i2] - - # Filter vertices further if necessary - def del_row(classes, mat, r): - return classes[:r] + classes[r+1:], mat[:r] + mat[r+1:] - - def del_col(classes, mat, c): - new_mat = [] - for row in mat: - row = row[:c] + row[c+1:] - new_mat.append(row) - return classes[:c] + classes[c+1:], new_mat - - assert len(classes) <= 2 and len(classes2) <= 2 - if len(classes) == 2 and len(classes2) == 2: - # Check row - num_ids1, num_ids2 = len(classes[0][1]), len(classes[1][1]) - if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1: - row_sum1, row_sum2 = sum(mat[0]), sum(mat[1]) - if row_sum1 > max(2, row_sum2 * 6): - classes, mat = del_row(classes, mat, 1) - classes[0][1] -= excl_num_ids - elif row_sum2 > max(2, row_sum1 * 6): - classes, mat = del_row(classes, mat, 0) - classes[0][1] -= excl_num_ids - # Check column - if len(classes) == 2: - num_ids1, num_ids2 = len(classes2[0][1]), len(classes2[1][1]) - if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1: - col_sum1, col_sum2 = mat[0][0] + mat[1][0], mat[0][1] + mat[1][1] - if col_sum1 > max(2, col_sum2 * 6): - classes2, mat = del_col(classes2, mat, 1) - classes2[0][1] -= excl_num_ids - elif col_sum2 > max(2, col_sum1 * 6): - classes2, mat = del_col(classes2, mat, 0) - classes2[0][1] -= excl_num_ids - - merge_list = [] - def add_merge(classes, classes2, i, j, k): - if known_alleles: - num_ids1, num_ids2 = classes[i][1], classes2[j][1] - num_ids1, num_ids2 = sorted(list(num_ids1)), sorted(list(num_ids2)) - num_id1, num_id2 = num_ids1[0], num_ids2[0] - node_id1 = "(%d-%d)%s" % (best_i, i, num_to_id[num_id1]) - node_id2 = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id2]) - node_id3 = "(%d-%d)%s" % (best_i, k, num_to_id[min(num_id1, num_id2)]) - merge_list.append([node_id1, node_id2, node_id3]) - - classes[i][0] = sorted(classes[i][0] + classes2[j][0]) - classes[i][1] |= classes2[j][1] - - copy_list = [] - def add_copy(classes, classes2, i, j, k): - if known_alleles: - num_ids = classes2[j][1] - num_ids = sorted(list(num_ids)) - num_id = num_ids[0] - node_id = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id]) - node_id2 = "(%d-%d)%s" % (best_i, k, num_to_id[num_id]) - copy_list.append([node_id, node_id2]) - - classes[i] = classes2[j] - - remove_list = [] - def add_remove(classes, i): - if known_alleles: - num_ids = classes[i][1] - num_ids = sorted(list(num_ids)) - num_id = num_ids[0] - node_id = "(%d-%d)%s" % (best_i, i, num_to_id[num_id]) - remove_list.append([node_id]) - - classes = [classes[1-i]] - - if len(classes) == 1 and len(classes2) == 1: - add_merge(classes, classes2, 0, 0, 0) - - elif len(classes) == 1: - if 0 not in classes[0][0] and \ - mat[0][0] > max(2, mat[0][1] * 6) and \ - len(classes2[0][1]) > len(classes2[1][1]) * 2: - add_merge(classes, classes2, 0, 0, 0) - elif 0 not in classes[0][0] and \ - mat[0][1] > max(2, mat[0][0] * 6) and \ - len(classes2[1][1]) > len(classes2[0][1]) * 2: - add_merge(classes, classes2, 0, 1, 0) - else: - classes.append(deepcopy(classes[0])) - - # Handle a special case at 5' end - if 0 in classes[0][0] and \ - len(classes[0][0]) == 1 and \ - (mat[0][0] > mat[0][1] * 2 or mat[0][1] > mat[0][0] * 2): - if mat[0][0] > mat[0][1]: - add_merge(classes, classes2, 0, 0, 0) - add_copy(classes, classes2, 1, 1, 1) - else: - assert mat[0][1] > mat[0][0] - add_copy(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 1, 1) - else: - add_merge(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 1, 1) - - elif len(classes2) == 1: - if mat[0][0] > max(2, mat[1][0] * 6): - add_merge(classes, classes2, 0, 0, 0) - if len(classes[0][1]) > len(classes[1][1]) * 6: - add_remove(classes, 1) - elif mat[1][0] > max(2, mat[0][0] * 6): - add_merge(classes, classes2, 1, 0, 0) - if len(classes[1][1]) > len(classes[0][1]) * 6: - add_remove(classes, 0) - else: - add_merge(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 0, 1) - - else: - score00 = mat[0][0] + mat[1][1] - score01 = mat[0][1] + mat[1][0] - if score00 > score01: - add_merge(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 1, 1) - elif score00 < score01: - add_merge(classes, classes2, 0, 1, 0) - add_merge(classes, classes2, 1, 0, 1) - else: - break - - for c in range(len(classes)): - classes[c][2] = classes[c][1] | get_mate_num_ids(classes[c][1]) - - equiv_list[best_i] = classes - equiv_list = equiv_list[:best_i2] + equiv_list[best_i2+1:] - - if known_alleles: - exclude_ids = set() - new_nodes = {} - for node_id1, node_id2, node_id3 in merge_list: - if self.nodes2[node_id1].left <= self.nodes2[node_id2].left: - node = deepcopy(self.nodes2[node_id1]) - node2 = self.nodes2[node_id2] - else: - node = deepcopy(self.nodes2[node_id2]) - node2 = self.nodes2[node_id1] - node.combine_with(node2) - node.id = node_id3 - new_nodes[node_id3] = node - exclude_ids.add(node_id1) - exclude_ids.add(node_id2) - - for node_id1, node_id2 in copy_list: - node = self.nodes2[node_id1] - node.id = node_id2 - new_nodes[node_id2] = node - exclude_ids.add(node_id1) - - exclude_ids |= set(remove_list) - - for node_id, node in self.nodes2.items(): - if node_id in exclude_ids: - continue - num, id = node_id.split(')') - i, i2 = num[1:].split('-') - i, i2 = int(i), int(i2) - if i > best_i2: - i -= 1 - node_id = "(%d-%d)%s" % (i, i2, id) - node.id = node_id - new_nodes[node_id] = node - - self.nodes2 = new_nodes - - - # Display graph information - def print_info(self): - print >> sys.stderr, "Backbone len: %d" % len(self.backbone) - print >> sys.stderr, "\t%s" % self.backbone - - - # Compare nodes and get information - def get_node_comparison_info(self, node_dic): - assert len(node_dic) > 0 - nodes = [[id, node.left, node.right] for id, node in node_dic.items()] - def node_cmp(a, b): - if a[1] != b[1]: - return a[1] - b[1] - else: - return a[2] - b[2] - nodes = sorted(nodes, cmp=node_cmp) - seqs, colors = [], [] - for p in range(len(self.backbone)): - nts = set() - for n in range(len(nodes)): - id, left, right = nodes[n] - node = node_dic[id] - if p >= left and p <= right: - nt_dic = node.seq[p - left] - nt = get_major_nt(nt_dic) - nts.add(nt) - - for n in range(len(nodes)): - if p == 0: - seqs.append([]) - colors.append([]) - id, left, right = nodes[n] - node = node_dic[id] - if p >= left and p <= right: - nt_dic = node.seq[p - left] - nt = get_major_nt(nt_dic) - seqs[n].append(nt) - if nt != self.backbone[p]: - if len(nts) > 1: - colors[n].append('R') - else: - colors[n].append('B') - else: - colors[n].append('N') - else: - seqs[n].append(' ') - - assert len(nodes) == len(seqs) - for n in range(len(nodes)): - node, seq, color = nodes[n], seqs[n], colors[n] - new_left, new_right = 0, len(seq) - 1 - while seq[new_left] == 'D': - new_left += 1 - while seq[new_right] == 'D': - new_right -= 1 - - node[1] = new_left - node[2] = new_right - seqs[n] = seq[new_left:new_right+1] - colors[n] = color[new_left:new_right+1] - - return nodes, seqs, colors - - - # Compare nodes - def print_node_comparison(self, node_dic): - nodes, seqs, colors = self.get_node_comparison_info(node_dic) - interval = 100 - for p in range(0, (len(self.backbone) + interval - 1) / interval * interval, interval): - cur_seqs = [] - for n in range(len(nodes)): - id, left, right = nodes[n] # inclusive coordinate - right += 1 - seq = [] - seq_left, seq_right = max(p, left), min(p+interval, right) - if seq_left >= seq_right: - continue - if p < left: - seq += ([' '] * (left - p)) - for s in range(seq_left, seq_right): - nt, color = seqs[n][s-left], colors[n][s-left] - if color in "RB": - if color == 'R': - nt = "\033[91m" + nt - else: - nt = "\033[94m" + nt - nt += "\033[00m" - seq.append(nt) - if right < p + interval: - seq += ([' '] * (p + interval - right)) - seq = ''.join(seq) - cur_seqs.append([seq, id]) - - if len(cur_seqs) <= 0: - continue - - print >> sys.stderr, p - for seq, id in cur_seqs: - print >> sys.stderr, "\t", seq, id - - - # Calculate coverage - def calculate_coverage(self): - allele_nodes = self.true_allele_nodes if self.simulation else self.predicted_allele_nodes - allele_nodes = [[id, node.left, node.right] for id, node in allele_nodes.items()] - coverage = {} - for allele_id, _, _ in allele_nodes: - coverage[allele_id] = [0.0 for _ in range(len(self.backbone))] - - nodes = [[id, node.left, node.right] for id, node in self.nodes.items()] - for id, left, right in nodes: - node = self.nodes[id] - nodes2 = [[node, left, right]] - if id in self.other_nodes: - for node in self.other_nodes[id]: - nodes2.append([node, node.left, node.right]) - - for node, left, right in nodes2: - node_vars = node.get_vars() - node_var_ids = node.get_var_ids() - max_common = -sys.maxint - max_allele_node_ids = [] - for allele_node_id, allele_left, allele_right in allele_nodes: - if right - left <= 500 and (left < allele_left or right > allele_right): - continue - if self.simulation: - allele_node = self.true_allele_nodes[allele_node_id] - else: - allele_node = self.predicted_allele_nodes[allele_node_id] - allele_vars = allele_node.get_var_ids(left, right) - common_vars = set(node_var_ids) & set(allele_vars) - tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars)) - if max_common < tmp_common: - max_common = tmp_common - max_allele_node_ids = [allele_node_id] - elif max_common == tmp_common: - max_allele_node_ids.append(allele_node_id) - if len(max_allele_node_ids) <= 0: - continue - add_cov = 1.0 / len(nodes2) / len(max_allele_node_ids) - assert add_cov > 0.0 - for allele_node_id in max_allele_node_ids: - for p in range(left, right + 1): - coverage[allele_node_id][p] += add_cov - - max_cov = 0.0 - for allele_id, cov in coverage.items(): - max_cov = max(max_cov, max(cov)) - for allele_id, cov in coverage.items(): - cov2 = [c / max_cov for c in cov] - coverage[allele_id] = cov2 - self.coverage = coverage - - - # Begin drawing graph - def begin_draw(self, fname_base): - pdfDraw = self.pdfDraw = open(fname_base + '.pdf', 'w') - print >> pdfDraw, r'%PDF-1.7' - self.objects, self.stream = [], [] - self.draw_items = [] - - # End drawing graph - def end_draw(self): - self.unscaled_height += 50 - self.height = self.unscaled_height * self.scaley - - def get_x(x): - return self.left_margin + x * self.scalex - - def get_y(y): - return self.height - self.top_margin - y * self.scaley - - # Get scalar - def get_sx(x): - return x * self.scalex - - def get_sy(y): - return y * self.scaley - - pdfDraw = self.pdfDraw - self.add_pdf_object('<>') - self.add_pdf_object('<>') - self.add_pdf_object('<>' % \ - (self.width, self.height)) - self.add_pdf_object('<>>>') - self.add_pdf_object('<>') - - # Draw vertical dotted lines at every 100nt and thick lines at every 500nt - pre_items = [] - for pos in range(0, len(self.backbone), 100): - main_line = (pos != 0 and pos % 500 == 0) - dic = {"coord": [pos, 2, pos, self.unscaled_height - 2], - "stroke" : "0.5 0.5 0.5", - "line_width" : 1 if main_line else 0.2} - if not main_line: - dic["line_dash"] = "[3] 0" - pre_items.append(["line", dic]) - self.draw_items = pre_items + self.draw_items - - fill, stroke, line_width, line_dash = "0 0 0", "0 0 0", 2.0, "" - for type, dic in self.draw_items: - commands = [] - if type != "state": - assert "coord" in dic - - if "fill" in dic and dic["fill"] != fill: - fill = dic["fill"] - commands.append("%s rg" % fill) - if "stroke" in dic and dic["stroke"] != stroke: - stroke = dic["stroke"] - commands.append("%s RG" % stroke) - if "line_width" in dic and dic["line_width"] != line_width: - line_width = dic["line_width"] - commands.append("%.1f w" % line_width) - if "line_dash" in dic: - if dic["line_dash"] != line_dash: - line_dash = dic["line_dash"] - commands.append("%s d" % line_dash) - elif line_dash != "": - line_dash = "" - commands.append("[] 0 d") - - if type == "rect": - x, y, sx, sy = dic["coord"] - re_str = "%d %d %d %d" % (get_x(x), get_y(y), get_sx(sx), get_sy(sy)) - if "fill" in dic: - commands.append("%s re f" % re_str) - if "stroke" in dic: - commands.append("%s re S" % re_str) - - elif type == "line": - x, y, x2, y2 = dic["coord"] - commands.append("%d %d m %d %d l h S" % \ - (get_x(x), get_y(y), get_x(x2), get_y(y2))) - elif type == "text": - assert "text" in dic and "font_size" in dic - x, y = dic["coord"] - commands.append("BT /F1 %d Tf %d %d Td (%s) Tj ET" % \ - (dic["font_size"], get_x(x), get_y(y), dic["text"])) - else: - assert type == "state" - - self.stream.append(' '.join(commands)) - - # Write stream - self.add_pdf_stream('\n'.join(self.stream)) - - # Write xref and trailer - to_xref = pdfDraw.tell() - print >> pdfDraw, 'xref' - print >> pdfDraw, "0 %d" % (len(self.objects) + 1) - print >> pdfDraw, r'0000000000 65535 f' - for object in self.objects: - print >> pdfDraw, "%s 00000 n" % "{:010}".format(object) - print >> pdfDraw, 'trailer <>' % (len(self.objects) + 1) - print >> pdfDraw, 'startxref' - print >> pdfDraw, str(to_xref) - print >> pdfDraw, r'%%EOF' - - self.pdfDraw.close() - - - def add_pdf_object(self, obj): - self.objects.append(self.pdfDraw.tell()) - print >> self.pdfDraw, "%d 0 obj %s" % (len(self.objects), obj) - print >> self.pdfDraw, 'endobj' - - - def add_pdf_stream(self, stream): - self.add_pdf_object("<>\nstream\n%s\nendstream" % (len(stream), stream)) - - - # Draw graph - # Top left as (0, 0) and Bottom right as (width, height) - def draw(self, - begin_y, - title = ""): - assert len(self.nodes) > 0 - nodes = [[id, node.left, node.right] for id, node in self.nodes.items()] - def node_cmp(a, b): - return a[1] - b[1] - nodes = sorted(nodes, cmp=node_cmp) - max_right = len(self.backbone) - - # display space - end_y = begin_y + 10000 - dspace = [[[begin_y, end_y]]] * (max_right + 1) - def get_dspace(left, right, height): - assert left < len(dspace) and right < len(dspace) - range1 = dspace[left] - for range2 in dspace[left + 1:right + 1]: - new_range = [] - # sub range - for t1, b1 in range1: - for t2, b2 in range2: - if b1 < t2: - break - if b2 < t1: - continue - t, b = max(t1, t2), min(b1, b2) - if b - t >= height: - new_range.append([t, b]) - - range1 = new_range - if len(range1) <= 0: - return -1 - - t, b = range1[0] - assert b - t >= height - b = t + height - for i in range(left, right+1): - range1 = dspace[i] - range2 = [] - found = False - for j in range(len(range1)): - t2, b2 = range1[j] - if t2 <= t and b <= b2: - found = True - if t2 < t: - range2.append([t2, t]) - if b < b2: - range2.append([b, b2]) - else: - range2.append([t2, b2]) - dspace[i] = range2 - assert found - return t - - def get_x(x): - return self.left_margin + x * self.scalex - - def get_y(y): - return self.height - self.top_margin - y * self.scaley - - # Get scalar - def get_sx(x): - return x * self.scalex - - def get_sy(y): - return y * self.scaley - - # Draw exons - y = get_dspace(0, max_right, 14) - for e in range(len(self.exons)): - left, right = self.exons[e] - right += 1 - - # Draw exon - self.draw_items.append(["rect", - {"coord" : [left, y + 10, right - left, 10], - "fill" : "1 1 1", - "stroke" : "0 0 0", - "line_width" : 2}]) - - primary = False - for left_, _ in self.primary_exons: - if left == left_: - primary = True - break - - # Draw label - self.draw_items.append(["text", - {"coord" : [left + 2, y + 7], - "text" : "Exon %d%s" % (e+1, " (primary)" if primary else ""), - "fill" : "0 0 0", - "font_size" : 12}]) - if e > 0: - prev_right = self.exons[e-1][1] + 1 - self.draw_items.append(["line", - {"coord": [prev_right, y + 5, left, y + 5], - "line_width" : 2}]) - - # Draw backbone sequence - y = get_dspace(0, max_right, 4) - for pos in range(len(self.backbone)): - base = self.backbone[pos] - self.draw_items.append(["text", - {"coord" : [pos, y + 2], - "text" : base, - "fill" : "0.5 0 0.5", - "font_size" : 8}]) - - # Draw true or predicted alleles - node_colors = ["1 1 0", "0 1 0", "1 0.8 0.64", "0.76 0.27 0.5"] - allele_node_colors = ["0.87 0.87 0", "0 0.53 0", "0.87 0.66 0.5", "0.63 0.14 0.38"] - def draw_alleles(allele_node_dic, allele_node_colors, display = False): - if len(allele_node_dic) <= 0: - return - allele_nodes, seqs, colors = self.get_node_comparison_info(allele_node_dic) - - def draw_coverage(allele_node, allele_id, left, right, allele_node_color): - if allele_id not in self.coverage: - return - y = get_dspace(0, max_right, 14) - for p in range(left, right): - cov = math.ceil(self.coverage[allele_id][p] * 12) - self.draw_items.append(["rect", - {"coord" : [p, y + 13, 1, cov], - "fill" : allele_node_color}]) - - - for n_ in range(len(allele_nodes)): - n = -1 - prob = "" - if not display and \ - not self.simulation and \ - len(self.allele_node_order) == len(allele_node_dic): - allele_id, prob = self.allele_node_order[n_] - for n2_ in range(len(allele_nodes)): - if allele_id == allele_nodes[n2_][0]: - n = n2_ - break - prob = ": %.2f" % prob - else: - n = n_ - assert n >= 0 and n < len(allele_nodes) - allele_id, left, right = allele_nodes[n] - right += 1 - allele_node = allele_node_dic[allele_id] - allele_node_color = allele_node_colors[n % len(allele_node_colors)] - - draw_coverage(allele_node, allele_id, left, right, allele_node_color) - - y = get_dspace(0, max_right, 14) - - # Draw allele name - if display: - allele_type = "display" - else: - if self.simulation: - allele_type = "true" - else: - allele_type = "predicted" - self.draw_items.append(["text", - {"coord" : [-55, y + 7], - "text" : "%s (%s, %s)" % (allele_id, "partial" if allele_id in self.partial_allele_ids else "full", allele_type), - "fill" : "0 0 1", - "font_size" : 18}]) - # Draw node - self.draw_items.append(["rect", - {"coord" : [left, y + 10, right - left, 10], - "fill" : allele_node_color, - "stroke" : "0 0 0", - "line_width" : 2}]) - - - color_boxes = [] - c = 0 - while c < len(colors[n]): - color = colors[n][c] - c2 = c + 1 - if color != 'N': - while c2 < len(colors[n]): - color2 = colors[n][c2] - if color != color2: - break - c2 += 1 - color_boxes.append([c, c2, color]) - c = c2 - - # Draw variants - for color_box in color_boxes: - cleft, cright, color = color_box - cleft += left; cright += left - if color == 'B': - color = "0 0 1" # blue - else: - color = "0.12 0.56 1" - # DK - debugging purposes - color = "0 0 1" - self.draw_items.append(["rect", - {"coord" : [cleft, y + 9, cright - cleft, 8], - "fill" : color}]) - - return allele_nodes, seqs, colors - - allele_nodes, seqs, colors = draw_alleles(self.true_allele_nodes if self.simulation else self.predicted_allele_nodes, - allele_node_colors) - draw_alleles(self.display_allele_nodes, - ["1 0.96 0.95"], - True) # display alleles? - - # Draw location at every 100bp - y = get_dspace(0, nodes[-1][2], 14) - for pos in range(0, nodes[-1][2], 100): - # Draw label - self.draw_items.append(["text", - {"coord" : [pos + 1, y + 2], - "text" : "%d" % (pos + 1), - "fill" : "0 0 0", - "font_size" : 10}]) - - # Draw nodes - node_to_y = {} - draw_title = False - for id, left, right in nodes: - node = self.nodes[id] - nodes2 = [[node, left, right]] - if id in self.other_nodes: - for node in self.other_nodes[id]: - nodes2.append([node, node.left, node.right]) - if left > node.left: - left = node.left - if right < node.right: - right = node.right - - # Get y position - y = get_dspace(left, right, 14 * len(nodes2)) - for node, left, right in nodes2: - if y < 0: - continue - node_to_y[id] = y - - node_vars = node.get_vars() - node_var_ids = node.get_var_ids() - if len(nodes2) > 1: - color = "0.85 0.85 0.85" - elif len(allele_nodes) > 0: - color = "1 1 1" - max_common = -sys.maxint - for a in range(len(allele_nodes)): - allele_node_id, allele_left, allele_right = allele_nodes[a] - if right - left <= 500 and (left < allele_left or right > allele_right): - continue - if self.simulation: - allele_node = self.true_allele_nodes[allele_node_id] - else: - allele_node = self.predicted_allele_nodes[allele_node_id] - allele_vars = allele_node.get_var_ids(left, right) - common_vars = set(node_var_ids) & set(allele_vars) - tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars)) - if max_common < tmp_common: - max_common = tmp_common - color = node_colors[a % len(node_colors)] - elif max_common == tmp_common: - color = "1 1 1" - else: - color = "1 1 0" # yellow - - # Draw node - right += 1 - self.draw_items.append(["rect", - {"coord" : [left, y + 10, right - left, 10], - "fill" : color, - "stroke" : "0 0 0", - "line_width" : 2}]) - - # Draw variants - for var_id, pos in node_vars: - if var_id == "gap": - var_type, var_left = "single", pos - color = "0 0 0" - elif var_id == "unknown" or var_id.startswith("nv"): - var_type, var_left = "single", pos - color = "1 0 0" - else: - var_type, var_left, var_data = self.gene_vars[var_id] - color = "0 0 1" - if var_type == "single": - var_right = var_left + 1 - elif var_type == "insertion": - var_right = var_left + len(var_data) - else: - assert var_type == "deletion" - var_right = var_left + int(var_data) - self.draw_items.append(["rect", - {"coord" : [var_left, y + 9, var_right - var_left, 8], - "fill" : color}]) - - # Draw label - if get_sx(right - left) >= 300: - self.draw_items.append(["text", - {"coord" : [left + 2, y + 7], - "text" : node.id, - "fill" : "0 0 1", - "font_size" : 12}]) - - - if not draw_title: - draw_title = True - self.draw_items.append(["text", - {"coord" : [-68, y + 7], - "text" : title, - "fill" : "0 0 0", - "font_size" : 24}]) - - y += 14 - - curr_y = get_dspace(0, nodes[-1][2], 1) - self.unscaled_height = curr_y if curr_y > 0 else end_y - return self.unscaled_height - diff --git a/hisatgenotype_modules/hisatgenotype_typing_common.py b/hisatgenotype_modules/hisatgenotype_typing_common.py deleted file mode 100755 index 04cb95f3..00000000 --- a/hisatgenotype_modules/hisatgenotype_typing_common.py +++ /dev/null @@ -1,1552 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re -import math -import random -from copy import deepcopy -from datetime import datetime - - -################################################## -# Sequence processing routines -################################################## - - -""" -""" -def reverse_complement(seq): - comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} - rc_seq = "" - for s in reversed(seq): - if s in comp_table: - rc_seq += comp_table[s] - else: - rc_seq += s - return rc_seq - - -""" -""" -def read_genome(genome_file): - chr_dic, chr_names, chr_full_names = {}, [], [] - chr_name, chr_full_name, sequence = "", "", "" - for line in genome_file: - if line.startswith(">"): - if chr_name and sequence: - chr_dic[chr_name] = sequence - chr_names.append(chr_name) - chr_full_name = line.strip()[1:] - chr_name = line.strip().split()[0][1:] - chr_full_names.append(chr_full_name) - sequence = "" - else: - sequence += line.strip() - if chr_name and sequence: - chr_dic[chr_name] = sequence - chr_names.append(chr_name) - chr_full_names.append(chr_full_name) - return chr_dic, chr_names, chr_full_names - - -################################################## -# Alleles, variants, haplotypes, etc. -################################################## - - -""" -""" -def read_allele_sequences(fname): - allele_seqs = {} - allele_name, sequence = "", "" - for line in open(fname): - if line.startswith(">"): - if allele_name != "" and allele_name not in allele_seqs: - allele_seqs[allele_name] = sequence - allele_name = line.strip()[1:] - sequence = "" - else: - sequence += line.strip() - if allele_name != "" and allele_name not in allele_seqs: - allele_seqs[allele_name] = sequence - return allele_seqs - - -""" -""" -def read_variants(fname): - allele_vars = {} - for line in open(fname): - var_id, type, allele_name, left, data = line.strip().split() - left = int(left) - if type == "deletion": - data = int(data) - if allele_name not in allele_vars: - allele_vars[allele_name] = [] - allele_vars[allele_name].append([left, type, data, var_id]) - return allele_vars - - -""" -""" -def read_haplotypes(fname): - allele_haplotypes = {} - for line in open(fname): - haplotype_id, allele_name, left, right, vars = line.strip().split() - vars = vars.split(',') - left, right = int(left), int(right) - if allele_name not in allele_haplotypes: - allele_haplotypes[allele_name] = [] - allele_haplotypes[allele_name].append([left, right, vars]) - return allele_haplotypes - - -""" -""" -def read_links(fname): - links = [] - for line in open(fname): - var_id, allele_names = line.strip().split('\t') - links.append([var_id, allele_names]) - return links - - -""" -Compare two variants -""" -def compare_vars(a, b): - a_pos, a_type, a_data = a[:3] - b_pos, b_type, b_data = b[:3] - - if a_pos != b_pos: - return a_pos - b_pos - if a_type != b_type: - if a_type == 'I': - return -1 - elif b_type == 'I': - return 1 - if a_type == 'S': - return -1 - else: - return 1 - if a_data < b_data: - return -1 - elif a_data > b_data: - return 1 - else: - return 0 - - -""" -""" -def lower_bound(Var_list, pos): - low, high = 0, len(Var_list) - while low < high: - m = (low + high) / 2 - m_pos = Var_list[m][0] - if m_pos < pos: - low = m + 1 - elif m_pos > pos: - high = m - else: - assert m_pos == pos - while m > 0: - if Var_list[m-1][0] < pos: - break - m -= 1 - return m - return low - - - -""" -""" -def check_files(fnames): - for fname in fnames: - if not os.path.exists(fname): - return False - return True - - -################################################## -# Database releated routines -################################################## - - -""" -Download GRCh38 human reference and HISAT2 indexes -""" -def download_genome_and_index(): - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not check_files(HISAT2_fnames): - os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz") - os.system("hisat2-inspect grch38/genome > genome.fa") - os.system("samtools faidx genome.fa") - - -""" -""" -def clone_hisatgenotype_database(): - os.system("git clone https://github.com/DaehwanKimLab/hisatgenotype_db.git") - os.system("cd hisatgenotype_db; git checkout hisatgenotype_v1.0.2_beta; cd ..") - - -""" -""" -def extract_database_if_not_exists(base, - locus_list, - inter_gap = 30, - intra_gap = 50, - partial = True, - verbose = False): - fnames = [base + "_backbone.fa", - base + "_sequences.fa", - base + ".locus", - base + ".snp", - base + ".index.snp", - base + ".haplotype", - base + ".link", - base + ".allele", - base + ".partial"] - if check_files(fnames): - return - - extract_cmd = ["hisatgenotype_extract_vars.py"] - extract_cmd += ["--base", base] - if len(locus_list) > 0: - extract_cmd += ["--locus-list", ','.join(locus_list)] - if not partial: - extract_cmd += ["--no-partial"] - extract_cmd += ["--inter-gap", str(inter_gap), - "--intra-gap", str(intra_gap)] - if base == "hla": - extract_cmd += ["--min-var-freq", "0.1"] - - if base == "codis": - extract_cmd += ["--leftshift"] - - # DK - debugging purposes - # extract_cmd += ["--ext-seq", "300"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - - if not check_files(fnames): - print >> sys.stderr, "Error: hisatgenotype_extract_vars failed!" - sys.exit(1) - - -""" -""" -def build_index_if_not_exists(base, - aligner, - index_type, - threads = 1, - verbose = False): - if aligner == "hisat2": - # Build HISAT2 graph indexes based on the above information - if index_type == "graph": - hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base, i+1) for i in range(8)] - if not check_files(hisat2_graph_index_fnames): - build_cmd = ["hisat2-build", - "-p", str(threads), - "--snp", "%s.index.snp" % base, - "--haplotype", "%s.haplotype" % base, - "%s_backbone.fa" % base, - "%s.graph" % base] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(hisat2_graph_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed! Perhaps, you may have forgotten to build hisat2 executables?" - sys.exit(1) - # Build HISAT2 linear indexes based on the above information - else: - assert index_type == "linear" - hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base, i+1) for i in range(8)] - if not check_files(hisat2_linear_index_fnames): - build_cmd = ["hisat2-build", - "%s_backbone.fa,%s_sequences.fa" % (base, base), - "%s.linear" % base] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(hisat2_linear_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - else: - # Build Bowtie2 indexes based on the above information - assert aligner == "bowtie2" and index_type == "linear" - bowtie2_index_fnames = ["%s.%d.bt2" % (base, i+1) for i in range(4)] - bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base, i+1) for i in range(2)] - if not tcheck_files(bowtie2_index_fnames): - build_cmd = ["bowtie2-build", - "%s_backbone.fa,%s_sequences.fa" % (base, base), - base] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w')) - proc.communicate() - if not check_files(bowtie2_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - - - -################################################## -# Read simulation and alignment -################################################## - - -""" -Simulate reads from alleles with headers (>) filled with mapping information. - For an example, see hisat2_test_HLA_genotyping.py. -""" -def simulate_reads(seq_dic, # seq_dic["A"]["A*24:36N"] = "ACGTCCG ..." - base_fname, # hla, codis, cyp, or so on - allele_list, # ["A*32:29", "B*07:02:01"] - Vars, # Vars["A"]["hv326"] = ["single", 604, "C"] - Links, - simulate_interval = 1, - read_len = 100, - frag_len = 250, - perbase_errorrate = 0.0, - perbase_snprate = 0.0, - skip_fragment_regions = []): - reads_1, reads_2 = [], [] - num_pairs = [] - for allele_names in allele_list: - gene = allele_names[0].split('*')[0] - num_pairs.append([]) - - # Introduce SNPs into allele sequences - def introduce_snps(seq): - seq = list(seq) - for i in range(len(seq)): - if random.random() * 100 < perbase_snprate: - if seq[i] == 'A': - alt_bases = ['C', 'G', 'T'] - elif seq[i] == 'C': - alt_bases = ['A', 'G', 'T'] - elif seq[i] == 'G': - alt_bases = ['A', 'C', 'T'] - else: - assert seq[i] == 'T' - alt_bases = ['A', 'C', 'G'] - random.shuffle(alt_bases) - alt_base = alt_bases[0] - seq[i] = alt_base - seq = ''.join(seq) - return seq - - # Simulate reads from two alleles - def simulate_reads_impl(seq, - seq_map, - ex_seq_map, - ex_seq, - ex_desc, - simulate_interval = 1, - read_len = 100, - frag_len = 250, - perbase_errorrate = 0.0, - skip_fragment_regions = []): - # Introduce sequencing errors - def introduce_seq_err(read_seq, pos): - read_seq = list(read_seq) - for i in range(read_len): - map_pos = seq_map[pos + i] - if ex_desc[map_pos] != "": - continue - if random.random() * 100 < perbase_errorrate: - if read_seq[i] == 'A': - alt_bases = ['C', 'G', 'T'] - elif read_seq[i] == 'C': - alt_bases = ['A', 'G', 'T'] - elif read_seq[i] == 'G': - alt_bases = ['A', 'C', 'T'] - else: - assert read_seq[i] == 'T' - alt_bases = ['A', 'C', 'G'] - random.shuffle(alt_bases) - alt_base = alt_bases[0] - read_seq[i] = alt_base - read_seq = ''.join(read_seq) - return read_seq - - # Get read alignment, e.g., 260|R_483_61M5D38M23D1M_46|S|hv154,3|S|hv162,10|D|hv185,38|D|hv266 - def get_info(read_seq, pos): - info = "%d_" % (seq_map[pos] + 1) - total_match, match, sub_match = 0, 0, 0 - var_str = "" - ins_len, ins_var = 0, "" - for i in range(pos, pos + read_len): - map_i = ex_seq_map[i] - assert ex_seq[map_i] != 'D' - total_match += 1 - match += 1 - if ex_seq[map_i] == 'I': - if ins_var != "": - assert ins_var == ex_desc[map_i] - ins_var = ex_desc[map_i] - ins_len += 1 - elif ins_var != "": - if var_str != "": - var_str += ',' - var_str += ("%s|I|%s" % (sub_match, ins_var)) - ins_len, ins_var = 0, "" - sub_match = 0 - if ex_seq[map_i] != 'I': - if ex_desc[map_i] != "" or read_seq[i-pos] != ex_seq[map_i]: - if var_str != "": - var_str += ',' - var_str += ("%d|S|%s" % (sub_match, ex_desc[map_i] if ex_desc[map_i] != "" else "unknown")) - sub_match = 0 - else: - sub_match += 1 - if i + 1 < pos + read_len and ex_seq[map_i+1] == 'D': - assert match > 0 - info += ("%dM" % match) - match = 0 - del_len = 1 - while map_i + 1 + del_len < len(ex_seq): - if ex_seq[map_i + 1 + del_len] != 'D': - break - del_len += 1 - info += ("%dD" % del_len) - if var_str != "": - var_str += ',' - var_str += ("%s|D|%s" % (sub_match, ex_desc[map_i + 1])) - sub_match = 0 - assert match > 0 - info += ("%dM" % match) - assert total_match == read_len - if var_str: - info += "_" - info += var_str - return info - - comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} - reads_1, reads_2 = [], [] - for i in range(0, len(seq) - frag_len + 1, simulate_interval): - if len(skip_fragment_regions) > 0: - skip = False - for skip_left, skip_right in skip_fragment_regions: - if i <= skip_right and i + frag_len > skip_left: - skip = True - break - if skip: - continue - - pos1 = i - seq1 = seq[pos1:pos1+read_len] - if perbase_errorrate > 0.0: - seq1 = introduce_seq_err(seq1, pos1) - info1 = get_info(seq1, pos1) - reads_1.append([seq1, info1]) - - pos2 = i + frag_len - read_len - seq2 = seq[pos2:pos2+read_len] - if perbase_errorrate > 0.0: - seq2 = introduce_seq_err(seq2, pos2) - info2 = get_info(seq2, pos2) - tmp_read_2 = reversed(seq2) - read_2 = "" - for s in tmp_read_2: - if s in comp_table: - read_2 += comp_table[s] - else: - read_2 += s - reads_2.append([read_2, info2]) - return reads_1, reads_2 - - # for each allele in a list of alleles such as ['A*32:29', 'B*07:02:01'] - for allele_name in allele_names: - allele_seq = seq_dic[gene][allele_name] - backbone_seq = seq_dic[gene]["%s*BACKBONE" % gene] - allele_ex_seq = list(backbone_seq) - allele_ex_desc = [''] * len(allele_ex_seq) - allele_seq_map = [i for i in range(len(allele_seq))] - allele_ex_seq_map = [i for i in range(len(allele_seq))] - - if perbase_snprate > 0: - HLA_seq = introduce_snps(allele_seq) - - # Extract variants included in each allele - var_ids = [] - for var_id, allele_list in Links.items(): - if allele_name in allele_list: - var_ids.append(var_id) - - def var_cmp(a, b): - assert a.startswith("hv") and b.startswith("hv") - return int(a[2:]) - int(b[2:]) - var_ids = sorted(var_ids, cmp=var_cmp) - - # Build annotated sequence for the allele w.r.t backbone sequence - add_pos = 0 - for var_id in var_ids: - var_type, var_pos, var_data = Vars[gene][var_id] - var_pos += add_pos - if var_type == "single": - allele_ex_seq[var_pos] = var_data - allele_ex_desc[var_pos] = var_id - elif var_type == "deletion": - del_len = int(var_data) - assert var_pos + del_len <= len(allele_ex_seq) - allele_ex_seq[var_pos:var_pos+del_len] = ['D'] * del_len - allele_ex_desc[var_pos:var_pos+del_len] = [var_id] * del_len - else: - assert var_type == "insertion" - ins_len = len(var_data) - allele_ex_seq = allele_ex_seq[:var_pos] + (['I'] * ins_len) + allele_ex_seq[var_pos:] - allele_ex_desc = allele_ex_desc[:var_pos] + ([var_id] * ins_len) + allele_ex_desc[var_pos:] - add_pos += ins_len - allele_ex_seq = ''.join(allele_ex_seq) - assert len(backbone_seq) + add_pos == len(allele_ex_seq) - - # Build mapping from the allele to the annotated sequence - prev_j, minus_pos = 0, 0 - for i in range(len(allele_seq)): - for j in range(prev_j, len(allele_ex_seq)): - if allele_ex_seq[j] != 'D': - if allele_ex_seq[j] == 'I': - minus_pos += 1 - break - allele_seq_map[i] = j - minus_pos - allele_ex_seq_map[i] = j - prev_j = j + 1 - - # DK - debugging purposes - """ - for t in range(0, len(allele_ex_seq), 100): - print t, allele_ex_seq[t:t+100] - print t, '-'.join(allele_ex_desc[t:t+100]) - print t, allele_seq_map[t:t+100] - print "allele_seq length:", len(allele_seq) - print len(allele_ex_seq), "vs.", len(seq_dic[gene]["A*BACKBONE"]), "vs.", len(allele_seq_map) - print allele_ex_seq[1943:1946] - print allele_ex_desc[1943:1946] - sys.exit(1) - """ - - tmp_reads_1, tmp_reads_2 = simulate_reads_impl(allele_seq, - allele_seq_map, - allele_ex_seq_map, - allele_ex_seq, - allele_ex_desc, - simulate_interval, - read_len, - frag_len, - perbase_errorrate, - skip_fragment_regions) - reads_1 += tmp_reads_1 - reads_2 += tmp_reads_2 - num_pairs[-1].append(len(tmp_reads_1)) - - # Write reads into a FASTA file - def write_reads(reads, idx): - read_file = open('%s_input_%d.fa' % (base_fname, idx), 'w') - for read_i in range(len(reads)): - query_name = "%d|%s_%s" % (read_i + 1, "LR"[idx-1], reads[read_i][1]) - if len(query_name) > 254: - query_name = query_name[:254] - print >> read_file, ">%s" % query_name - print >> read_file, reads[read_i][0] - read_file.close() - write_reads(reads_1, 1) - write_reads(reads_2, 2) - - return num_pairs - - -""" -Align reads, and sort the alignments into a BAM file -""" -def align_reads(aligner, - simulation, - index_name, - index_type, - base_fname, - read_fname, - fastq, - threads, - out_fname, - verbose): - if aligner == "hisat2": - aligner_cmd = [aligner, "--mm"] - if not simulation: - aligner_cmd += ["--no-unal"] - DNA = True - if DNA: - aligner_cmd += ["--no-spliced-alignment"] # no spliced alignment - aligner_cmd += ["-X", "1000"] # max fragment length - if index_type == "linear": - aligner_cmd += ["-k", "10"] - else: - aligner_cmd += ["--max-altstried", "64"] - aligner_cmd += ["--haplotype"] - if base_fname == "codis": - aligner_cmd += ["--enable-codis"] - aligner_cmd += ["--no-softclip"] - - elif aligner == "bowtie2": - aligner_cmd = [aligner, - "--no-unal", - "-k", "10"] - else: - assert False - aligner_cmd += ["-x", index_name] - assert len(read_fname) in [1,2] - aligner_cmd += ["-p", str(threads)] - if not fastq: - aligner_cmd += ["-f"] - if len(read_fname) == 1: - aligner_cmd += ["-U", read_fname[0]] - else: - aligner_cmd += ["-1", "%s" % read_fname[0], - "-2", "%s" % read_fname[1]] - - if verbose >= 1: - print >> sys.stderr, ' '.join(aligner_cmd) - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sambam_cmd = ["samtools", - "view", - "-bS", - "-"] - sambam_proc = subprocess.Popen(sambam_cmd, - stdin=align_proc.stdout, - stdout=open(out_fname + ".unsorted", 'w'), - stderr=open("/dev/null", 'w')) - sambam_proc.communicate() - if index_type == "graph": - bamsort_cmd = ["samtools", - "sort", - out_fname + ".unsorted", - "-o", out_fname] - bamsort_proc = subprocess.Popen(bamsort_cmd, - stderr=open("/dev/null", 'w')) - bamsort_proc.communicate() - - bamindex_cmd = ["samtools", - "index", - out_fname] - bamindex_proc = subprocess.Popen(bamindex_cmd, - stderr=open("/dev/null", 'w')) - bamindex_proc.communicate() - - os.system("rm %s" % (out_fname + ".unsorted")) - - -""" -HISAT-genotype's mpileup -""" -def get_mpileup(alignview_cmd, - ref_seq, - base_locus, - vars, - allow_discordant): - ref_seq_len = len(ref_seq) - mpileup = [] - for i in range(ref_seq_len): - mpileup.append([[], {}]) - - proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - prev_pos = -1 - cigar_re = re.compile('\d+\w') - for line in proc.stdout: - line = line.strip() - cols = line.split() - read_id, flag, _, pos, _, cigar_str = cols[:6] - read_seq = cols[9] - flag, pos = int(flag), int(pos) - # Unalined? - if flag & 0x4 != 0: - continue - pos -= (base_locus + 1) - if pos < 0: - continue - - # Concordantly mapped? - if flag & 0x2 != 0: - concordant = True - else: - concordant = False - - if not allow_discordant and not concordant: - continue - - read_pos, left_pos = 0, pos - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MD": - for j in range(length): - if cigar_op == 'M': - read_nt = read_seq[read_pos + j] - else: - read_nt = 'D' - if right_pos + j < len(mpileup): - if read_nt not in mpileup[right_pos + j][1]: - mpileup[right_pos + j][1][read_nt] = 1 - else: - mpileup[right_pos + j][1][read_nt] += 1 - - if cigar_op in "MND": - right_pos += length - - if cigar_op in "MIS": - read_pos += length - - # Choose representative bases or 'D' - for i in range(len(mpileup)): - nt_dic = mpileup[i][1] - num_nt = sum(nt_dic.values()) - nt_set = [] - if num_nt >= 20: - for nt, count in nt_dic.items(): - if nt not in "ACGT": - continue - if count >= num_nt * 0.2 or count >= 7: - nt_set.append(nt) - mpileup[i][0] = nt_set - - # Sort variants - var_list = [[] for i in range(len(mpileup))] - for var_id, value in vars.items(): - var_type, var_pos, var_data = value - assert var_pos < len(var_list) - var_list[var_pos].append([var_id, var_type, var_data]) - - # Assign known or unknown variants - skip_i, prev_del_var_id = -1, "" - for i in range(len(mpileup)): - nt_dic = mpileup[i][1] - ref_nt = ref_seq[i] - new_nt_dic = {} - for nt, count in nt_dic.items(): - var_id = "" - if nt == 'D': - if i <= skip_i: - assert prev_del_var_id != "" - var_id = prev_del_var_id - else: - for var_id_, var_type, var_data in var_list[i]: - if var_type != "deletion": - continue - del_len = int(var_data) - del_exist = True - for j in range(i + 1, i + del_len): - assert j < len(mpileup) - nt_dic2 = mpileup[j][1] - if 'D' not in nt_dic2: - del_exist = False - break - if del_exist: - var_id = var_id_ - prev_del_var_id = var_id - skip_i = i + del_len - 1 - break - elif nt != 'N' and nt != ref_nt: - assert nt in "ACGT" - id = "unknown" - for var_id_, var_type, var_data in var_list[i]: - if var_type != "single": - continue - if nt == var_data: - var_id = var_id_ - break - new_nt_dic[nt] = [count, var_id] - - mpileup[i][1] = new_nt_dic - - return mpileup - - -""" -""" -def get_pair_interdist(alignview_cmd, - simulation, - verbose): - bamview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - dist_list = [] - prev_read_id = None - cigar_re = re.compile('\d+\w') - reads = [] - for line in alignview_proc.stdout: - line = line.strip() - cols = line.split() - read_id, flag, _, pos, _, cigar_str = cols[:6] - read_seq = cols[9] - flag, pos = int(flag), int(pos) - # Unalined? - if flag & 0x4 != 0: - continue - - if simulation: - read_id = read_id.split('|')[0] - - # Concordantly mapped? - if flag & 0x2 != 0: - concordant = True - else: - concordant = False - - NH, YT = sys.maxint, "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("NH"): - NH = int(col[5:]) - elif col.startswith("YT"): - YT = col[5:] - if NH > 1 or YT != "CP": - continue - - if prev_read_id != None and read_id != prev_read_id: - if len(reads) == 2: - left1, right1 = reads[0] - left2, right2 = reads[1] - if left1 <= left2: - dist = left2 - right1 - 1 - else: - dist = left1 - right2 - 1 - dist_list.append(dist) - reads = [] - - left_pos = right_pos = pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MND": - right_pos += length - - reads.append([left_pos, right_pos - 1]) - - prev_read_id = read_id - - dist_list = sorted(dist_list) - dist_avg = sum(dist_list) / max(1, len(dist_list)) - if len(dist_list) > 0: - dist_median = dist_list[len(dist_list)/2] - else: - dist_median = -1 - - return dist_median - - -################################################## -# Statistical routines -################################################## - - -""" -""" -def prob_diff(prob1, prob2): - diff = 0.0 - for allele in prob1.keys(): - if allele in prob2: - diff += abs(prob1[allele] - prob2[allele]) - else: - diff += prob1[allele] - return diff - - -""" -""" -def Gene_prob_cmp(a, b): - if a[1] != b[1]: - if a[1] < b[1]: - return 1 - else: - return -1 - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - - -""" -""" -def single_abundance(Gene_cmpt, - remove_low_abundance_allele = False, - Gene_length = {}): - def normalize(prob): - total = sum(prob.values()) - for allele, mass in prob.items(): - prob[allele] = mass / total - - def normalize_len(prob, length): - total = 0 - for allele, mass in prob.items(): - assert allele in length - total += (mass / length[allele]) - for allele, mass in prob.items(): - assert allele in length - prob[allele] = mass / length[allele] / total - - Gene_prob, Gene_prob_next = {}, {} - for cmpt, count in Gene_cmpt.items(): - alleles = cmpt.split('-') - for allele in alleles: - if allele not in Gene_prob: - Gene_prob[allele] = 0.0 - Gene_prob[allele] += (float(count) / len(alleles)) - if len(Gene_length) > 0: - normalize_len(Gene_prob, Gene_length) - else: - normalize(Gene_prob) - - def next_prob(Gene_cmpt, Gene_prob, Gene_length): - Gene_prob_next = {} - for cmpt, count in Gene_cmpt.items(): - alleles = cmpt.split('-') - alleles_prob = 0.0 - for allele in alleles: - if allele not in Gene_prob: - continue - alleles_prob += Gene_prob[allele] - if alleles_prob <= 0.0: - continue - for allele in alleles: - if allele not in Gene_prob: - continue - if allele not in Gene_prob_next: - Gene_prob_next[allele] = 0.0 - Gene_prob_next[allele] += (float(count) * Gene_prob[allele] / alleles_prob) - if len(Gene_length) > 0: - normalize_len(Gene_prob_next, Gene_length) - else: - normalize(Gene_prob_next) - return Gene_prob_next - - def select_alleles(Gene_prob): - if len(Gene_prob) == 0: - return Gene_prob - Gene_prob2 = {} - max_prob = max(Gene_prob.values()) - for allele, prob in Gene_prob.items(): - if prob >= max_prob / 10.0: - Gene_prob2[allele] = prob - return Gene_prob2 - - fast_EM = True - diff, iter = 1.0, 0 - while diff > 0.0001 and iter < 1000: - Gene_prob_next = next_prob(Gene_cmpt, Gene_prob, Gene_length) - if fast_EM: - # Accelerated version of EM - SQUAREM iteration - # Varadhan, R. & Roland, C. Scand. J. Stat. 35, 335-353 (2008) - # Also, this algorithm is used in Sailfish - http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html - Gene_prob_next2 = next_prob(Gene_cmpt, Gene_prob_next, Gene_length) - sum_squared_r, sum_squared_v = 0.0, 0.0 - p_r, p_v = {}, {} - for a in Gene_prob.keys(): - p_r[a] = Gene_prob_next[a] - Gene_prob[a] - sum_squared_r += (p_r[a] * p_r[a]) - p_v[a] = Gene_prob_next2[a] - Gene_prob_next[a] - p_r[a] - sum_squared_v += (p_v[a] * p_v[a]) - if sum_squared_v > 0.0: - gamma = -math.sqrt(sum_squared_r / sum_squared_v) - for a in Gene_prob.keys(): - Gene_prob_next2[a] = max(0.0, Gene_prob[a] - 2 * gamma * p_r[a] + gamma * gamma * p_v[a]); - Gene_prob_next = next_prob(Gene_cmpt, Gene_prob_next2, Gene_length) - - diff = prob_diff(Gene_prob, Gene_prob_next) - Gene_prob = Gene_prob_next - - # Accelerate convergence - if iter >= 10 and remove_low_abundance_allele: - Gene_prob = select_alleles(Gene_prob) - - # DK - debugging purposes - if iter % 10 == 0 and False: - print >> sys.stderr, "iter", iter - for allele, prob in Gene_prob.items(): - if prob >= 0.001: - print >> sys.stderr, "\t", iter, allele, prob - - iter += 1 - - if remove_low_abundance_allele: - Gene_prob = select_alleles(Gene_prob) - if len(Gene_length) > 0: - normalize_len(Gene_prob, Gene_length) - else: - normalize(Gene_prob) - Gene_prob = [[allele, prob] for allele, prob in Gene_prob.items()] - Gene_prob = sorted(Gene_prob, cmp=Gene_prob_cmp) - return Gene_prob - - -################################################## -# Realignment, alternative alignments -################################################## - - -""" -Identify alternative haplotypes - insertions are not considered... - - INPUT: see the function's parameters below - OUPUT: 529-hv8-hv22-606: set(['529-hv13-570', '529-hv4-hv18-590', '529-hv2-hv16-582']) - 529-hv3-hv17-598: set(['529-hv6-hv21-hv26-610']) -""" -def get_alternatives(ref_seq, # GATAACTAGATACATGAGATAGATTTGATAGATAGATAGATACATACATACATACATACATACAGGATAGATAACTAGG... - allele_vars, # {'VWA*20(22)': ['hv231', 'hv245'], "VWA*16(18')": ['hv235', 'hv250', 'hv256'], ...} - Vars, # {'hv241': ['deletion', 529, '52'], 'hv240': ['deletion', 529, '48'], ... } - Var_list, # [[529, 'hv230'], [529, 'hv231'], [529, 'hv232'], [529, 'hv233'], ...] - verbose): - haplotype_alts_left, haplotype_alts_right = {}, {} - second_order_haplotypes = set() - for allele_name, vars in allele_vars.items(): - for v in range(len(vars) - 1): - ht = vars[v] + "-" + vars[v+1] - second_order_haplotypes.add(ht) - - rev_Var_list = [] - for _, var_id in Var_list: - var_type, var_pos, var_data = Vars[var_id] - if var_type == "deletion": - var_pos = var_pos + int(var_data) - 1 - elif var_type == "insertion": - var_pos += 1 - rev_Var_list.append([var_pos, var_id]) - rev_Var_list = sorted(rev_Var_list, cmp=lambda a, b: a[0] - b[0]) - - def nextbases(haplotype, - left = True, - exclude_list = []): - if left: - pos = int(haplotype[0]) - 1 - else: - pos = haplotype[-1] + 1 - if pos < 0 or pos >= len(ref_seq): - return [] - - if left: - bases = [[[pos] + haplotype[1:], ref_seq[pos]]] - prev_id = None - if len(haplotype) > 2: - prev_id = haplotype[1] - - var_i = lower_bound(rev_Var_list, pos + 1) - for var_j in reversed(range(0, var_i)): - _, var_id = rev_Var_list[var_j] - var_type, var_pos, var_data = Vars[var_id] - if var_type == "deletion": - if var_pos == 0: - continue - var_pos = var_pos + int(var_data) - 1 - if var_pos > pos: - continue - if var_pos < pos: - break - if var_id in exclude_list: - continue - if prev_id: - second_ht = var_id + "-" + prev_id - if second_ht not in second_order_haplotypes: - continue - - if var_type == "single": - bases.append([[var_pos, var_id] + haplotype[1:], var_data]) - elif var_type == "deletion": - bases2 = nextbases([var_pos - int(var_data) + 1, var_id] + haplotype[1:], - left, - exclude_list) - bases += bases2 - else: - assert var_type == "insertion" - else: - bases = [[haplotype[:-1] + [pos], ref_seq[pos]]] - prev_id = None - if len(haplotype) > 2: - prev_id = haplotype[-2] - - var_i = lower_bound(Var_list, pos) - for var_j in range(var_i, len(Var_list)): - _, var_id = Var_list[var_j] - var_type, var_pos, var_data = Vars[var_id] - if var_pos < pos: - continue - if var_pos > pos: - break - if var_id in exclude_list: - continue - if prev_id: - second_ht = prev_id + "-" + var_id - if second_ht not in second_order_haplotypes: - continue - - if var_type == "single": - bases.append([haplotype[:-1] + [var_id, var_pos], var_data]) - elif var_type == "deletion": - bases2 = nextbases(haplotype[:-1] + [var_id, var_pos + int(var_data) - 1], - left, - exclude_list) - bases += bases2 - else: - assert var_type == "insertion" - - return bases - - def get_haplotype_seq(haplotype): - seq = "" - pos = int(haplotype[0]) - for i in range(1, len(haplotype) - 1): - var_id = haplotype[i] - var_type, var_pos, var_data = Vars[var_id] - if pos < var_pos: - seq += ref_seq[pos:var_pos] - if var_type == "single": - seq += var_data - pos = var_pos + 1 - elif var_type == "deletion": - pos = var_pos + int(var_data) - else: - assert var_type == "insertion" - seq += var_data - pos = var_pos - - last_pos = int(haplotype[-1]) + 1 - assert pos <= last_pos - if pos < last_pos: - seq += ref_seq[pos:last_pos] - return seq - - def get_alternative_recur(var_orig_id, - haplotype, - haplotype_alt, - left = True, - dep = 0): - bases1 = nextbases(haplotype, - left) - bases2 = nextbases(haplotype_alt, - left, - [var_orig_id]) # exclude - - found = False - for base1 in bases1: - next_haplotype, bp = base1 - for base2 in bases2: - next_haplotype_alt, bp2 = base2 - if bp != bp2: - continue - - # Todo: implement a routine to handle haplotypes ending with the same coordinate - if left: - left1, left2 = int(next_haplotype[0]), int(next_haplotype_alt[0]) - if left1 == left2: - continue - else: - right1, right2 = int(next_haplotype[-1]), int(next_haplotype_alt[-1]) - if right1 == right2: - continue - - found = True - get_alternative_recur(var_orig_id, - next_haplotype, - next_haplotype_alt, - left, - dep + 1) - - if dep > 0: - if not found: - def to_haplotype_str(haplotype): - if len(haplotype) <= 2: - haplotype = "%d-%d" % (haplotype[0], haplotype[1]) - else: - haplotype = "%d-%s-%d" % (haplotype[0], '-'.join(haplotype[1:-1]), haplotype[-1]) - return haplotype - - haplotype, haplotype_alt = to_haplotype_str(haplotype), to_haplotype_str(haplotype_alt) - haplotype_alts = haplotype_alts_left if left else haplotype_alts_right - if haplotype not in haplotype_alts: - haplotype_alts[haplotype] = set() - haplotype_alts[haplotype].add(haplotype_alt) - - if haplotype_alt not in haplotype_alts: - haplotype_alts[haplotype_alt] = set() - haplotype_alts[haplotype_alt].add(haplotype) - - # Search alternative haplotypes in both left and right directions - for var_i in range(len(Var_list)): - _, var_id = Var_list[var_i] - var_type, var_pos, var_data = Vars[var_id] - if var_pos == 0: - continue - if var_type != "deletion": - continue - del_len = int(var_data) - if var_pos + del_len >= len(ref_seq): - continue - - # Left direction - get_alternative_recur(var_id, - [var_pos, var_id, var_pos + del_len - 1], - [var_pos + del_len, var_pos + del_len - 1]) - - # Right direction - get_alternative_recur(var_id, - [var_pos, var_id, var_pos + del_len - 1], - [var_pos, var_pos - 1], - False) - - # Print alternative haplotypes / Sanity check - def print_haplotype_alts(haplotype_alts): - for haplotype, haplotype_set in haplotype_alts.items(): - if verbose: print "\t%s:" % haplotype, haplotype_set - haplotype_seq = get_haplotype_seq(haplotype.split('-')) - for haplotype_alt in haplotype_set: - haplotype_alt_seq = get_haplotype_seq(haplotype_alt.split('-')) - assert haplotype_seq == haplotype_alt_seq - - if verbose: print "number of left haplotypes:", len(haplotype_alts_left) - print_haplotype_alts(haplotype_alts_left) - if verbose: print "number of right haplotypes:", len(haplotype_alts_right) - print_haplotype_alts(haplotype_alts_right) - - return haplotype_alts_left, haplotype_alts_right - - -""" -Identify ambigious differences that may account for other alleles, - given a list of differences (cmp_list) between a read and a potential allele -""" -def identify_ambigious_diffs(ref_seq, - Vars, - Alts_left, - Alts_right, - Alts_left_list, - Alts_right_list, - cmp_list, - verbose, - debug = False): - cmp_left, cmp_right = 0, len(cmp_list) - 1 - left, right = cmp_list[0][1], cmp_list[-1][1] + cmp_list[-1][2] - 1 - left_alt_set, right_alt_set = set(), set() - - def get_haplotype_and_seq(cmp_list): - ht, seq = [], "" - for i in range(len(cmp_list)): - cmp_i = cmp_list[i] - type, pos, length = cmp_i[:3] - if len(cmp_i) <= 3: - var_id = "" - else: - var_id = cmp_i[3] - if type == "match": - seq += ref_seq[pos:pos+length] - elif type == "mismatch": - seq += ref_seq[pos] - elif type == "insertion": - None - # seq += data - else: - assert type == "deletion" - - if var_id != "" and var_id != "unknown": - ht.append(var_id) - return ht, seq - - # Left direction - found = False - for i in reversed(range(len(cmp_list))): - i_found = False - cmp_i = cmp_list[i] - type, cur_left, length = cmp_i[:3] - var_id = cmp_i[3] if type in ["mismatch", "deletion"] else "" - - # DK - debugging purposes - if type in ["mismatch", "deletion", "insertion"]: - if not var_id.startswith("hv"): - continue - - if type in ["match", "deletion"]: - cur_right = cur_left + length - 1 - else: - cur_right = cur_left - - cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[:i+1]) - if len(cur_ht) == 0: - cur_ht_str = str(left) - else: - cur_ht_str = "%d-%s" % (left, '-'.join(cur_ht)) - ht_i = lower_bound(Alts_left_list, cur_right + 1) - for ht_j in reversed(range(0, min(ht_i + 1, len(Alts_left_list)))): - ht_pos, ht = Alts_left_list[ht_j] - if ht_pos < cur_left: - break - if ht_pos > cur_right: - continue - - if len(cur_ht) > 0: - if ht.find('-'.join(cur_ht)) == -1: - continue - - ht = ht.split('-')[:-1] - if len(cur_ht) + 1 == len(ht): - ht_pos = int(ht[0]) - if left < ht_pos: - continue - else: - var_id2 = ht[len(ht) - len(cur_ht) - 1] - ht_type, ht_pos, ht_data = Vars[var_id2] - if ht_type == "deletion": - ht_pos = ht_pos + int(ht_data) - 1 - if left <= ht_pos: - continue - - i_found = True - if debug: - print cmp_list[:i+1] - print "\t", cur_ht, "vs", Alts_left_list[ht_j] - - _, rep_ht = Alts_left_list[ht_j] - - if debug: - print "DK1:", cmp_i, cmp_list - print "DK2:", rep_ht, Alts_left[rep_ht] - print "DK3:", left, right - - for alt_ht_str in Alts_left[rep_ht]: - alt_ht = alt_ht_str.split('-') - alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1]) - assert alt_ht_right <= cur_right - seq_pos = cur_right - alt_ht_right - cur_pos = alt_ht_right - part_alt_ht = [] - alt_ht = alt_ht[1:-1] - for var_id_ in reversed(alt_ht): - var_type_, var_pos_, var_data_ = Vars[var_id_] - if var_type_ == "deletion": - del_len = int(var_data_) - var_pos_ = var_pos_ + del_len - 1 - assert var_pos_ <= cur_pos - next_seq_pos = seq_pos + (cur_pos - var_pos_) - if next_seq_pos >= len(cur_seq): - break - if var_type_ == "single": - next_seq_pos += 1 - next_cur_pos = var_pos_ - 1 - elif var_type_ == "deletion": - next_cur_pos = var_pos_ - del_len - else: - assert var_type_ == "insertion" - assert False - - part_alt_ht.insert(0, var_id_) - if next_seq_pos >= len(cur_seq): - break - seq_pos, cur_pos = next_seq_pos, next_cur_pos - - if len(part_alt_ht) > 0: - seq_left = len(cur_seq) - seq_pos - 1 - part_alt_ht_str = "" - if found: - var_id_list = [] - for j in range(i + 1, cmp_left): - cmp_j = cmp_list[j] - if cmp_j[0] in ["mismatch", "deletion", "insertion"]: - var_id_ = cmp_j[3] - if var_id_.startswith("hv"): - var_id_list.append(var_id_) - if len(var_id_list) > 0: - part_alt_ht_str = '-' + '-'.join(var_id_list) - part_alt_ht_str = ("%d-%s" % (cur_pos - seq_left, '-'.join(part_alt_ht))) + part_alt_ht_str - left_alt_set.add(part_alt_ht_str) - - if debug: - print "\t\t", cur_left, alt_ht_str - - if i_found: - if not found: - cmp_left = i + 1 - left_alt_set.add(cur_ht_str) - found = True - - if not found: - left_alt_set.add(str(left)) - - # Right direction - found = False - for i in range(0, len(cmp_list)): - i_found = False - cmp_i = cmp_list[i] - type, cur_left, length = cmp_i[:3] - var_id = cmp_i[3] if type in ["mismatch", "deletion"] else "" - - # DK - debugging purpose - if type in ["mismatch", "deletion", "insertion"]: - if not var_id.startswith("hv"): - continue - - if type in ["match", "deletion"]: - cur_right = cur_left + length - 1 - else: - cur_right = cur_left - - cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[i:]) - if len(cur_ht) == 0: - cur_ht_str = str(right) - else: - cur_ht_str = "%s-%d" % ('-'.join(cur_ht), right) - - ht_i = lower_bound(Alts_right_list, cur_left) - for ht_j in range(ht_i, len(Alts_right_list)): - ht_pos, ht = Alts_right_list[ht_j] - if ht_pos > cur_right: - break - if ht_pos < cur_left: - continue - - if len(cur_ht) > 0: - if ht.find('-'.join(cur_ht)) == -1: - continue - - ht = ht.split('-')[1:] - if len(cur_ht) + 1 == len(ht): - ht_pos = int(ht[-1]) - if right > ht_pos: - continue - else: - var_id2 = ht[len(cur_ht)] - var_type, ht_pos, _ = Vars[var_id2] - if right >= ht_pos: - continue - - i_found = True - _, rep_ht = Alts_right_list[ht_j] - - if debug: - print "DK1:", cmp_i, cmp_list - print "DK2:", rep_ht, Alts_right[rep_ht] - print "DK3:", left, right, ht_pos - - for alt_ht_str in Alts_right[rep_ht]: - alt_ht = alt_ht_str.split('-') - alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1]) - assert cur_left <= alt_ht_left - seq_pos = alt_ht_left - cur_left - cur_pos = alt_ht_left - part_alt_ht = [] - alt_ht = alt_ht[1:-1] - for var_id_ in alt_ht: - var_type_, var_pos_, var_data_ = Vars[var_id_] - assert var_pos_ >= cur_pos - next_seq_pos = seq_pos + (var_pos_ - cur_pos) - if next_seq_pos >= len(cur_seq): - break - - if var_type_ == "single": - next_seq_pos += 1 - next_cur_pos = var_pos_ + 1 - elif var_type_ == "deletion": - next_cur_pos = var_pos_ + int(var_data_) - else: - assert var_type_ == "insertion" - assert False - - part_alt_ht.append(var_id_) - if next_seq_pos >= len(cur_seq): - break - seq_pos, cur_pos = next_seq_pos, next_cur_pos - - if len(part_alt_ht) > 0: - seq_left = len(cur_seq) - seq_pos - 1 - assert seq_left >= 0 - part_alt_ht_str = "" - if found: - var_id_list = [] - for j in range(cmp_right + 1, i): - cmp_j = cmp_list[j] - if cmp_j[0] in ["mismatch", "deletion", "insertion"]: - var_id_ = cmp_j[3] - if var_id_.startswith("hv"): - var_id_list.append(var_id_) - if len(var_id_list) > 0: - part_alt_ht_str = '-'.join(var_id_list) + '-' - part_alt_ht_str += ("%s-%d" % ('-'.join(part_alt_ht), cur_pos + seq_left)) - right_alt_set.add(part_alt_ht_str) - - if i_found: - if not found: - cmp_right = i - 1 - right_alt_set.add(cur_ht_str) - found = True - - if not found: - right_alt_set.add(str(right)) - - if cmp_right < cmp_left: - cmp_left = 0 - left_alt_set = set([str(left)]) - - # Sanity check - ht_set_ = set() - for ht in left_alt_set: - ht = '-'.join(ht.split('-')[1:]) - if ht == "": - continue - if ht in ht_set_: - print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_ - - # DK - debugging purposes - print "DK: cmp_list_range: [%d, %d]" % (cmp_left, cmp_right) - print "DK: cmp_list:", cmp_list - print "DK: left_alt_set:", left_alt_set, "right_alt_set:", right_alt_set - - assert False - ht_set_.add(ht) - for ht in right_alt_set: - ht = '-'.join(ht.split('-')[:-1]) - if ht == "": - continue - if ht in ht_set_: - print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_ - assert False - ht_set_.add(ht) - - if debug: - print "cmp_list_range: [%d, %d]" % (cmp_left, cmp_right) - print "left alt set:", left_alt_set - print "right alt set:", right_alt_set - - return cmp_left, cmp_right, list(left_alt_set), list(right_alt_set) - diff --git a/hisatgenotype_scripts/compare_HLA.py b/hisatgenotype_scripts/compare_HLA.py deleted file mode 100755 index d32b593c..00000000 --- a/hisatgenotype_scripts/compare_HLA.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from argparse import ArgumentParser, FileType -use_message = ''' -''' - -def compare(hisatgenotype_fname, - utsw_fname): - hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"] - for level in [1,2]: - print >> sys.stderr, "Level: %d" % level - def read_hla_types(fname): - hla, hla_orig = {}, {} - for line in open(fname): - line = line.strip() - fields = line.split('\t') - if len(fields) == 2: - sample, allele = fields - abundance, vars_covered = 0.0, "" - elif len(fields) == 3: - sample, allele, abundance = fields - vars_covered = "" - else: - assert len(fields) == 4 - sample, allele, abundance, vars_covered = fields - # sample = sample.split('_')[0] - abundance = float(abundance) - if sample not in hla: - hla[sample] = {} - hla_orig[sample] = {} - gene, allele = allele.split('*') - if gene not in hla[sample]: - hla[sample][gene] = [] - hla_orig[sample][gene] = [] - hla_orig[sample][gene].append([allele, abundance]) - - if level == 1: - allele = allele.split(':')[0] - else: - assert level == 2 - allele = ':'.join(allele.split(':')[:2]) - - found = False - for i in range(len(hla[sample][gene])): - cmp_allele, cmp_abundance = hla[sample][gene][i] - if level == 1 or allele.find(':') == -1: - one = two = allele - cmp_one = cmp_two = cmp_allele - else: - one, two = allele.split(':') - cmp_one, cmp_two = cmp_allele.split(':') - if one == cmp_one and two == cmp_two: - found = True - hla[sample][gene][i][1] = cmp_abundance + abundance - break - - if not found: - hla[sample][gene].append([allele, abundance]) - - for sample_hla in hla.values(): - for gene, allele_list in sample_hla.items(): - sample_hla[gene] = sorted(allele_list, key=lambda a: a[1], reverse=True) - - return hla, hla_orig - - hla1, hla1_orig = read_hla_types(hisatgenotype_fname) - hla2, hla2_orig = read_hla_types(utsw_fname) - - for gene in hla_list: - count, count_10 = [0, 0, 0], [0, 0, 0] - print >> sys.stderr, "\t%s" % gene - for sample in hla2.keys(): - if sample not in hla1: - continue - hla1_sample = hla1[sample] - hla2_sample = hla2[sample] - if gene not in hla1_sample or gene not in hla2_sample: - continue - hla1_gene = hla1_sample[gene] - hla2_gene = hla2_sample[gene] - num_match, num_match_10 = 0, 0 - for hla2_allele, _ in hla2_gene: - hla2_allele = hla2_allele.split(':') - for allele_idx in range(len(hla1_gene)): - hla1_allele = hla1_gene[allele_idx][0] - hla1_allele = hla1_allele.split(':') - equal = True - for i in range(min(len(hla1_allele), len(hla2_allele), level)): - hla1_num = hla1_allele[i] - hla2_num = hla2_allele[i] - if hla1_num != hla2_num: - equal = False - break - - if equal: - if allele_idx < 2: - num_match += 1 - if len(hla2_gene) == 1: - num_match += 1 - num_match_10 += 1 - if len(hla2_gene) == 1: - num_match_10 += 1 - break - - # DK - for debugging purposes - # """ - # if gene in ["A", "B", "C", "DQA1", "DQB1", "DRB1"] and num_match < 2: - if level == 3 and gene in ["B"] and num_match < 2: - print sample - print "\t", hla1_gene, "orig:", hla1_orig[sample][gene] - print "\t", hla2_gene, "orig:", hla2_orig[sample][gene] - # sys.exit(1) - # """ - - # DK - debugging purposes - if num_match >= len(count) or num_match_10 >= len(count_10): - print sample, num_match, num_match_10 - - assert num_match < len(count) and num_match_10 < len(count_10) - count[num_match] += 1 - count_10[num_match_10] += 1 - - if sum(count) <= 0: - continue - - print >> sys.stderr, "\t\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0) - print >> sys.stderr, "\t\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0) - - -if __name__ == "__main__": - parser = ArgumentParser( - description='Compare HISAT-genotype and Utsw HLA typing results') - parser.add_argument('hisatgenotype_fname', - nargs='?', - type=str, - help='hisatgenotype file name (e.g. cp_hla.txt)') - parser.add_argument('utsw_fname', - nargs='?', - type=str, - help='utsw file name (e.g. utsw_caapa_hla.txt)') - - args = parser.parse_args() - - compare(args.hisatgenotype_fname, - args.utsw_fname) - diff --git a/hisatgenotype_scripts/compare_HLA_Omixon.py b/hisatgenotype_scripts/compare_HLA_Omixon.py deleted file mode 100755 index ad79c19e..00000000 --- a/hisatgenotype_scripts/compare_HLA_Omixon.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from argparse import ArgumentParser, FileType -use_message = ''' -''' - -def compare(hisatgenotype_fname, omixon_fname): - hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"] - - # Read HISAT-genotype predicted HLA alleles for the CAAPA genomes - hisat_hla = {} - for line in open(hisatgenotype_fname): - line = line.strip() - fields = line.split('\t') - if len(fields) == 2: - sample, allele = fields - abundance, vars_covered = 0.0, "" - elif len(fields) == 3: - sample, allele, abundance = fields - vars_covered = "" - else: - assert len(fields) == 4 - sample, allele, abundance, vars_covered = fields - abundance = float(abundance) - if sample not in hisat_hla: - hisat_hla[sample] = {} - gene, allele = allele.split('*') - if gene not in hisat_hla[sample]: - hisat_hla[sample][gene] = [] - hisat_hla[sample][gene].append([allele, abundance]) - - # Read Omixon predicted HLA alleles for the CAAPA genomes - omixon_hla = {} - for line in open(omixon_fname): - line = line.strip() - sample, allele1, allele2 = line.split('\t') - gene1, allele1 = allele1.split('*') - gene2, allele2 = allele2.split('*') - - assert gene1 == gene2 - if sample not in omixon_hla: - omixon_hla[sample] = {} - if gene1 not in omixon_hla[sample]: - omixon_hla[sample][gene1] = [] - - if len(omixon_hla[sample][gene1]) >= 2: - continue - - omixon_hla[sample][gene1].append(allele1) - omixon_hla[sample][gene1].append(allele2) - - for gene in hla_list: - count, count_10 = [0, 0, 0], [0, 0, 0] - print >> sys.stderr, gene - for sample in omixon_hla.keys(): - if sample not in hisat_hla: - continue - hisat_sample = hisat_hla[sample] - omixon_sample = omixon_hla[sample] - if gene not in omixon_sample or gene not in hisat_sample: - continue - hisat_gene = hisat_sample[gene] - omixon_gene = omixon_sample[gene] - num_match, num_match_10 = 0, 0 - for omixon_allele in omixon_gene: - omixon_allele = omixon_allele.split(':') - for hisat_allele_idx in range(len(hisat_gene)): - hisat_allele = hisat_gene[hisat_allele_idx] - hisat_allele = hisat_allele[0].split(':') - equal = True - for i in range(min(len(omixon_allele), len(hisat_allele), 2)): - omixon_num = omixon_allele[i] - hisat_num = hisat_allele[i] - """ - if not omixon_num[-1].isdigit(): - omixon_num = omixon_num[:-1] - if not hisat_num[-1].isdigit(): - hisat_num = hisat_num[:-1] - if int(hisat_num) != int(omixon_num): - equal = False - break - """ - if hisat_num != omixon_num: - equal = False - break - if equal: - if hisat_allele_idx < 2: - num_match += 1 - num_match_10 += 1 - break - - # DK - for debugging purposes - """ - if gene in ["A", "B", "C", "DQA1", "DQB1", "DRB1"] and num_match < 2: - print sample - print "\t", omixon_gene - print "\t", hisat_gene - # sys.exit(1) - """ - - assert num_match < len(count) - count[num_match] += 1 - count_10[num_match_10] += 1 - - if sum(count) <= 0: - continue - - print >> sys.stderr, "\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0) - print >> sys.stderr, "\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0) - - -if __name__ == "__main__": - parser = ArgumentParser( - description='Compare HISAT-genotype and Omixon HLA typing results') - parser.add_argument('hisatgenotype_fname', - nargs='?', - type=str, - help='hisatgenotype file name (e.g. cp_hla.txt)') - parser.add_argument('omixon_fname', - nargs='?', - type=str, - help='omixon file name (e.g. omixon_caapa_hla.txt)') - - args = parser.parse_args() - - compare(args.hisatgenotype_fname, - args.omixon_fname) - diff --git a/hisatgenotype_scripts/extract_Omixon_HLA.py b/hisatgenotype_scripts/extract_Omixon_HLA.py deleted file mode 100755 index 23aaa045..00000000 --- a/hisatgenotype_scripts/extract_Omixon_HLA.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2016, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import sys, os, subprocess, glob - -if __name__ == '__main__': - hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"] - gen_alleles = {} - for hla in hla_list: - for line in open("IMGTHLA/fasta/%s_gen.fasta" % hla): - if line.startswith(">"): - allele = line.split()[1] - gene = allele.split('*')[0] - if gene not in gen_alleles: - gen_alleles[gene] = set() - gen_alleles[gene].add(allele) - - nuc_alleles = {} - for hla in hla_list: - for line in open("IMGTHLA/fasta/%s_nuc.fasta" % hla): - if line.startswith(">"): - allele = line.split()[1] - gene = allele.split('*')[0] - if gene not in nuc_alleles: - nuc_alleles[gene] = set() - nuc_alleles[gene].add(allele) - - print >> sys.stderr, "IMGTHLA" - for gene, alleles in nuc_alleles.items(): - print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles)) - - # Read HLA alleles from Omixon data - omixon_alleles = {} - omixon_fnames = glob.glob("HLAresults/*.gz") - for fname in omixon_fnames: - genome = fname.split("/")[1].split("_HLA")[0] - view_cmd = ["gzip", "-cd", fname] - proc = subprocess.Popen(view_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) - allele_count = {} - prev_allele1, prev_allele2 = "", "" - for line in proc.stdout: - if not line.startswith("HLA"): - continue - - fields = line.strip().split() - if len(fields) > 6: - allele1, allele2 = fields[0][4:-1], fields[6][4:-1] - else: - allele1 = allele2 = fields[0][4:-1] - - gene = allele1.split("*")[0] - if gene not in hla_list: - continue - if gene not in omixon_alleles: - omixon_alleles[gene] = set() - if gene not in allele_count: - allele_count[gene] = 0 - if allele_count[gene] >= 10: - continue - - if allele2 == "": - allele2 = prev_allele2 - assert allele1 != "" and allele2 != "" - - def update_allele(allele): - if allele == "DRB1*08:01:03": - allele = "DRB1*08:01:01" - elif allele == "DRB1*11:11:02": - allele = "DRB1*11:11:01" - return allele - - allele1, allele2 = update_allele(allele1), update_allele(allele2) - - allele_count[gene] += 1 - omixon_alleles[gene].add(allele1) - omixon_alleles[gene].add(allele2) - prev_allele1, prev_allele2 = allele1, allele2 - - print "%s\t%s\t%s" % (genome, allele1, allele2) - - print >> sys.stderr, "Omixon" - for gene, alleles in omixon_alleles.items(): - print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles)) - for allele in alleles: - if allele in nuc_alleles[gene]: - continue - found = False - for allele_cmp in nuc_alleles[gene]: - if allele_cmp.find(allele) != -1: - found = True - break - - if not found: - print >> sys.stderr, "\t\t%s is missing" % allele - - diff --git a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py b/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py deleted file mode 100755 index 34cd4ecf..00000000 --- a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2015, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import sys, os, subprocess, re -import inspect -import random -from argparse import ArgumentParser, FileType - -# Gold Standard (experimentally verified, a lot of literature, ...) -gold_allele_info = { - "NA12877" : {"A" : ["03:01", "02:01"], "B" : ["15:01", "44:02"], "C" : ["05:01", "03:04"], "DQA1" : ["03:03", "03:01"], "DQB1" : ["03:02", "03:01"], "DRB1" : ["04:01", "04:01"]}, - "NA12878" : {"A" : ["01:01", "11:01"], "B" : ["08:01", "56:01"], "C" : ["01:02", "07:01"], "DQA1" : ["05:01", "01:01"], "DQB1" : ["02:01", "05:01"], "DRB1" : ["03:01", "01:01"]}, - "NA12879" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12880" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12881" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["05:01", "01:02"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["04:01", "01:01"]}, - "NA12882" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["04:01", "01:01"]}, - "NA12883" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["01:02", "05:01"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["01:01", "04:01"]}, - "NA12884" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:01"]}, - "NA12885" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["05:01", "07:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["03:01", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12886" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["07:01", "05:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["02:01", "03:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12887" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12888" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["07:01", "03:04"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12889" : {"A" : ["03:01", "03:01"], "B" : ["07:02", "44:02"], "C" : ["05:01", "07:02"], "DQA1" : ["03:03", "01:02"], "DQB1" : ["03:01", "06:02"], "DRB1" : ["15:01", "04:01"]}, - "NA12890" : {"A" : ["03:01", "02:01"], "B" : ["44:03", "15:01"], "C" : ["16:01", "03:04"], "DQA1" : ["03:01", "02:01"], "DQB1" : ["03:02", "02:02"], "DRB1" : ["04:03", "07:01"]}, - "NA12891" : {"A" : ["24:02", "01:01"], "B" : ["08:01", "07:02"], "C" : ["07:02", "07:01"], "DQA1" : ["05:01", "01:02"], "DQB1" : ["06:02", "02:01"], "DRB1" : ["03:01", "15:01"]}, - "NA12892" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "04:01"], "DQA1" : ["01:01", "01:01"], "DQB1" : ["05:01", "05:01"], "DRB1" : ["01:01", "01:01"]}, - "NA12893" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:01"]} - } - -# CEPH pedigree (17 family members) -pedigree = { - "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]}, - "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]}, - "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]}, - "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]}, - "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - } - - -""" -""" -def test_HLA_genotyping(reference_type, - hla_list, - aligners, - query_genomes, - exclude_allele_list, - num_mismatch, - verbose): - # Current script directory - curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping)) - ex_path = os.path.dirname(curr_script) - - if not os.path.exists("illumina/HLA"): - print >> sys.stderr, "Error: illumina/HLA data is needed (please send an email to infphilo@gmail.com for getting the data)" - sys.exit(1) - - num_test, num_success = 0, 0 - for genome in sorted(gold_allele_info.keys()): - if not genome in query_genomes: - continue - genes = gold_allele_info[genome] - read_fname_1, read_fname_2 = "illumina/HLA/%s.fished_1.fq" % genome, "illumina/HLA/%s.fished_2.fq" % genome - if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2): - continue - print >> sys.stderr, genome - cmd_aligners = ['.'.join(aligners[i]) for i in range(len(aligners))] - test_hla_script = os.path.join(ex_path, "hisat2_test_HLA_genotyping.py") - for gene in sorted(genes.keys()): - if not gene in hla_list: - continue - alleles = genes[gene] - print >> sys.stderr, "\t%s - %s" % (gene, ' / '.join(alleles)) - test_hla_cmd = [test_hla_script, - "--reference-type", reference_type, - "--hla-list", gene, - "--aligner-list", ','.join(cmd_aligners), - "--reads", "%s,%s" % (read_fname_1, read_fname_2), - "--best-alleles", - "--exclude-allele-list", ','.join(exclude_allele_list), - "--num-mismatch", str(num_mismatch)] - - if verbose: - print >> sys.stderr, ' '.join(test_hla_cmd) - - proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) - num_test += 2 - test_alleles = set() - for line in proc.stdout: - print "\t\t", line, - model, allele = line.split()[:2] - if model != "SingleModel": - continue - allele = allele.split('*')[1] - allele = ':'.join(allele.split(':')[:2]) - test_alleles.add(allele) - proc.communicate() - for allele in alleles: - if allele in test_alleles: - num_success += 1 - - print >> sys.stderr, "%d/%d (%.2f%%)" % (num_success, num_test, num_success * 100.0 / num_test) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='test HLA genotyping for Platinum Genomes') - parser.add_argument("--reference-type", - dest="reference_type", - type=str, - default="gene", - help="Reference type: gene, chromosome, and genome (default: gene)") - parser.add_argument("--hla-list", - dest="hla_list", - type=str, - default="A,B,C,DQA1,DQB1,DRB1", - help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)") - parser.add_argument("--aligner-list", - dest="aligners", - type=str, - default="hisat2.graph", - help="A comma-separated list of aligners (default: hisat2.graph)") - genomes_default = ','.join(gold_allele_info.keys()) - parser.add_argument("--genome-list", - dest="genome_list", - type=str, - default=genomes_default, - help="A comma-separated list of genomes (default: %s)" % genomes_default) - parser.add_argument("--exclude-allele-list", - dest="exclude_allele_list", - type=str, - default="", - help="A comma-separated list of allleles to be excluded") - parser.add_argument("--num-mismatch", - dest="num_mismatch", - type=int, - default=0, - help="Maximum number of mismatches per read alignment to be considered (default: 0)") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - - args = parser.parse_args() - - if not args.reference_type in ["gene", "chromosome", "genome"]: - print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) - sys.exit(1) - args.hla_list = args.hla_list.split(',') - if args.aligners == "": - print >> sys.stderr, "Error: --aligners must be non-empty." - sys.exit(1) - args.aligners = args.aligners.split(',') - for i in range(len(args.aligners)): - args.aligners[i] = args.aligners[i].split('.') - args.genome_list = args.genome_list.split(',') - args.exclude_allele_list = args.exclude_allele_list.split(',') - - test_HLA_genotyping(args.reference_type, - args.hla_list, - args.aligners, - args.genome_list, - args.exclude_allele_list, - args.num_mismatch, - args.verbose) diff --git a/hisatgenotype_scripts/hisatgenotype_convert_codis.py b/hisatgenotype_scripts/hisatgenotype_convert_codis.py deleted file mode 100755 index 415a42c8..00000000 --- a/hisatgenotype_scripts/hisatgenotype_convert_codis.py +++ /dev/null @@ -1,654 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import os, sys, subprocess, re -import inspect, operator -from copy import deepcopy -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common -try: - import openpyxl -except ImportError: - print >> sys.stderr, "Error: please install openpyxl by running 'pip install openpyxl'." - sys.exit(1) - - -# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm - -orig_CODIS_seq = { - "CSF1PO" : - # http://www.cstl.nist.gov/biotech/strbase/str_CSF1PO.htm - # allele 13: 5:150076172-150076490 - (samtools faidx genome.fa - GRCh38) - ["[AGAT]13", - "AACCTGAGTCTGCCAAGGACTAGCAGGTTGCTAACCACCCTGTGTCTCAGTTTTCCTACCTGTAAAATGAAGATATTAACAGTAACTGCCTTCATAGATAGAAGATAGATAGATT", # left flanking sequence - "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", # STR - "AGGAAGTACTTAGAACAGGGTCTGACACAGGAAATGCTGTCCAAGTGTGCACCAGGAGATAGTATCTGAGAAGGCTCAGTCTGGCACCATGTGGGTTGGGTGGGAACCTGGAGGCTGGAGAATGGGCTGAAGATGGCCAGTGGTGTGTGGAA"], # right flanking sequence - - "FGA" : - # http://www.cstl.nist.gov/biotech/strbase/str_FGA.htm - # allele 22: 4:154587696-154587891 - - ["[TTTC]3TTTTTTCT[CTTT]14CTCC[TTCC]2", - "GCCCCATAGGTTTTGAACTCACAGATTAAACTGTAACCAAAATAAAATTAGGCATATTTACAAGCTAG", - "TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC", - "TTTCTTCCTTTCTTTTTTGCTGGCAATTACAGACAAATCA"], - - "TH01" : - # http://www.cstl.nist.gov/biotech/strbase/str_TH01.htm - # allele 7: 11:2170990-2171176 + - ["[AATG]7", - "GTGGGCTGAAAAGCTCCCGATTATCCAGCCTGGCCCACACAGTCCCCTGTACACAGGGCTTCCGAGTGCAGGTCACAGGGAACACAGACTCCATGGTG", - "AATGAATGAATGAATGAATGAATGAATG", - "AGGGAAATAAGGGAGGAACAGGCCAATGGGAATCACCCCAGAGCCCAGATACCCTTTGAAT"], - - "TPOX" : - # http://www.cstl.nist.gov/biotech/strbase/str_TPOX.htm - # allele 8: 2:1489617-1489848 - ["[AATG]8", - "ACTGGCACAGAACAGGCACTTAGGGAACCCTCACTG", - "AATGAATGAATGAATGAATGAATGAATGAATG", - "TTTGGGCAAATAAACGCTGACAAGGACAGAAGGGCCTAGCGGGAAGGGAACAGGAGTAAGACCAGCGCACAGCCCGACTTGTGTTCAGAAGACCTGGGATTGGACCTGAGGAGTTCAATTTTGGATGAATCTCTTAATTAACCTGTGGGGTTCCCAGTTCCTCC"], - - "VWA" : - # http://www.cstl.nist.gov/biotech/strbase/str_VWA.htm - # allele unknown: 12:5983938-5984087 - - ["TCTA[TCTG]5[TCTA]11TCCA TCTA", - "CCCTAGTGGATGATAAGAATAATCAGTATGTGACTTGGATTGA", - "TCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA", - "TCCATCCATCCTATGTATTTATCATCTGTCC"], - - "D3S1358" : - # http://www.cstl.nist.gov/biotech/strbase/str_D3S1358.htm - # allele unknown: 3:45540713-45540843 + - ["TCTATCTG[TCTA]14", - "ATGAAATCAACAGAGGCTTGCATGTA", - "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", - "TGAGACAGGGTCTTGCTCTGTCACCCAGATTGGACTGCAGT"], - - "D5S818" : - # http://www.cstl.nist.gov/biotech/strbase/str_D5S818.htm - # allele 11: 5:123775504-123775638 - - ["[AGAT]11", - "GGTGATTTTCCTCTTTGGTATCCTTATGTAATATTTTGA", - "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", - "AGAGGTATAAATAAGGATACAGATAAAGATACAAATGTTGTAAACTGTGGCT"], - - "D7S820" : - # http://www.cstl.nist.gov/biotech/strbase/str_D7S820.htm - # allele 13: 7:84160125-84160367 - - ["[GATA]13", - "ATGTTGGTCAGGCTGACTATGGAGTTATTTTAAGGTTAATATATATAAAGGGTATGATAGAACACTTGTCATAGTTTAGAACGAACTAAC", - "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA", - "GACAGATTGATAGTTTTTTTTAATCTCACTAAATAGTCTATAGTAAACATTTAATTACCAATATTTGGTGCAATTCTGTCAATGAGGATAAATGTGGAATC"], - - "D8S1179" : - # http://www.cstl.nist.gov/biotech/strbase/str_D8S1179.htm - # allele 13: 8:124894838-124895018 + - ["[TCTA]1[TCTG]1[TCTA]11", - "TTTTTGTATTTCATGTGTACATTCGTA", - "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", - "TTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAAATGAACTAATTATAGCTACG"], - - "D13S317" : - # http://www.cstl.nist.gov/biotech/strbase/str_D13S317.htm - # Perhaps, allele 11: 13:82147921-82148112 + - ["[TATC]11A", - "ATCACAGAAGTCTGGGATGTGGAGGAGAGTTCATTTCTTTAGTGGGCATCCGTGACTCTCTGGACTCTGACCCATCTAACGCCTATCTGTATTTACAAATACAT", - "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCA", - "ATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGGCTGCC"], - - "D16S539" : - # http://www.cstl.nist.gov/biotech/strbase/str_D16S539.htm - # allele 11: 16:86352518-86352805 + - ["[GATA]11", - "GGGGGTCTAAGAGCTTGTAAAAAGTGTACAAGTGCCAGATGCTCGTTGTGCACAAATCTAAATGCAGAAAAGCACTGAAAGAAGAATCCAGAAAACCACAGTTCCCATTTTTATATGGGAGCAAACAAAGGCAGATCCCAAGCTCTTCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTG", - "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA", - "TCATTGAAAGACAAAACAGAGATGGATGATAGATACATGCTTACAGATGCACACACAAAC"], - - "D18S51" : - # http://www.cstl.nist.gov/biotech/strbase/str_D18S51.htm - # allele 18: 18:63281611-63281916 + - ["[AGAA]18", - "GAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTC", - "AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA", - "AAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTAAGACATCTCCACACACCAGAGAAGTTAATTTTAATTTTAACATGTTAAGAACAGAGAGAAGCCAACATGTCCACCTTAGGCTGACGGTTTGTTTATTTGTGTTGTTGCTGGTAGTCGGGTTTG"], - - "D21S11" : - # http://www.cstl.nist.gov/biotech/strbase/str_D21S11.htm - # Perhaps, allele 29: 21:19181945-19182165 + - ["[TCTA]4[TCTG]6[TCTA]3TA[TCTA]3TCA[TCTA]2TCCATA[TCTA]11", - "GTGAGTCAATTCCCCAAGTGAATTGCCT", - "TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", - "TCGTCTATCTATCCAGTCTATCTACCTCCTATTAGTCTGTCTCTGGAGAACATTGACTAATACAAC"], - - # "AMEL" - http://www.cstl.nist.gov/biotech/strbase/jpg_amel.htm - # X chromosome has 6 bp deletion and Y chromosome doesn't - "AMELX" : - ["", - "TGTTGATTCTTTATCCCAGATGTTTCTCAAGTGG", # chromosome X at 11296898 - "", - ""], - - "AMELY" : - ["", - "AGAAACCACTTTATTTGGGATGAAGAATCCACC", # chromosome Y at 6869902 - "", - ""] -} - -CODIS_ref_name = {} - - -""" -""" -def get_flanking_seqs(seq, - flank_len = 500): - def align_seq(seq): - aligner_cmd = ["hisat2", - "--score-min", "C,0", - "--no-unal", - "-x", "grch38/genome", - "-c", seq] - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - chr, left, right, strand = "", -1, -1, '+' - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - allele_id, flag, chr, left, _, cigar_str = cols[:6] - assert cigar_str[-1] == 'M' - left = int(left) - flag = int(flag) - strand = '-' if flag & 0x10 else '+' - assert cigar_str == ("%dM" % len(seq)) - right = left + len(seq) - break - - assert chr != "" and left >= 0 and right > left - return chr, left, right, strand - - chr, left, right, strand = align_seq(seq) - left_flank_seq, right_flank_seq = "", "" - if left > 1: - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, max(1, left - flank_len), left - 1)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - left_flank_seq += line - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + flank_len - 1)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - right_flank_seq += line - - if strand == '-': - left_flank_seq, right_flank_seq = typing_common.reverse_complement(right_flank_seq), typing_common.reverse_complement(left_flank_seq) - - chr, _, _, _ = align_seq(left_flank_seq + seq + right_flank_seq) - assert chr != "" - - return left_flank_seq, right_flank_seq - - - -""" -""" -def get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j): - if repeat_i == repeat_j: - # DK - experimental SW alignment - min_diff = sys.maxint - for repeat_num_i in repeat_nums_i: - for repeat_num_j in repeat_nums_j: - min_diff = min(abs(repeat_num_i - repeat_num_j), min_diff) - equal_score = -min_diff / 10.0 + (len(repeat_nums_i) + len(repeat_nums_j)) / 100.0 - equal_score = max(min(0.0 if min_diff == 0 else -0.1, equal_score), -0.9) - - # DK - just for now - equal_score = 0 - - return equal_score - elif repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1]): - return -1 - else: - return -2 - - -""" -Smith Waterman Algorithm -""" -def SW_alignment(allele_i, allele_j): - n, m = len(allele_i), len(allele_j) - a = [[-(i+j) if i == 0 or j == 0 else 0 for j in range(m + 1)] for i in range(n + 1)] - - # Fill 2D array - for i in range(n): - repeat_i, repeat_nums_i = allele_i[i] - for j in range(m): - repeat_j, repeat_nums_j = allele_j[j] - equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j) - a[i+1][j+1] = max(a[i][j+1] - 1, a[i+1][j] - 1, a[i][j] + equal_score) - - return a, n, m - - -""" -""" -def combine_alleles(backbone_allele, add_allele): - allele_i, allele_j = backbone_allele, add_allele - a, n, m = SW_alignment(allele_i, allele_j) - - # Back tracking - new_backbone_allele = [] - i, j = n - 1, m - 1 - while i >= 0 or j >= 0: - if i < 0: - repeat_j, repeat_nums_j = allele_j[j] - new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])]) - j -= 1 - elif j < 0: - repeat_i, repeat_nums_i = allele_i[i] - new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])]) - i -= 1 - else: - repeat_i, repeat_nums_i = allele_i[i] - repeat_j, repeat_nums_j = allele_j[j] - equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j) - if a[i][j+1] - 1 == a[i+1][j+1]: - new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])]) - i -= 1 - elif a[i+1][j] - 1 == a[i+1][j+1]: - new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])]) - j -= 1 - else: - assert a[i][j] + equal_score == a[i+1][j+1] - if repeat_i == repeat_j: - new_backbone_allele.append([repeat_i, repeat_nums_i | repeat_nums_j]) - else: - assert repeat_nums_i == repeat_nums_j - assert repeat_nums_i == set([1]) - new_backbone_allele.append([repeat_i | repeat_j, repeat_nums_i | repeat_nums_j]) - i -= 1 - j -= 1 - - new_backbone_allele = new_backbone_allele[::-1] - return new_backbone_allele - - -""" -""" -def msf_alignment(backbone_allele, allele): - allele_i, allele_j = backbone_allele, allele - a, n, m = SW_alignment(allele_i, allele_j) - - # Back tracking - allele_seq, backbone_seq = "", "" - i, j = n - 1, m - 1 - while i >= 0 or j >= 0: - assert i >= 0 - repeats_i, repeat_nums_i = allele_i[i] - repeat_i = "" - max_repeat = "" - for repeat_str in repeats_i: - if len(repeat_str) > len(repeat_i): - repeat_i = repeat_str - repeat_num_i = max(repeat_nums_i) - if j < 0: - allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq - backbone_seq = repeat_i * repeat_num_i + backbone_seq - i -= 1 - else: - repeats_j, repeat_nums_j = allele_j[j] - assert len(repeats_j) == 1 and len(repeat_nums_j) == 1 - repeat_j, repeat_num_j = list(repeats_j)[0], list(repeat_nums_j)[0] - equal_score = get_equal_score(repeats_i, repeat_nums_i, repeats_j, repeat_nums_j) - if a[i][j+1] - 1 == a[i+1][j+1]: - allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq - backbone_seq = repeat_i * repeat_num_i + backbone_seq - i -= 1 - else: - assert a[i][j] + equal_score == a[i+1][j+1] - if repeat_i == repeat_j: - add_seq = repeat_i * repeat_num_j - dot_seq = '.' * (len(repeat_i) * (repeat_num_i - repeat_num_j)) - allele_seq = add_seq + dot_seq + allele_seq - add_seq = repeat_i * repeat_num_i - backbone_seq = add_seq + backbone_seq - else: - assert repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1]) - dot_seq = '.' * (len(repeat_i) - len(repeat_j)) - allele_seq = repeat_j + dot_seq + allele_seq - backbone_seq = repeat_i + backbone_seq - i -= 1 - j -= 1 - - return allele_seq, backbone_seq - - -""" -Extract multiple sequence alignments -""" -def extract_msa(base_dname, - base_fname, - locus_list, - min_freq, - verbose): - # Download human genome and HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index(ex_path) - - # Load allele frequency information - allele_freq = {} - if min_freq > 0.0: - excel = openpyxl.load_workbook("hisatgenotype_db/CODIS/NIST-US1036-AlleleFrequencies.xlsx") - sheet = excel.get_sheet_by_name(u'All data, n=1036') - for col in range(2, 100): - locus_name = sheet.cell(row = 3, column = col).value - if not locus_name: - break - locus_name = locus_name.encode('ascii','ignore') - locus_name = locus_name.upper() - assert locus_name not in allele_freq - allele_freq[locus_name] = {} - - for row in range(4, 101): - allele_id = sheet.cell(row = row, column = 1).value - allele_id = str(allele_id) - freq = sheet.cell(row = row, column = col).value - if not freq: - continue - allele_freq[locus_name][allele_id] = float(freq) - excel.close() - - CODIS_seq = orig_CODIS_seq - if len(locus_list) > 0: - new_CODIS_seq = {} - for locus_name, fields in CODIS_seq.items(): - if locus_name in locus_list: - new_CODIS_seq[locus_name] = fields - CODIS_seq = new_CODIS_seq - - # Add some additional sequences to allele sequences to make them reasonably long for typing and assembly - for locus_name, fields in CODIS_seq.items(): - _, left_seq, repeat_seq, right_seq = fields - allele_seq = left_seq + repeat_seq + right_seq - left_flank_seq, right_flank_seq = get_flanking_seqs(allele_seq) - CODIS_seq[locus_name][1] = left_flank_seq + left_seq - CODIS_seq[locus_name][3] = right_seq + right_flank_seq - - print >> sys.stderr, "%s is found on the reference genome (GRCh38)" % locus_name - - for locus_name in CODIS_seq.keys(): - alleles = [] - for line in open("hisatgenotype_db/CODIS/codis.dat"): - locus_name2, allele_id, repeat_st = line.strip().split('\t') - if locus_name != locus_name2: - continue - if min_freq > 0.0: - assert locus_name in allele_freq - if allele_id not in allele_freq[locus_name] or \ - allele_freq[locus_name][allele_id] < min_freq: - continue - - alleles.append([allele_id, repeat_st]) - - # From [TTTC]3TTTTTTCT[CTTT]20CTCC[TTCC]2 - # To [['TTTC', [3]], ['TTTTTTCT', [1]], ['CTTT', [20]], ['CTCC', [1]], ['TTCC', [2]]] - def read_allele(repeat_st): - allele = [] - s = 0 - while s < len(repeat_st): - ch = repeat_st[s] - if ch == ' ': - s += 1 - continue - assert ch in "[ACGT" - if ch == '[': - s += 1 - repeat = "" - while s < len(repeat_st): - nt = repeat_st[s] - if nt in "ACGT": - repeat += nt - s += 1 - else: - assert nt == ']' - s += 1 - break - assert s < len(repeat_st) - num = 0 - while s < len(repeat_st): - digit = repeat_st[s] - if digit.isdigit(): - num = num * 10 + int(digit) - s += 1 - else: - break - assert num > 0 - allele.append([set([repeat]), set([num])]) - else: - repeat = "" - while s < len(repeat_st): - nt = repeat_st[s] - if nt in "ACGT": - repeat += nt - s += 1 - else: - assert nt == ' ' or nt == '[' - break - allele.append([set([repeat]), set([1])]) - - # Sanity check - cmp_repeat_st = "" - for repeats, repeat_nums in allele: - repeat = list(repeats)[0] - repeat_num = list(repeat_nums)[0] - if repeat_num > 1 or locus_name == "D8S1179": - cmp_repeat_st += "[" - cmp_repeat_st += repeat - if repeat_num > 1 or locus_name == "D8S1179": - cmp_repeat_st += "]%d" % repeat_num - - assert repeat_st.replace(' ', '') == cmp_repeat_st.replace(' ', '') - return allele - - alleles = [[allele_id, read_allele(repeat_st)] for allele_id, repeat_st in alleles] - - def to_sequence(repeat_st): - sequence = "" - for repeats, repeat_nums in repeat_st: - repeat = list(repeats)[0] - repeat_num = list(repeat_nums)[0] - sequence += (repeat * repeat_num) - return sequence - - def remove_redundant_alleles(alleles): - seq_to_ids = {} - new_alleles = [] - for allele_id, repeat_st in alleles: - allele_seq = to_sequence(repeat_st) - if allele_seq in seq_to_ids: - print >> sys.stderr, "Warning) %s: %s has the same sequence as %s" % \ - (locus_name, allele_id, seq_to_ids[allele_seq]) - continue - if allele_seq not in seq_to_ids: - seq_to_ids[allele_seq] = [allele_id] - else: - seq_to_ids[allele_seq].append(allele_id) - new_alleles.append([allele_id, repeat_st]) - - return new_alleles - - alleles = remove_redundant_alleles(alleles) - - allele_seqs = [[allele_id, to_sequence(repeat_st)] for allele_id, repeat_st in alleles] - - ref_allele_st, ref_allele_left, ref_allele, ref_allele_right = CODIS_seq[locus_name] - ref_allele_st = read_allele(ref_allele_st) - for allele_id, allele_seq in allele_seqs: - if ref_allele == allele_seq: - CODIS_ref_name[locus_name] = allele_id - break - - # Add GRCh38 allele - if locus_name not in CODIS_ref_name: - allele_id = "GRCh38" - CODIS_ref_name[locus_name] = allele_id - allele_seqs = [[allele_id, ref_allele]] + allele_seqs - alleles = [[allele_id, ref_allele_st]] + alleles - - print >> sys.stderr, "%s: %d alleles with reference allele as %s" % (locus_name, len(alleles), CODIS_ref_name[locus_name]) - if verbose: - print >> sys.stderr, "\t", ref_allele_left, ref_allele, ref_allele_right - for allele_id, allele in alleles: - print >> sys.stderr, allele_id, "\t", allele - - # Create a backbone sequence - assert len(alleles) > 0 - backbone_allele = deepcopy(alleles[-1][1]) - for allele_id, allele_st in reversed(alleles[:-1]): - if verbose: - print >> sys.stderr - print >> sys.stderr, allele_id - print >> sys.stderr, "backbone :", backbone_allele - print >> sys.stderr, "allele :", allele_st - backbone_allele = combine_alleles(backbone_allele, allele_st) - msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st) - if verbose: - print >> sys.stderr, "combined backbone:", backbone_allele - print >> sys.stderr, "msf_allele_seq :", msf_allele_seq - print >> sys.stderr, "msf_backbone_seq:", msf_backbone_seq - print >> sys.stderr - - allele_dic = {} - for allele_id, allele_seq in allele_seqs: - allele_dic[allele_id] = allele_seq - - allele_repeat_msf = {} - for allele_id, allele_st in alleles: - msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st) - allele_repeat_msf[allele_id] = msf_allele_seq - - # Sanity check - assert len(allele_dic) == len(allele_repeat_msf) - repeat_len = None - for allele_id, repeat_msf in allele_repeat_msf.items(): - if not repeat_len: - repeat_len = len(repeat_msf) - else: - assert repeat_len == len(repeat_msf) - - # Creat full multiple sequence alignment - ref_allele_id = CODIS_ref_name[locus_name] - allele_msf = {} - for allele_id, repeat_msf in allele_repeat_msf.items(): - allele_msf[allele_id] = ref_allele_left + repeat_msf + ref_allele_right - - # Make sure the length of allele ID is short, less than 20 characters - max_allele_id_len = max([len(allele_id) for allele_id in allele_dic.keys()]) - assert max_allele_id_len < 20 - - # Write MSF (multiple sequence alignment file) - msf_len = len(ref_allele_left) + len(ref_allele_right) + repeat_len - msf_fname = "%s_gen.msf" % locus_name - msf_file = open(msf_fname, 'w') - for s in range(0, msf_len, 50): - for allele_id, msf in allele_msf.items(): - assert len(msf) == msf_len - allele_name = "%s*%s" % (locus_name, allele_id) - print >> msf_file, "%20s" % allele_name, - for s2 in range(s, min(msf_len, s + 50), 10): - print >> msf_file, " %s" % msf[s2:s2+10], - print >> msf_file - - if s + 50 >= msf_len: - break - print >> msf_file - msf_file.close() - - # Write FASTA file - fasta_fname = "%s_gen.fasta" % locus_name - fasta_file = open(fasta_fname, 'w') - for allele_id, allele_seq in allele_seqs: - gen_seq = ref_allele_left + allele_seq + ref_allele_right - print >> fasta_file, ">%s*%s %d bp" % (locus_name, allele_id, len(gen_seq)) - for s in range(0, len(gen_seq), 60): - print >> fasta_file, gen_seq[s:s+60] - fasta_file.close() - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Extract multiple sequence alignments for DNA Fingerprinting loci") - parser.add_argument("-b", "--base", - dest="base_fname", - type=str, - default="codis", - help="base filename (default: codis)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="base filename (default: empty)") - parser.add_argument("--min-freq", - dest="min_freq", - type=float, - default=0.0, - help="minimum allele frequency (default: 0.0)") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.base_fname.find('/') != -1: - elems = args.base_fname.split('/') - base_fname = elems[-1] - base_dname = '/'.join(elems[:-1]) - else: - base_fname = args.base_fname - base_dname = "" - if args.locus_list != "": - locus_list = args.locus_list.split(',') - else: - locus_list = [] - - extract_msa(base_dname, - base_fname, - locus_list, - args.min_freq, - args.verbose) - diff --git a/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py b/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py deleted file mode 100755 index c17d86c5..00000000 --- a/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import inspect, operator -from argparse import ArgumentParser, FileType - -# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm - -CODIS_loci = ["CSF1PO", "FGA", "TH01", "TPOX", "VWA", "D3S1358", "D5S818", "D7S820", "D8S1179", "D13S317", "D16S539", "D18S51", "D21S11"] - - -""" -## Download variant information from website -""" -def get_html(url): - download_cmd = ["wget", - "-O", "-", - url] - proc = subprocess.Popen(download_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - output = "" - for line in proc.stdout: - output += line - - return output - - -""" -Download CODIS data -""" -def download_codis(base_dname, - base_fname, - locus_list, - verbose): - # CODIS database base URL - base_url = "http://www.cstl.nist.gov/biotech/strbase" - - # Refer to Python's regular expression at https://docs.python.org/2/library/re.html - # 47.2 - allele_re = re.compile('>(\d+\.?\d?\"?\'*\(?\d*\.?\d?\"?\'*\)?\*?)[TTTC]4TTTT TT [CTTT]14[CTGT]3[CTTT]14 [CTTC]4[CTTT]3CTCC[TTCC]4 - # repeat_re = re.compile('^(\[[ACGT]+\]\d+|[ACGT]+)+$') - repeat_re = re.compile('^(\[[ACGT]+\]\d+|\[[ACGT]+\]|[ACGT]+|\s)+$') - # Remove extra tags - tag_re = re.compile('(<[^>]*>)') - nbsp_re = re.compile(' ') - quot_re = re.compile('"') - codis_data_file = open(base_fname + ".dat", 'w') - for locus_name in CODIS_loci: - if len(locus_list) > 0 and locus_name not in locus_list: - continue - url = "%s/str_%s.htm" % (base_url, locus_name) - content = get_html(url).split("\r\n") - content = map(lambda x: x.strip(), content) - content2 = [] - for line in content: - if line.startswith("> codis_data_file, "%s\t%s\t%s" % (locus_name, allele_id, repeat_st) - - codis_data_file.close() - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Extract multiple sequence alignments for DNA Fingerprinting loci") - parser.add_argument("-b", "--base", - dest="base_fname", - type=str, - default="codis", - help="base filename (default: codis)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="base filename (default: empty)") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.base_fname.find('/') != -1: - elems = args.base_fname.split('/') - base_fname = elems[-1] - base_dname = '/'.join(elems[:-1]) - else: - base_fname = args.base_fname - base_dname = "" - if args.locus_list != "": - locus_list = args.locus_list.split(',') - else: - locus_list = [] - - download_codis(base_dname, - base_fname, - locus_list, - args.verbose) - diff --git a/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py b/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py deleted file mode 100755 index b0b4d039..00000000 --- a/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py +++ /dev/null @@ -1,1061 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2016, Raymon Cao and Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import inspect, operator -import glob -from argparse import ArgumentParser, FileType - - -global gene_names -gene_names = ['cyp1a1','cyp1a2','cyp1b1','cyp2a6', - 'cyp2a13','cyp2b6','cyp2c8','cyp2c9', - 'cyp2c19','cyp2d6','cyp2e1','cyp2f1', - 'cyp2j2','cyp2r1','cyp2S1','cyp2w1', - 'cyp3a4','cyp3a5','cyp3a7','cyp3a43', - 'cyp4a11','cyp4a22','cyp4b1','cyp4f2', - 'cyp5a1','cyp8a1','cyp19a1','cyp21a2', - 'cyp26a1'] - -""" -Download variant information from website database -""" - -def get_html(url): - download_cmd = ["wget", - "-O", "-", - url] - proc = subprocess.Popen(download_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - output = "" - for line in proc.stdout: - output += line - - return output - - -def download_CYP(verbose): - print("Downloading data from:") - - # CYP database base URL - base_url = "http://www.cypalleles.ki.se" - - # Current script directory - curr_script = os.path.realpath(inspect.getsourcefile(download_CYP)) - ex_path = os.path.dirname(curr_script) - - # Refer to Python's regular expression at https://docs.python.org/2/library/re.html - cyp_re = re.compile('http://www.cypalleles.ki.se/cyp\w+.htm') - output = get_html(base_url) - cyp_urls = cyp_re.findall(output) - # Original list had duplicate urls, removes duplicates - cyp_urls = set(cyp_urls) - - os.system('mkdir cyp_var_files') - for cyp_url in cyp_urls: - cyp_gene_name = cyp_url.split('/')[-1] - cyp_gene_name = cyp_gene_name.split('.')[0] - - # Hardcoded for cyp21 database (has inconsistant url naming) - if cyp_gene_name.lower() == "cyp21".lower(): - cyp_gene_name = cyp_gene_name + "a2" - - # Changed to match all instances of "cyp" - if not re.compile("cyp[\d\w]+", re.IGNORECASE).search(cyp_gene_name): - continue - - # Open file to write on - cyp_file = open("cyp_var_files/%s.var" % (cyp_gene_name), 'w') - - print >> sys.stderr, cyp_url, cyp_gene_name - print >> cyp_file, cyp_url, cyp_gene_name - - cyp_output = get_html(cyp_url) - if cyp_output == "": - continue - - listA = cyp_output.split("") - for ind in range(len(tabRow)): - tabRow[ind] = tabRow[ind].replace("\r\n","") - - allele_name_re = re.compile(cyp_gene_name.upper() + '\*[\w\d]+') - varInfo_re = re.compile('-?\d+[ACGT]\>[ACGT]|-?\d+_?-?\d+?del[ACGT]+|-?\d+_?-?\d+?ins[ACGT]+|None') - - alleleName = allele_name_re.findall(tabRow[0]) - if len(alleleName) > 0: - alleleName = alleleName[0] - - # @RaymonFix - some databases have extra table, ignores headers (CYP2A6) - # @Daehwan - some databases (e.g. http://www.cypalleles.ki.se/cyp3a4.htm) - # have 2 rows of Nucleotide changes (cDNA and Gene), might need - # to look at all rows for snps - # - # @RaymonFix - look in 4th column for "Gene" nt changes first, then consider cDNA if applicable; updated re to remove "<>" formating expressions - - if cyp_url == 'http://www.cypalleles.ki.se/cyp21.htm': # Hardcoded for special format for cyp21a2 - try: - varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[1])) - except IndexError: - continue - - else: - try: - varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[3])) - if len(varInfo) == 0: - varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[2])) - except IndexError: - continue - - for varInd in range(len(varInfo)): - varInfo[varInd] = varInfo[varInd].replace('>','>') - - if 'None' in varInfo: - try: - assert len(varInfo) == 1 - except: - varInfo = filter(lambda a: a != 'None', varInfo) - - - if isinstance(alleleName, basestring): - print >> cyp_file, (str(alleleName) + "\t" + ','.join(varInfo)) - - cyp_file.close() - - -""" -Make MSF files from variants -""" - -def checkNTloc(fasta_fileName,var_fileName,gene_name): - print "\nGene: %s" % gene_name - seq = "" - for line in open(fasta_fileName,'r'): - if line[0] == '>': - continue - seq += line.strip() - - cyp_var_file = open(var_fileName,'r') - cyp_var_dict = makeVarDict(cyp_var_file) - cyp_var_file.close() - - print "len:", len(seq) - varsPos = set() - varsNeg = set() - - for varList in cyp_var_dict.values(): - for var in varList: - if ">" in var: # is SNP - posNt = int(var[:-3]) - ntChange = var[-3:].replace('>','') - assert len(ntChange) == 2 - for nt in ntChange: - assert nt in "ACGT" - - if posNt > 0: - varsPos.add(str(posNt) + '->' + ntChange[0]) - else: - assert posNt < 0 - varsNeg.add(str(posNt) + '->' + ntChange[0]) - - elif "del" in var: # is deletion - posNt = var.split('del')[0].split('_') - posNt = [int(p) for p in posNt] - ntDel = var.split('del')[1] - for nt in ntDel: - assert nt in "ACGT" - - if len(posNt) == 1: # single nt deletion - assert len(ntDel) == 1 - if posNt[0] > 0: - varsPos.add(str(posNt[0]) + '->' + ntDel) - else: - assert posNt[0] < 0 - varsNeg.add(str(posNt[0]) + '->' + ntDel) - - else: # mutliple nt deletion - assert len(posNt) == 2 - try: - assert posNt[1] - posNt[0] + 1 == len(ntDel) - except AssertionError: - print "Incorrect deletion format: %s , skipping variation" % (var) - '''sys.exit(1)''' - continue - ntDelList = list(ntDel) - for i in range(posNt[0],posNt[1] + 1): - if i > 0: - varsPos.add(str(i) + '->' + ntDelList.pop(0)) - else: - assert i < 0 - varsNeg.add(str(i) + '->' + ntDelList.pop(0)) - assert len(ntDelList) == 0 - - else: - assert ("ins" in var) or ("None" in var) - continue - - scorePos = {} # { position offset : number of alignments } for positive positions - for i in range(-len(seq), len(seq)): - align_score = 0 - for var in varsPos: - pos, base = var.split('->') - pos = int(pos) - - try: - seq[pos+i] - except IndexError: - continue - - if seq[pos+i] == base: - align_score += 1 - - scorePos[i] = align_score - oSetPos = max(scorePos.iteritems(), key=operator.itemgetter(1))[0] - print "Positive postitions offset: %d" % oSetPos - print "Score: %d out of %d\n" % (scorePos[oSetPos], len(varsPos)) - - - print "Checking negative position offset: %d" % (oSetPos + 1) - align_score = 0 - oSetNeg = oSetPos + 1 - for var in varsNeg: - pos, base = var.split('->') - pos = int(pos) - - try: - seq[pos + oSetNeg] - except IndexError: - continue - - if seq[pos + oSetNeg] == base: - align_score += 1 - print "Score: %d out of %d\n\n" % (align_score, len(varsNeg)) - - if len(varsNeg) == 0 and len(varsPos) != 0: - return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), 1.0, float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg)) - elif len(varsNeg) != 0 and len(varsPos) == 0: - return oSetPos, oSetNeg, 1.0, float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg)) - elif len(varsNeg) == 0 and len(varsPos) == 0: - return oSetPos, oSetNeg, 1.0, 1.0, 1.0 - else: - assert len(varsNeg) != 0 and len(varsPos) != 0 - return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg)) - - -def create_map(seq): - seq_map = {} - count = 0 - for i in range(len(seq)): - bp = seq[i] - if bp == '.': - continue - assert bp.upper() in "ACGT" - seq_map[count] = i - count += 1 - return seq_map - -def splitString(someStr,posList): - posList.insert(0,-1) - posList.append(len(someStr) - 1) - splitStr = [] - for i in range(len(posList) - 1): - left = posList[i] + 1 - right = posList[i+1] + 1 - splitStr.append(someStr[left:right]) - - return splitStr - -def extractSeq(faFile): - seq = "" - for line in faFile: - if line.startswith(">"): - continue - - seq += line.strip() - - return seq - -def makeVarDict(fname): - alleleVarDict = {} - - allLines = [line.strip() for line in fname] - - ref_al_id_present = False - for line in allLines[1:]: - if 'None' in line: - ref_al_id_present = True - - line_num = 0 - for line in allLines[1:]: - line_num += 1 - assert line.upper().startswith("CYP") - alleleName = line.split("\t")[0].upper() - - if (not ref_al_id_present) and line_num == 1: - varList = ['None'] - else: - try: - varList = line.split("\t")[1].split(',') - except IndexError: - continue - - try: - assert not alleleName in alleleVarDict - alleleVarDict[alleleName] = set(varList) - except: - print >> sys.stdout, ("Warning, %s allele is already represented" % alleleName) - alleleVarDict[alleleName] = alleleVarDict[alleleName] | set(varList) - - return alleleVarDict - -def makeSnp(oldSeq, pos, oldNt, newNt): - assert oldSeq[pos] == oldNt - newSeq = oldSeq[:pos] + newNt + oldSeq[pos+1:] - assert len(newSeq) == len(oldSeq) - return newSeq - -def makeDel(oldSeq, left, right, toDel): - assert right - left + 1 == len(toDel) - assert oldSeq[left:right + 1] == toDel - newSeq = oldSeq[:left] + '.'*len(toDel) + oldSeq[right + 1:] - assert len(newSeq) == len(oldSeq) - return newSeq - -def makeIns(oldSeq,left,right,toIns): - assert right - left - 1 >= len(toIns) - for nt in oldSeq[left + 1:right]: - assert nt == '.' - remDots = right - left - 1 - len(toIns) - newSeq = oldSeq[:left + 1] + toIns + '.'*remDots + oldSeq[right:] - assert len(newSeq) == len(oldSeq) - return newSeq - - -def makeMSF(gene_name, oSetPos, oSetNeg): - cyp_var_file = open("cyp_var_files/%s.var" % gene_name,'r') - cyp_var_dict = makeVarDict(cyp_var_file) - cyp_var_file.close() - - if len(cyp_var_dict) < 2: - print('\tOnly reference allele included, skipping gene') - return - - try: - blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name) - if len(blast_allele_var) > 0: - cyp_var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var) - except IOError: - print('\t%s blast file was skipped.' % gene_name) - - cyp_faFile = open("cyp_fasta/%s.fasta" % gene_name,'r') - cyp_seq = extractSeq(cyp_faFile) - cyp_faFile.close() - preBackbone_seq = '' - - - msfTable = {} - - # Building backbone structure (augment length with insertions) - longestIns = {} # { key = position : value = length } - for allele,varList in cyp_var_dict.items(): - for var in varList: - if not "ins" in var: - continue - pos = var.split('ins')[0].split('_') - pos = [int(p) for p in pos] - ntIns = var.split('ins')[1] - correctFormat = len(pos) == 2 and pos[1] - pos[0] == 1 - if not correctFormat: - correctFormat = len(pos) == 1 - try: - assert correctFormat - except: - print >> sys.stdout, "\tIncorrect format for insertion: variation %s on allele %s" % (var, allele) - continue - - # convert to position in string - if not 'GRCH38' in allele: - if pos[0] > 0: - pos = pos[0] + oSetPos - else: - pos = pos[0] + oSetNeg - else: - pos = pos[0] - - # Make dictionary of longest insertions - if not pos in longestIns: - longestIns[pos] = len(ntIns) - else: - if len(ntIns) > longestIns[pos]: - longestIns[pos] = len(ntIns) - - posInsList = sorted(longestIns.keys()) - - splitSeq = splitString(cyp_seq,posInsList) - posInsList = posInsList[1:-1] - - for i in range(len(posInsList)): - splitSeq[i] += '.' * longestIns[posInsList[i]] - - for subseq in splitSeq: - try: - assert len(subseq) > 0 and not subseq.startswith('.') - preBackbone_seq += subseq - except: - continue - # pre-backbone built - - - map_cyp = create_map(preBackbone_seq) # { Index of bp in original seq : Actual index in string } - - - for allele,varList in cyp_var_dict.items(): - for var in varList: - isSnp = False - isDel = False - isIns = False - - if ">" in var: - isSnp = True - elif "del" in var: - isDel = True - elif "ins" in var: - isIns = True - else: - assert("None" in var) - isRef = True - - if isSnp: - pos = int(var[:-3]) - dbPos = pos - ntChange = var[-3:].replace('>','') - assert len(ntChange) == 2 - for nt in ntChange: - assert nt in "ACGT" - - if not 'GRCH38' in allele: - if pos > 0: - pos = pos + oSetPos - else: - pos = pos + oSetNeg - - if pos < 0 or pos > len(cyp_seq) - 1: - print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - continue - - try: - assert(preBackbone_seq[map_cyp[pos]] == ntChange[0]) # nt at pos in seq must match database - except: - print >> sys.stdout, "\tWarning: position %d in sequence contains %s, but expected %s from database" % (dbPos, preBackbone_seq[map_cyp[pos]], ntChange[0]) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - continue - - # Adding to msf table - if not allele in msfTable: - msfTable[allele] = makeSnp(preBackbone_seq, map_cyp[pos], ntChange[0], ntChange[1]) - else: - msfTable[allele] = makeSnp(msfTable[allele], map_cyp[pos], ntChange[0], ntChange[1]) - - elif isDel: - pos = var.split('del')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707]) - pos.append(pos[0]) - assert len(pos) == 2 - dbPos = pos - ntDel = var.split('del')[1] - for nt in ntDel: - assert nt in "ACGT" - - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - - skipDel = False - for i in range(len(pos)): - if pos[i] < 0 or pos[i] > len(cyp_seq) - 1: - print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos[i]) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - skipDel = True - - if skipDel: - continue - - - try: - assert pos[1] - pos[0] + 1 == len(ntDel) - except: - print >> sys.stdout, "\tIncorrect deletion data with %s on allele %s. Skipping variation." % (var, allele) - continue - - try: - assert preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ] == ntDel - except: - print >> sys.stdout, "\tWarning, positions %d to %d in sequence contains %s, but expected %s from database" % \ - (dbPos[0], dbPos[1], preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ], ntDel) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - continue - - - # Adding to msf table - if not allele in msfTable: - msfTable[allele] = makeDel(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntDel) - else: - msfTable[allele] = makeDel(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntDel) - - - elif isIns: - pos = var.split('ins')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: - pos.append(pos[0] + 1) - assert len(pos) == 2 - dbPos = pos - try: - assert pos[1] - pos[0] == 1 - except AssertionError: - print >> sys.stdout, "\tIncorrect insertion data with %s on allele %s. Skipping variation." % (var, allele) - continue - ntIns = var.split('ins')[1] - for nt in ntIns: - assert nt in "ACGT" - - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - - skipIns = False - for i in range(len(pos)): - if pos[i] < 0 or pos[i] > len(cyp_seq) - 1: - print >> sys.stdout, "Warning: position %d out of bounds" % (dbPos[i]) - print >> sys.stdout, "\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - skipIns = True - - if skipIns: - continue - - - # Adding to msf table - if not allele in msfTable: - msfTable[allele] = makeIns(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntIns) - else: - msfTable[allele] = makeIns(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntIns) - - - else: - assert isRef - assert not allele in msfTable - msfTable[allele] = preBackbone_seq - - # Sanity checking - seq_len = 0 - for allele, msf_seq in msfTable.items(): - if seq_len == 0: - seq_len = len(msf_seq) - else: - assert seq_len == len(msf_seq) - assert seq_len > 0 - - # Follow MSF style of IMGT/HLA database - msfFile = open('cyp_msf/%s_gen.msf' % gene_name[3:].upper(),'w') - for i in range(0, seq_len, 50): - for allele, msf_seq in msfTable.items(): - output = "%12s" % allele[3:].upper() - for j in range(i, i+50, 10): - if j >= seq_len: - break - if j == i: - output += "\t" - else: - output += " " - output += msf_seq[j:j+10] - print >> msfFile, output - print >> msfFile - - msfFile.close() - - -def build_msf_files(): - os.system('mkdir cyp_msf') - - oSetPos = 0 - oSetNeg = 0 - oSetScorePos = 0.0 - oSetScoreNeg = 0.0 - tot_score = 0.0 - - print('\nBuilding MSF files:') - for gene_name in gene_names: - oSetPos, oSetNeg, oSetScorePos, oSetScoreNeg, tot_score = checkNTloc("cyp_fasta/%s.fasta" % gene_name,"cyp_var_files/%s.var" % gene_name,gene_name) - if not (tot_score >= 0.95): - print "\tLess than 95% match, skipping gene." - continue - - makeMSF(gene_name, oSetPos, oSetNeg) - - -''' -Check MSF files against variants files -''' - -global incorrect_msf_entries -incorrect_msf_entries = [] - -def create_inv_map(seq): - seq_map = {} - count = 0 - for i in range(len(seq)): - bp = seq[i] - if bp == '.': - continue - assert bp.upper() in "ACGT" - seq_map[i] = count - count += 1 - return seq_map - -def readMSF(msf_fname): # { Allele name : MSF sequence } - msf_dict = {} - all_lines = [line for line in msf_fname] - for line in all_lines: - line = line.strip().replace(' ','') - if len(line) == 0 : continue - allele_name = 'CYP' + line.split('\t')[0] - msf_seq = line.split('\t')[1] - if not allele_name in msf_dict: - msf_dict[allele_name] = msf_seq - else: - msf_dict[allele_name] = msf_dict[allele_name] + msf_seq - - return msf_dict - -def msf_removeIns(ref_seq, al_seq): - assert len(ref_seq) == len(al_seq) - ins_ind_list = [] - for i in range(len(ref_seq)): - if ref_seq[i] == '.': - ins_ind_list.append(i) - - ori_ref_seq = ref_seq.replace('.','') - ori_al_seq = list(al_seq) - - for i in ins_ind_list: - ori_al_seq[i] = '-' - - ori_al_seq = ''.join(ori_al_seq).replace('-','') - - assert len(ori_ref_seq) == len(ori_al_seq) - return ori_ref_seq, ori_al_seq - -def msfToVarList(ref_seq, al_seq): - var_list = [] - - assert len(ref_seq) == len(al_seq) - for bp in ref_seq: assert bp in "ACGT." - for bp in al_seq: assert bp in "ACGT." - inv_map = create_inv_map(ref_seq) - - ins_re = re.compile('[ACGT]\.+') - ins_subStrPos = [(m.start(0), m.end(0)) for m in re.finditer(ins_re, ref_seq)] # list of duples of indicies of insertions in ref_seq - ins_pos_length = [(tup[0], tup[1] - tup[0] - 1) for tup in ins_subStrPos] - - for tup in ins_pos_length: - ins_pos, ins_length = tup[0], tup[1] - ins_seq = al_seq[ins_pos + 1: ins_pos + ins_length + 1] - ins_seq = ins_seq.replace('.','') - if len(ins_seq) == 0: - continue - ins_str_data = str(inv_map[tup[0]]) + '_' + str(inv_map[tup[0]] + 1) + 'ins' + ins_seq - var_list.append(ins_str_data) - - # insertions finished - - ori_ref_seq, ori_al_seq = msf_removeIns(ref_seq, al_seq) - - for i in range(len(ori_ref_seq)): - if ori_al_seq[i] == '.': - continue - elif ori_al_seq[i] != ori_ref_seq[i]: # snp - var_list.append(str(i) + ori_ref_seq[i] + '>' + ori_al_seq[i]) - - del_subStrPos = [(m.start(0), m.end(0)) for m in re.finditer(ins_re, ori_al_seq)] # list of duples of indicies of deletions in ori_al_seq - del_pos_length = [(tup[0], tup[1] - tup[0] - 1) for tup in del_subStrPos] - - for tup in del_pos_length: - del_pos, del_length = tup[0], tup[1] - del_seq = ori_ref_seq[del_pos + 1 : del_pos + del_length + 1] - if del_length == 1: - assert len(del_seq) == 1 - del_str_data = str(tup[0] + 1) + 'del' + del_seq - else: - del_str_data = str(tup[0] + 1) + '_' + str(tup[0] + tup[1]) + 'del' + del_seq - var_list.append(del_str_data) - - # deletions finished - - return var_list - -def checkMSFfile(gene_name, msf_fname, var_fname, fasta_filename): - oSetPos, oSetNeg, oSet_pos_score, oSet_neg_score, tot_score = checkNTloc(fasta_filename, var_fname, gene_name) - - try: - msf_file = open(msf_fname,'r') - msf_dict = readMSF(msf_file) # { Allele name : MSF sequence } - msf_file.close() - except IOError: - print("\t%s msf file was skipped.\n" % (gene_name)) - return - - var_file = open(var_fname,'r') - var_dict = makeVarDict(var_file) - var_file.close() - - try: - blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name) - if len(blast_allele_var) > 0: - var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var) - except IOError: - print('\t%s blast file was skipped.' % gene_name) - - fa_file = open(fasta_filename,'r') - oriSeq = extractSeq(fa_file) - fa_file.close() - - - # Find reference allele - ref_allele = '' - for allele_name in var_dict.keys(): - if len(var_dict[allele_name]) == 1 and list(var_dict[allele_name])[0] == "None": - assert ref_allele == '' - ref_allele = allele_name - assert not ref_allele == '' - - - # Check if ref allele seq in msf matches fasta - assert ref_allele in msf_dict - - try: - assert msf_dict[ref_allele].replace('.','') == oriSeq - print("Sequences match for reference allele %s" % ref_allele) - except AssertionError: - print("Warning: sequences do not match for reference allele %s" % ref_allele) - sys.exit(1) - - - # Check all alleles are included - try: - assert set([k.upper() for k in msf_dict.keys()]).issubset(set([k.upper() for k in var_dict.keys()])) - except AssertionError: - print("Extra alleles in MSF!\n") - print(sorted(msf_dict.keys())) - print("\n\n") - print(sorted(var_dict.keys())) - sys.exit(1) - - - # Convert from database positions to sequence positions (using offset) - for allele, var_list in var_dict.items(): - oSet_var_list = [] - for var in var_list: - if '>' in var: # snp - pos = int(var.split('>')[0][:-1]) - ntSnp = [var.split('>')[0][-1]] - ntSnp.append(var.split('>')[1]) - assert len(ntSnp) == 2 - if not 'GRCH38' in allele: - if pos > 0: - pos = pos + oSetPos - else: - pos = pos + oSetNeg - - if pos < 0 or pos > len(oriSeq) - 1: # out of bounds - continue - if oriSeq[pos] != ntSnp[0]: # mismatch - print('\tMismatch on variation %s' % var) - continue - - oSet_var = str(pos) + ntSnp[0] + '>' + ntSnp[1] - oSet_var_list.append(oSet_var) - - elif 'del' in var: # deletion - pos = var.split('del')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707]) - pos.append(pos[0]) - assert len(pos) == 2 - ntDel = var.split('del')[1] - for nt in ntDel: - assert nt in "ACGT" - - skipDel = False - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds - skipDel = True - if (oriSeq[ pos[0] : pos[1] + 1 ] != ntDel): # mismatch - print('\tMismatch on variation %s' % var) - continue - - if skipDel: - continue - - assert pos[1] - pos[0] + 1 == len(ntDel) - - oSet_var = 'del' + ntDel - if pos[0] == pos[1]: - oSet_var = str(pos[0]) + oSet_var - else: - oSet_var = str(pos[0]) + '_' + str(pos[1]) + oSet_var - - oSet_var_list.append(oSet_var) - - elif 'ins' in var: # insertion - pos = var.split('ins')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: - pos.append(pos[0] + 1) - assert len(pos) == 2 - try: - assert pos[1] - pos[0] == 1 - except AssertionError: - print('\tIncorrect insertion format on variation %s' % var) - continue - ntIns = var.split('ins')[1] - for nt in ntIns: - assert nt in "ACGT" - - skipIns = False - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds - skipIns = True - - if skipIns: - continue - - oSet_var = str(pos[0]) + '_' + str(pos[1]) + 'ins' + ntIns - oSet_var_list.append(oSet_var) - - else: - assert allele == ref_allele - assert var == 'None' - assert len(oSet_var_list) == 0 - oSet_var_list.append('None') - - var_dict[allele] = set(oSet_var_list) - - # Check variants created from MSF file against variants list - num_correct_alleles = 0 - for allele, msf_seq in msf_dict.items(): - if allele == ref_allele: - num_correct_alleles += 1 - continue - msf_var_list = msfToVarList(msf_dict[ref_allele], msf_seq) - '''print('\t' + str(var_dict[allele] == set(msf_var_list)) + '\t' + str(allele) + '\t' + str(msf_var_list))''' - - try: - assert var_dict[allele] == set(msf_var_list) - num_correct_alleles += 1 - except AssertionError: - incorrect_msf_entries.append(allele) - print('\n') - print('\t\tVar File:\t' + str(var_dict[allele])) - print('\t\tMSF File:\t' + str(set(msf_var_list))) - print('\t\tDifference:\t' + str(var_dict[allele] - set(msf_var_list)) + '\n') - '''sys.exit(1)''' - - print("\t%d out of %d alleles have correct msf sequences\n" % (num_correct_alleles, len(msf_dict))) - -def check_msf_files(): - print("\nChecking MSF files:") - - for gene_name in gene_names: - checkMSFfile(gene_name, 'cyp_msf/%s_gen.msf' % gene_name[3:].upper(), 'cyp_var_files/%s.var' % gene_name, 'cyp_fasta/%s.fasta' % gene_name) - - print('\n\n%d incorrect msf entries on alleles %s\n' % (len(incorrect_msf_entries), str(incorrect_msf_entries))) - - -""" -Write allele sequences to fasta for each gene -""" - -def writeGenFasta(gene_name, msf_fname, line_length): - try: - msf_file = open(msf_fname,'r') - msf_seq_dict = readMSF(msf_file) - msf_file.close() - except IOError: - print("\t%s msf file was skipped." % (gene_name)) - return - - gen_fasta_file = open('gen_fasta/%s_gen.fasta' % gene_name[3:].upper(), 'w') - - for allele, seq in msf_seq_dict.items(): - seq = seq.replace('.','') - print >> gen_fasta_file, ('>' + allele[3:].upper() + ' ' + str(len(seq)) + ' bp') - seq_lines = [seq[i:i+line_length] for i in range(0, len(seq), line_length)] - print >> gen_fasta_file, ('\n'.join(seq_lines)) - - gen_fasta_file.close() - print('%s_gen.fasta completed' % gene_name) - -def build_gen_fasta_files(): - os.system('mkdir gen_fasta') - - print("\nBuilding alleles sequence fasta files:") - for gene_name in gene_names: - writeGenFasta(gene_name, 'cyp_msf/%s_gen.msf' % gene_name[3:].upper(), 60) - - -""" -Run script -""" - -def extract_cyp_data(): - download_CYP(True) - build_msf_files() - check_msf_files() - build_gen_fasta_files() - -#################################################################################################### -## Debuging BLASTN alignment ref alleles - -def adjust_blast_vars(blast_vars_list,qry_pos): - if len(blast_vars_list) == 0: - return [] - - qry_pos = qry_pos - 1 - adj_blst_var_list = [] - - for var in blast_vars_list: - if '>' in var: # SNP - old_pos = int(var[:-3]) - adj_var = str(old_pos + qry_pos) + var[-3:] - adj_blst_var_list.append(adj_var) - elif 'del' in var: # deletion - old_pos = var.split('del')[0].split('_') - old_pos = [int(i) for i in old_pos] - old_pos = [i + qry_pos for i in old_pos] - if len(old_pos) == 1: - adj_var = str(old_pos[0]) + 'del' + var.split('del')[1] - else: - assert len(old_pos) == 2 - adj_var = str(old_pos[0]) + '_' + str(old_pos[1]) + 'del' + var.split('del')[1] - adj_blst_var_list.append(adj_var) - else: # insertion - assert 'ins' in var - old_pos = var.split('ins')[0].split('_') - old_pos = [int(i) for i in old_pos] - old_pos = [i + qry_pos for i in old_pos] - assert len(old_pos) == 2 and (old_pos[1] - old_pos[0] == 1) - adj_var = str(old_pos[0]) + '_' + str(old_pos[1]) + 'ins' + var.split('ins')[1] - adj_blst_var_list.append(adj_var) - - return adj_blst_var_list - -def extract_var_from_blast(cyp_blast_fname): - blastn_file = open(cyp_blast_fname,'r') - all_lines = [line.strip() for line in blastn_file if not (len(line.strip()) == 0 or line.strip().startswith('|'))] - blastn_file.close() - - id_match = [m.group(0) for l in all_lines[0:25] for m in [re.compile('.*(Identities.*).*').search(l)] if m][0] - id_match = id_match.split('%')[0].split(' (')[0].split('= ')[1].split('/') - id_match = [int(i) for i in id_match] - - # print(id_match) - assert len(id_match) == 2 and id_match[1] - id_match[0] >= 0 - if id_match[1] - id_match[0] == 0: - print('\tPerfect match using blastn') - return [] - - - start = -1 - end = -1 - for i in range(len(all_lines)): # Get rid of headers and footers - if all_lines[i].startswith('Score ='): - assert start == -1 - start = i - - if all_lines[i].startswith('Lambda'): - assert start != -1 and end == -1 - end = i - break - - all_lines = all_lines[start + 3 : end] - # print('\n'.join(all_lines)) - - blastn_var_list = [] - for i in range(0,len(all_lines),2): - qry_seq = '\t'.join(all_lines[i].split()) - qry_seq_pos = int(qry_seq.split('\t')[1]) - sbj_seq = '\t'.join(all_lines[i + 1].split()) - qry_seq = qry_seq.split('\t')[2].replace('-','.').upper() - sbj_seq = sbj_seq.split('\t')[2].replace('-','.').upper() - #print(qry_seq) - #print(sbj_seq) - - temp_var_list = msfToVarList(qry_seq, sbj_seq) - #print(str(qry_seq_pos) + '\t' + str(temp_var_list) + '\t' + str(adjust_blast_vars(temp_var_list,qry_seq_pos))) - temp_var_list = adjust_blast_vars(temp_var_list,qry_seq_pos) - blastn_var_list = blastn_var_list + temp_var_list - - return blastn_var_list - -# extract_var_from_blast('cyp_blast_alignment/cyp2d6_blast.align') - -extract_cyp_data() diff --git a/hisatgenotype_scripts/hisatgenotype_locus_samples.py b/hisatgenotype_scripts/hisatgenotype_locus_samples.py deleted file mode 100755 index 3de636a0..00000000 --- a/hisatgenotype_scripts/hisatgenotype_locus_samples.py +++ /dev/null @@ -1,354 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re, threading -import inspect -import random -import glob -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -# Platinum genomes - CEPH pedigree (17 family members) -CEPH_pedigree = { - "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]}, - "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]}, - "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]}, - "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]}, - "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - } - - - -""" -""" -class myThread(threading.Thread): - def __init__(self, - lock, - paths, - reference_type, - region_list, - num_editdist, - max_sample, - assembly, - out_dir, - genotype_results, - verbose): - threading.Thread.__init__(self) - self.lock = lock - self.paths = paths - self.reference_type = reference_type - self.region_list = region_list - self.num_editdist = num_editdist - self.max_sample = max_sample - self.assembly = assembly - self.out_dir = out_dir - self.genotype_results = genotype_results - self.verbose = verbose - - def run(self): - global work_idx - while True: - self.lock.acquire() - my_work_idx = work_idx - work_idx += 1 - self.lock.release() - if my_work_idx >= len(self.paths) or \ - my_work_idx >= self.max_sample: - return - worker(self.lock, - self.paths[my_work_idx], - self.reference_type, - self.region_list, - self.num_editdist, - self.assembly, - self.out_dir, - self.genotype_results, - self.verbose) - - -""" -""" -work_idx = 0 -def worker(lock, - path, - reference_type, - region_list, - num_editdist, - assembly, - out_dir, - genotype_results, - verbose): - fq_name = path.split('/')[-1] - read_dir = '/'.join(path.split('/')[:-1]) - genome = fq_name.split('.')[0] - if not fq_name.endswith("extracted.1.fq.gz"): - return - read_basename = fq_name[:fq_name.find("extracted.1.fq.gz")] - read_fname_1, read_fname_2 = "%s/%sextracted.1.fq.gz" % \ - (read_dir, read_basename), "%s/%sextracted.2.fq.gz" % (read_dir, read_basename) - - if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2): - return - lock.acquire() - print >> sys.stderr, genome - lock.release() - - for family, loci in region_list.items(): - test_hla_cmd = ["hisatgenotype_locus.py", - "--base", family] - if len(loci) > 0: - test_hla_cmd += ["--locus", ','.join(loci)] - test_hla_cmd += ["--num-editdist", str(num_editdist)] - test_hla_cmd += ["-1", read_fname_1, "-2", read_fname_2] - if assembly: - test_hla_cmd += ["--assembly"] - test_hla_cmd += ["--assembly-base"] - if out_dir != "": - test_hla_cmd += ["%s/%s" % (out_dir, genome)] - else: - test_hla_cmd += [genome] - - if verbose: - lock.acquire() - print >> sys.stderr, ' '.join(test_hla_cmd) - lock.release() - - proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - test_alleles = set() - output_list = [] - for line in proc.stdout: - line = line.strip() - if line.find("abundance") == -1: - continue - - rank, _, allele, _, abundance = line.split() - output_list.append([allele, float(abundance[:-2])]) - - lock.acquire() - for allele, abundance in output_list: - print >> sys.stdout, "%s\t%s\t%.2f" % (genome, allele, abundance) - genotype_results.append([genome, allele, abundance]) - sys.stdout.flush() - lock.release() - - -""" -""" -def genotyping(read_dir, - reference_type, - region_list, - num_editdist, - nthreads, - max_sample, - assembly, - out_dir, - verbose, - platinum_check): - for database_name in region_list: - # Extract variants, backbone sequence, and other sequeces - typing_common.extract_database_if_not_exists(database_name, - []) # locus_list - # Build HISAT2's graph index - typing_common.build_index_if_not_exists(database_name, - "hisat2", - "graph", - 1, # threads - verbose) - - if not os.path.exists(read_dir): - print >> sys.stderr, "Error: %s does not exist." % read_dir - sys.exit(1) - - if out_dir != "" and not os.path.exists(out_dir): - os.mkdir(out_dir) - - # fastq files - fq_fnames = glob.glob("%s/*.extracted.1.fq.gz" % read_dir) - - genotype_results = [] - - lock = threading.Lock() - threads = [] - for t in range(nthreads): - thread = myThread(lock, - fq_fnames, - reference_type, - region_list, - num_editdist, - max_sample, - assembly, - out_dir, - genotype_results, - verbose) - thread.start() - threads.append(thread) - - for thread in threads: - thread.join() - - - if platinum_check: - genotype_dic = {} - for genome, allele, abundance in genotype_results: - region, _ = allele.split('*') - if region not in genotype_dic: - genotype_dic[region] = {} - if genome not in genotype_dic[region]: - genotype_dic[region][genome] = [] - if len(genotype_dic[region][genome]) >= 2: - continue - # DK - debugging purposes - # if abundance < 0.15 * 100: - # continue - genotype_dic[region][genome].append([allele, abundance]) - - for region, region_genotype in genotype_dic.items(): - print >> sys.stderr, region - included, total = 0, 0 - for genome, genome_alleles in region_genotype.items(): - genome_alleles = set([allele for allele, _ in genome_alleles]) - if "father" in CEPH_pedigree[genome]: - assert "mother" in CEPH_pedigree[genome] - parents = [CEPH_pedigree[genome]["father"], CEPH_pedigree[genome]["mother"]] - else: - parents = [] - parent_allele_sets = [] - assert len(parents) in [0, 2] - if len(parents) == 2 and \ - parents[0] in region_genotype and \ - parents[1] in region_genotype: - for parent_allele, _ in region_genotype[parents[0]]: - for parent_allele2, _ in region_genotype[parents[1]]: - parent_allele_sets.append(set([parent_allele, parent_allele2])) - print >> sys.stderr, "\t", genome, genome_alleles, parent_allele_sets - if len(parent_allele_sets) > 0: - total += 1 - if genome_alleles in parent_allele_sets: - included += 1 - print >> sys.stderr, "\t%d / %d" % (included, total) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='genotyping on many samples') - parser.add_argument("--reference-type", - dest="reference_type", - type=str, - default="gene", - help="Reference type: gene, chromosome, and genome (default: gene)") - parser.add_argument("--region-list", - dest="region_list", - type=str, - default="", - help="A comma-separated list of regions (default: empty)") - parser.add_argument('--read-dir', - dest="read_dir", - type=str, - default="", - help='read directory (e.g. read_input)') - parser.add_argument("--num-editdist", - dest="num_editdist", - type=int, - default=2, - help="Maximum number of mismatches per read alignment to be considered (default: 2)") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument('--assembly', - dest='assembly', - action='store_true', - help='Perform assembly') - parser.add_argument("--max-sample", - dest="max_sample", - type=int, - default=sys.maxint, - help="Number of samples to be analyzed (default: sys.maxint)") - parser.add_argument("--out-dir", - dest="out_dir", - type=str, - default="", - help='Output directory (default: (empty))') - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument('--platinum-check', - dest='platinum_check', - action='store_true', - help='Check for concordance of platinum genomes') - - args = parser.parse_args() - - if args.read_dir == "": - print >> sys.stderr, "Error: please specify --read-dir." - sys.exit(1) - - if not args.reference_type in ["gene", "chromosome", "genome"]: - print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) - sys.exit(1) - - region_list = {} - if args.region_list != "": - for region in args.region_list.split(','): - region = region.split('.') - if len(region) < 1 or len(region) > 2: - print >> sys.stderr, "Error: --region-list is incorrectly formatted." - sys.exit(1) - - family = region[0].lower() - if len(region) == 2: - locus_name = region[1].upper() - if family not in region_list: - region_list[family] = set() - if len(region) == 2: - region_list[family].add(locus_name) - - genotyping(args.read_dir, - args.reference_type, - region_list, - args.num_editdist, - args.threads, - args.max_sample, - args.assembly, - args.out_dir, - args.verbose, - args.platinum_check) - diff --git a/hisatgenotype_scripts/run_extract_CP.sh b/hisatgenotype_scripts/run_extract_CP.sh deleted file mode 100755 index ceca077e..00000000 --- a/hisatgenotype_scripts/run_extract_CP.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.HLA.CP.extract.genome -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=40 -#SBATCH --mem=400G -#SBATCH --partition=lrgmem -#SBATCH --time=166:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/scripts/extract_reads.py --base-fname genotype_genome --reference-type genome --read-dir /home-1/dkim136@jhu.edu/aszalay1/genomes --out-dir CP_80 -p 40 --max-sample 80 --job-range 0,2 - diff --git a/hisatgenotype_scripts/run_extract_ILMN.sh b/hisatgenotype_scripts/run_extract_ILMN.sh deleted file mode 100755 index 3aaf0cbb..00000000 --- a/hisatgenotype_scripts/run_extract_ILMN.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.HLA.ILMN.extract.genome -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=17 -#SBATCH --mem=120G -#SBATCH --partition=shared -#SBATCH --time=166:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/scripts/extract_reads.py --base-fname genotype_genome --reference-type genome --read-dir /home-1/dkim136@jhu.edu/ssalzbe1/users/infphilo/platinum_genomes --out-dir ILMN -p 17 - diff --git a/hisatgenotype_scripts/run_genotype_build.sh b/hisatgenotype_scripts/run_genotype_build.sh deleted file mode 100755 index ac2a3363..00000000 --- a/hisatgenotype_scripts/run_genotype_build.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.genotype -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=4 -#SBATCH --mem=400G -#SBATCH --partition=lrgmem -#SBATCH --time=168:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/hisatgenotype_build_genome.py -p 4 --verbose --commonvar genome.fa genotype_genome diff --git a/hisatgenotype_scripts/run_hisat2_build.sh b/hisatgenotype_scripts/run_hisat2_build.sh deleted file mode 100755 index 15d25611..00000000 --- a/hisatgenotype_scripts/run_hisat2_build.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.genotype.hisat2-build -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=4 -#SBATCH --mem=400G -#SBATCH --partition=lrgmem -#SBATCH --time=168:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/hisat2/evaluation/tests/genotype - -/home-1/dkim136@jhu.edu/infphilo/hisat2/hisat2/hisat2-build -p 4 --snp genotype_genome.snp --haplotype genotype_genome.haplotype genotype_genome.fa genotype_genome diff --git a/hisatgenotype_scripts/run_type_CP.sh b/hisatgenotype_scripts/run_type_CP.sh deleted file mode 100755 index 4fd54ffd..00000000 --- a/hisatgenotype_scripts/run_type_CP.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.HLA.CP -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=24 -#SBATCH --mem=64G -#SBATCH --partition=shared -#SBATCH --time=12:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_CP_extract_genome_partial - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_CP_extract_genome_partial/hisat2_test_HLA_genotyping_CP.py CP --num-editdist 2 -p 24 > cp_hla.txt From d47e96e06552cba9462ab5b6e5bc3c4f52bc0992 Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Fri, 14 Feb 2020 16:40:52 -0600 Subject: [PATCH 2/5] Remove all hisatgenotype specific scripts --- hisatgenotype.py | 490 --- hisatgenotype_build_genome.py | 505 ---- hisatgenotype_extract_reads.py | 541 ---- hisatgenotype_extract_vars.py | 1299 -------- hisatgenotype_hla_cyp.py | 1671 ----------- hisatgenotype_locus.py | 2631 ----------------- hisatgenotype_modules/__init__.py | 0 .../hisatgenotype_assembly_graph.py | 1902 ------------ .../hisatgenotype_typing_common.py | 1552 ---------- hisatgenotype_scripts/compare_HLA.py | 147 - hisatgenotype_scripts/compare_HLA_Omixon.py | 129 - hisatgenotype_scripts/extract_Omixon_HLA.py | 115 - .../hisatgenotype_HLA_genotyping_PGs.py | 199 -- .../hisatgenotype_convert_codis.py | 654 ---- .../hisatgenotype_extract_codis_data.py | 166 -- .../hisatgenotype_extract_cyp_data.py | 1061 ------- .../hisatgenotype_locus_samples.py | 354 --- hisatgenotype_scripts/run_extract_CP.sh | 11 - hisatgenotype_scripts/run_extract_ILMN.sh | 11 - hisatgenotype_scripts/run_genotype_build.sh | 10 - hisatgenotype_scripts/run_hisat2_build.sh | 10 - hisatgenotype_scripts/run_type_CP.sh | 10 - 22 files changed, 13468 deletions(-) delete mode 100755 hisatgenotype.py delete mode 100755 hisatgenotype_build_genome.py delete mode 100755 hisatgenotype_extract_reads.py delete mode 100755 hisatgenotype_extract_vars.py delete mode 100755 hisatgenotype_hla_cyp.py delete mode 100755 hisatgenotype_locus.py delete mode 100644 hisatgenotype_modules/__init__.py delete mode 100755 hisatgenotype_modules/hisatgenotype_assembly_graph.py delete mode 100755 hisatgenotype_modules/hisatgenotype_typing_common.py delete mode 100755 hisatgenotype_scripts/compare_HLA.py delete mode 100755 hisatgenotype_scripts/compare_HLA_Omixon.py delete mode 100755 hisatgenotype_scripts/extract_Omixon_HLA.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_convert_codis.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_extract_codis_data.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py delete mode 100755 hisatgenotype_scripts/hisatgenotype_locus_samples.py delete mode 100755 hisatgenotype_scripts/run_extract_CP.sh delete mode 100755 hisatgenotype_scripts/run_extract_ILMN.sh delete mode 100755 hisatgenotype_scripts/run_genotype_build.sh delete mode 100755 hisatgenotype_scripts/run_hisat2_build.sh delete mode 100755 hisatgenotype_scripts/run_type_CP.sh diff --git a/hisatgenotype.py b/hisatgenotype.py deleted file mode 100755 index cf433b48..00000000 --- a/hisatgenotype.py +++ /dev/null @@ -1,490 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re, resource -import inspect, random -import math -from datetime import datetime, date, time -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -Align reads, and sort the alignments into a BAM file -""" -def align_reads(base_fname, - read_fnames, - fastq, - threads, - verbose): - aligner_cmd = ["hisat2", - "--no-unal", - "-p", str(threads), - "--no-spliced-alignment", - "--max-altstried", "64"] - aligner_cmd += ["-X", "1000"] - # aligner_cmd += ["--mm"] - aligner_cmd += ["-x", "%s" % base_fname] - - assert len(read_fnames) > 0 - if not fastq: - aligner_cmd += ["-f"] - single = len(read_fnames) == 1 - if single: - aligner_cmd += ["-U", read_fnames[0]] - else: - aligner_cmd += ["-1", read_fnames[0], - "-2", read_fnames[1]] - - out_base_fname = read_fnames[0].split('/')[-1].split('.')[0] - - print >> sys.stderr, "%s Aligning %s to %s ..." % (str(datetime.now()), ' '.join(read_fnames), base_fname) - if verbose: - print >> sys.stderr, "\t%s" % (' '.join(aligner_cmd)) - - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - unsorted_bam_fname = "%s_unsorted.bam" % out_base_fname - sambam_cmd = ["samtools", - "view", - "-bS", - "-"] - sambam_proc = subprocess.Popen(sambam_cmd, - stdin=align_proc.stdout, - stdout=open(unsorted_bam_fname, 'w')) - sambam_proc.communicate() - - # Increase the maximum number of files that can be opened - resource.setrlimit(resource.RLIMIT_NOFILE, (10000, 10240)) - - print >> sys.stderr, "%s Sorting %s ..." % (str(datetime.now()), unsorted_bam_fname) - bam_fname = "%s.bam" % out_base_fname - bamsort_cmd = ["samtools", - "sort", - "--threads", str(threads), - "-m", "1536M", - unsorted_bam_fname, - "-o", bam_fname] - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(bamsort_cmd) - bamsort_proc = subprocess.call(bamsort_cmd) - os.remove(unsorted_bam_fname) - - index_bam(bam_fname, - verbose) - - return bam_fname - - -""" -""" -def index_bam(bam_fname, - verbose): - print >> sys.stderr, "%s Indexing %s ..." % (str(datetime.now()), bam_fname) - bamindex_cmd = ["samtools", - "index", - bam_fname] - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(bamindex_cmd) - bamindex_proc = subprocess.call(bamindex_cmd) - - -""" -""" -def extract_reads(bam_fname, - chr, - left, - right, - read_base_fname, # sample => sample.1.fq.gz and sample.2.fq.gz - paired, - fastq, - verbose): - out_read_dname = "hisatgenotype_out" - if not os.path.exists(out_read_dname): - os.mkdir(out_read_dname) - - read_fnames = [] - if paired: - read_fnames = [out_read_dname + "/" + read_base_fname + ".1.fq.gz", - out_read_dname + "/" + read_base_fname + ".2.fq.gz"] - else: - read_fnames = [out_read_dname + "/" + read_base_fname + ".fq.gz"] - - if paired: - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open(read_fnames[0], 'w'), - stderr=open("/dev/null", 'w')) - - gzip2_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open(read_fnames[1], 'w'), - stderr=open("/dev/null", 'w')) - else: - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open(read_fnames[0], 'w'), - stderr=open("/dev/null", 'w')) - - def write_read(gzip_proc, read_name, seq, qual): - if fastq: - gzip_proc.stdin.write("@%s\n" % read_name) - gzip_proc.stdin.write("%s\n" % seq) - gzip_proc.stdin.write("+\n") - gzip_proc.stdin.write("%s\n" % qual) - else: - gzip_proc.stdin.write(">%s\n" % prev_read_name) - gzip_proc.stdin.write("%s\n" % seq) - - bamview_cmd = ["samtools", "view", bam_fname, "%s:%d-%d" % (chr, left+1, right+1)] - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(bamview_cmd) - bamview_proc = subprocess.Popen(bamview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - prev_read_name, extract_read, read1, read2 = "", False, [], [] - for line in alignview_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11] - flag, pos = int(flag), int(pos) - strand = '-' if flag & 0x10 else '+' - AS, NH = "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - elif col.startswith("NH"): - NH = int(col[5:]) - - # DK - check this out - simulation = True - if (not simulation and read_name != prev_read_name) or \ - (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]): - if extract_read: - if paired: - if len(read1) == 2 and len(read2) == 2: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - write_read(gzip2_proc, prev_read_name, read2[0], read2[1]) - else: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - prev_read_name, extract_read, read1, read2 = read_name, False, [], [] - - if NH == 1: - extract_read = True - - if flag & 0x40 or not paired: # left read - if not read1: - if flag & 0x10: # reverse complement - read1 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read1 = [read, qual] - else: - assert flag & 0x80 # right read - if flag & 0x10: # reverse complement - read2 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read2 = [read, qual] - - if extract_read: - if paired: - if len(read1) == 2 and len(read2) == 2: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - write_read(gzip2_proc, prev_read_name, read2[0], read2[1]) - else: - write_read(gzip1_proc, prev_read_name, read1[0], read1[1]) - - gzip1_proc.stdin.close() - if paired: - gzip2_proc.stdin.close() - - return read_fnames - - -""" -""" -def perform_genotyping(base_fname, - database, - locus_list, - read_fnames, - fastq, - num_editdist, - assembly, - local_database, - threads, - verbose): - genotype_cmd = ["hisatgenotype_locus.py"] - if not local_database: - genotype_cmd += ["--genotype-genome", base_fname] - genotype_cmd += ["--base", database] - if len(locus_list) > 0: - genotype_cmd += ["--locus-list", ','.join(locus_list)] - genotype_cmd += ["-p", str(threads), - "--num-editdist", str(num_editdist)] - if not fastq: - genotype_cmd += ["-f"] - - if len(read_fnames) == 2: # paired - genotype_cmd += ["-1", read_fnames[0], - "-2", read_fnames[1]] - elif len(read_fnames) == 1: - genotype_cmd += ["-U", read_fnames[0]] - else: - assert len(read_fnames) == 0 - - if assembly: - genotype_cmd += ["--assembly"] - - if verbose: - print >> sys.stderr, "\t%s" % ' '.join(genotype_cmd) - genotype_proc = subprocess.Popen(genotype_cmd) - genotype_proc.communicate() - - -""" -""" -def genotype(base_fname, - target_region_list, - fastq, - read_fnames, - alignment_fname, - threads, - num_editdist, - assembly, - local_database, - verbose, - debug): - # variants, backbone sequence, and other sequeces - genotype_fnames = ["%s.fa" % base_fname, - "%s.locus" % base_fname, - "%s.snp" % base_fname, - "%s.index.snp" % base_fname, - "%s.haplotype" % base_fname, - "%s.link" % base_fname, - "%s.coord" % base_fname, - "%s.clnsig" % base_fname] - # hisat2 graph index files - genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)] - if not typing_common.check_files(genotype_fnames): - print >> sys.stderr, "Error: some of the following files are missing!" - for fname in genotype_fnames: - print >> sys.stderr, "\t%s" % fname - sys.exit(1) - - # Read region alleles (names and sequences) - regions, region_loci = {}, {} - for line in open("%s.locus" % base_fname): - family, allele_name, chr, left, right = line.strip().split()[:5] - family = family.lower() - if len(target_region_list) > 0 and \ - family not in target_region_list: - continue - - locus_name = allele_name.split('*')[0] - if family in target_region_list and \ - len(target_region_list[family]) > 0 and \ - locus_name not in target_region_list[family]: - continue - - left, right = int(left), int(right) - if family not in region_loci: - region_loci[family] = [] - region_loci[family].append([locus_name, allele_name, chr, left, right]) - - if len(region_loci) <= 0: - print >> sys.stderr, "Warning: no region exists!" - sys.exit(1) - - # Align reads, and sort the alignments into a BAM file - if len(read_fnames) > 0: - alignment_fname = align_reads(base_fname, - read_fnames, - fastq, - threads, - verbose) - assert alignment_fname != "" and os.path.exists(alignment_fname) - if not os.path.exists(alignment_fname + ".bai"): - index_bam(alignment_fname, - verbose) - assert os.path.exists(alignment_fname + ".bai") - - # Extract reads and perform genotyping - for family, loci in region_loci.items(): - print >> sys.stderr, "Analyzing %s ..." % family.upper() - for locus_name, allele_name, chr, left, right in loci: - out_read_fname = "%s.%s" % (family, locus_name) - if verbose: - print >> sys.stderr, "\tExtracting reads beloning to %s-%s ..." % \ - (family, locus_name) - - extracted_read_fnames = extract_reads(alignment_fname, - chr, - left, - right, - out_read_fname, - len(read_fnames) != 1, # paired? - fastq, - verbose) - - perform_genotyping(base_fname, - family, - [locus_name], - extracted_read_fnames, - fastq, - num_editdist, - assembly, - local_database, - threads, - verbose) - print >> sys.stderr - - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='HISAT-genotype') - parser.add_argument("--base", "--base-name", - dest="base_fname", - type=str, - default="genotype_genome", - help="base filename for genotype genome") - parser.add_argument("--region-list", - dest="region_list", - type=str, - default="", - help="A comma-separated list of regions (default: empty)") - parser.add_argument("-f", "--fasta", - dest='fastq', - action='store_false', - help='FASTA file') - parser.add_argument("-U", - dest="read_fname_U", - type=str, - default="", - help="filename for single-end reads") - parser.add_argument("-1", - dest="read_fname_1", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("-2", - dest="read_fname_2", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("--alignment-file", - dest="alignment_fname", - type=str, - default="", - help="Sorted BAM alignment file name") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--num-editdist", - dest="num_editdist", - type=int, - default=2, - help="Maximum number of mismatches per read alignment to be considered (default: 2)") - parser.add_argument('--assembly', - dest='assembly', - action='store_true', - help='Perform assembly') - parser.add_argument('--local-database', - dest='local_database', - action='store_true', - help='Use local database') - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument("--debug", - dest="debug", - type=str, - default="", - help="e.g., test_id:10,read_id:10000,basic_test") - - args = parser.parse_args() - region_list = {} - if args.region_list != "": - for region in args.region_list.split(','): - region = region.split('.') - if len(region) < 1 or len(region) > 2: - print >> sys.stderr, "Error: --region-list is incorrectly formatted." - sys.exit(1) - - family = region[0].lower() - if len(region) == 2: - locus_name = region[1].upper() - if family not in region_list: - region_list[family] = set() - if len(region) == 2: - region_list[family].add(locus_name) - - read_fnames = [] - if args.alignment_fname != "": - if not os.path.exists(args.alignment_fname): - print >> sys.stderr, "Error: %s does not exist." % args.alignment_fname - elif args.read_fname_U != "": - read_fnames = [args.read_fname_U] - else: - if args.read_fname_1 == "" or args.read_fname_2 == "": - print >> sys.stderr, "Error: please specify read file names correctly: -U or -1 and -2" - sys.exit(1) - read_fnames = [args.read_fname_1, args.read_fname_2] - - debug = {} - if args.debug != "": - for item in args.debug.split(','): - if ':' in item: - key, value = item.split(':') - debug[key] = value - else: - debug[item] = 1 - - genotype(args.base_fname, - region_list, - args.fastq, - read_fnames, - args.alignment_fname, - args.threads, - args.num_editdist, - args.assembly, - args.local_database, - args.verbose, - debug) - - diff --git a/hisatgenotype_build_genome.py b/hisatgenotype_build_genome.py deleted file mode 100755 index 3d103d92..00000000 --- a/hisatgenotype_build_genome.py +++ /dev/null @@ -1,505 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2016, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import shutil -import inspect -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -""" -def read_clnsig(fname): - clnsig_dic = {} - for line in open(fname): - var_id, gene, clnsig = line.strip().split('\t') - clnsig_dic[var_id] = [gene, clnsig] - return clnsig_dic - - -""" -""" -def build_genotype_genome(base_fname, - inter_gap, - intra_gap, - threads, - database_list, - use_clinvar, - use_commonvar, - aligner, - graph_index, - verbose): - # Download HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index() - - # Load genomic sequences - chr_dic, chr_names, chr_full_names = typing_common.read_genome(open("genome.fa")) - - genotype_vars, genotype_haplotypes, genotype_clnsig = {}, {}, {} - if use_clinvar: - # Extract variants from the ClinVar database - CLINVAR_fnames = ["clinvar.vcf.gz", - "clinvar.snp", - "clinvar.haplotype", - "clinvar.clnsig"] - - if not typing_common.check_files(CLINVAR_fnames): - if not os.path.exists("clinvar.vcf.gz"): - os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive/2017/clinvar_20170404.vcf.gz") - assert os.path.exists("clinvar.vcf.gz") - - extract_cmd = ["hisat2_extract_snps_haplotypes_VCF.py"] - extract_cmd += ["--inter-gap", str(inter_gap), - "--intra-gap", str(intra_gap), - "--genotype-vcf", "clinvar.vcf.gz", - "genome.fa", "/dev/null", "clinvar"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not typing_common.check_files(CLINVAR_fnames): - print >> sys.stderr, "Error: extract variants from clinvar failed!" - sys.exit(1) - - # Read variants to be genotyped - genotype_vars = typing_common.read_variants("clinvar.snp") - - # Read haplotypes - genotype_haplotypes = typing_common.read_haplotypes("clinvar.haplotype") - - # Read information about clinical significance - genotype_clnsig = typing_common.read_clnsig("clinvar.clnsig") - - if use_commonvar: - # Extract variants from dbSNP database - commonvar_fbase = "snp144Common" - commonvar_fnames = ["%s.snp" % commonvar_fbase, - "%s.haplotype" % commonvar_fbase] - if not typing_common.check_files(commonvar_fnames): - if not os.path.exists("%s.txt.gz" % commonvar_fbase): - os.system("wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/%s.txt.gz" % commonvar_fbase) - assert os.path.exists("%s.txt.gz" % commonvar_fbase) - os.system("gzip -cd %s.txt.gz | awk 'BEGIN{OFS=\"\t\"} {if($2 ~ /^chr/) {$2 = substr($2, 4)}; if($2 == \"M\") {$2 = \"MT\"} print}' > %s.txt" % (commonvar_fbase, commonvar_fbase)) - extract_cmd = ["hisat2_extract_snps_haplotypes_UCSC.py", - "--inter-gap", str(inter_gap), - "--intra-gap", str(intra_gap), - "genome.fa", "%s.txt" % commonvar_fbase, commonvar_fbase] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not typing_common.check_files(commonvar_fnames): - print >> sys.stderr, "Error: extract variants from clinvar failed!" - sys.exit(1) - - # Read variants to be genotyped - genotype_vars = typing_common.read_variants("%s.snp" % commonvar_fbase) - - # Read haplotypes - genotype_haplotypes = typing_common.read_haplotypes("%s.haplotype" % commonvar_fbase) - - # Genes to be genotyped - genotype_genes = {} - - # Read genes or genomics regions - for database_name in database_list: - # Extract HLA variants, backbone sequence, and other sequeces - typing_common.extract_database_if_not_exists(database_name, - [], # locus_list - inter_gap, - intra_gap, - True, # partial? - verbose) - locus_fname = "%s.locus" % database_name - assert os.path.exists(locus_fname) - for line in open(locus_fname): - locus_name, chr, left, right, length, exon_str, strand = line.strip().split() - left, right = int(left), int(right) - length = int(length) - if chr not in chr_names: - continue - if chr not in genotype_genes: - genotype_genes[chr] = [] - genotype_genes[chr].append([left, right, length, locus_name, database_name, exon_str, strand]) - - # Write genotype genome - var_num, haplotype_num = 0, 0 - genome_out_file = open("%s.fa" % base_fname, 'w') - locus_out_file = open("%s.locus" % base_fname, 'w') - var_out_file = open("%s.snp" % base_fname, 'w') - index_var_out_file = open("%s.index.snp" % base_fname, 'w') - haplotype_out_file = open("%s.haplotype" % base_fname, 'w') - link_out_file = open("%s.link" % base_fname, 'w') - coord_out_file = open("%s.coord" % base_fname, 'w') - clnsig_out_file = open("%s.clnsig" % base_fname, 'w') - for c in range(len(chr_names)): - chr = chr_names[c] - chr_full_name = chr_full_names[c] - assert chr in chr_dic - chr_seq = chr_dic[chr] - chr_len = len(chr_seq) - if chr in genotype_genes: - chr_genes = genotype_genes[chr] - def gene_cmp(a, b): - a_left, a_right, a_length = a[:3] - b_left, b_right, b_length = b[:3] - if a_left != b_left: - return a_left - b_left - if a_right != b_right: - return a_right - b_right - return a_lenght - b_length - chr_genes = sorted(chr_genes, cmp=gene_cmp) - else: - chr_genes = [] - - chr_genotype_vars, chr_genotype_vari = [], 0 - if graph_index: - if chr in genotype_vars: - chr_genotype_vars = genotype_vars[chr] - chr_genotype_haplotypes, chr_genotype_hti = [], 0 - if chr in genotype_haplotypes: - chr_genotype_haplotypes = genotype_haplotypes[chr] - - def add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num): - # Output variants with clinical significance - while chr_genotype_vari < len(chr_genotype_vars): - var_left, var_type, var_data, var_id = chr_genotype_vars[chr_genotype_vari] - var_right = var_left - if var_type == "deletion": - var_right += var_data - if var_right > right: - break - if var_right >= left: - chr_genotype_vari += 1 - continue - - out_str = "%s\t%s\t%s\t%d\t%s" % (var_id, var_type, chr, var_left + off, var_data) - print >> var_out_file, out_str - print >> index_var_out_file, out_str - - if var_id in genotype_clnsig: - var_gene, clnsig = genotype_clnsig[var_id] - print >> clnsig_out_file, "%s\t%s\t%s" % \ - (var_id, var_gene, clnsig) - - chr_genotype_vari += 1 - - # Output haplotypes - while chr_genotype_hti < len(chr_genotype_haplotypes): - ht_left, ht_right, ht_vars = chr_genotype_haplotypes[chr_genotype_hti] - if ht_right > right: - break - if ht_right >= left: - chr_genotype_hti += 1 - continue - - print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (haplotype_num, chr, ht_left + off, ht_right + off, ','.join(ht_vars)) - chr_genotype_hti += 1 - haplotype_num += 1 - - return chr_genotype_vari, chr_genotype_hti, haplotype_num - - out_chr_seq = "" - - off = 0 - prev_right = 0 - for gene in chr_genes: - left, right, length, name, family, exon_str, strand = gene - - if not graph_index: - # Output gene (genotype_genome.gene) - print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \ - (family.upper(), name, chr, left, right, exon_str, strand) - continue - - chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num) - - # Read HLA backbone sequences - allele_seqs = typing_common.read_allele_sequences("%s_backbone.fa" % family) - - # Read HLA variants - allele_vars = typing_common.read_variants("%s.snp" % family) - allele_index_vars = typing_common.read_variants("%s.index.snp" % family) - - # Read HLA haplotypes - allele_haplotypes = typing_common.read_haplotypes("%s.haplotype" % family) - - # Read HLA link information between haplotypes and variants - links = typing_common.read_links("%s.link" % family) - - if name not in allele_seqs: - continue - if name not in allele_vars or name not in allele_index_vars: - vars, index_vars = [], [] - else: - vars, index_vars = allele_vars[name], allele_index_vars[name] - - allele_seq = allele_seqs[name] - index_var_ids = set() - for _, _, _, var_id in index_vars: - index_var_ids.add(var_id) - - if name not in allele_haplotypes: - haplotypes = [] - else: - haplotypes = allele_haplotypes[name] - assert length == len(allele_seq) - assert left < chr_len and right < chr_len - # Skipping overlapping genes - if left < prev_right: - print >> sys.stderr, "Warning: skipping %s ..." % (name) - continue - - varID2htID = {} - - assert left < right - prev_length = right - left + 1 - assert prev_length <= length - - if prev_right < left: - out_chr_seq += chr_seq[prev_right:left] - - # Output gene (genotype_genome.gene) - print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \ - (family.upper(), name, chr, len(out_chr_seq), len(out_chr_seq) + length - 1, exon_str, strand) - - # Output coord (genotype_genome.coord) - print >> coord_out_file, "%s\t%d\t%d\t%d" % \ - (chr, len(out_chr_seq), left, right - left + 1) - out_chr_seq += allele_seq - - # Output variants (genotype_genome.snp and genotype_genome.index.snp) - for var in vars: - var_left, var_type, var_data, var_id = var - new_var_id = "hv%d" % var_num - varID2htID[var_id] = new_var_id - new_var_left = var_left + left + off - assert var_type in ["single", "deletion", "insertion"] - assert new_var_left < len(out_chr_seq) - if var_type == "single": - assert out_chr_seq[new_var_left] != var_data - elif var_type == "deletion": - assert new_var_left + var_data <= len(out_chr_seq) - else: - assert var_type == "insertion" - - out_str = "%s\t%s\t%s\t%d\t%s" % (new_var_id, var_type, chr, new_var_left, var_data) - print >> var_out_file, out_str - if var_id in index_var_ids: - print >> index_var_out_file, out_str - var_num += 1 - - # Output haplotypes (genotype_genome.haplotype) - for haplotype in haplotypes: - ht_left, ht_right, ht_vars = haplotype - new_ht_left = ht_left + left + off - assert new_ht_left < len(out_chr_seq) - new_ht_right = ht_right + left + off - assert new_ht_left <= new_ht_right - assert new_ht_right <= len(out_chr_seq) - new_ht_vars = [] - for var_id in ht_vars: - assert var_id in varID2htID - new_ht_vars.append(varID2htID[var_id]) - print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (haplotype_num, chr, new_ht_left, new_ht_right, ','.join(new_ht_vars)) - haplotype_num += 1 - - # Output link information between alleles and variants (genotype_genome.link) - for link in links: - var_id, allele_names = link - if var_id not in varID2htID: - continue - new_var_id = varID2htID[var_id] - print >> link_out_file, "%s\t%s" % (new_var_id, allele_names) - - off += (length - prev_length) - - prev_right = right + 1 - - if not graph_index: - continue - - # Write the rest of the Vars - chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(sys.maxint, sys.maxint, chr_genotype_vari, chr_genotype_hti, haplotype_num) - - print >> coord_out_file, "%s\t%d\t%d\t%d" % \ - (chr, len(out_chr_seq), prev_right, len(chr_seq) - prev_right) - out_chr_seq += chr_seq[prev_right:] - - assert len(out_chr_seq) == len(chr_seq) + off - - # Output chromosome sequence - print >> genome_out_file, ">%s" % (chr_full_name) - line_width = 60 - for s in range(0, len(out_chr_seq), line_width): - print >> genome_out_file, out_chr_seq[s:s+line_width] - - genome_out_file.close() - locus_out_file.close() - var_out_file.close() - index_var_out_file.close() - haplotype_out_file.close() - link_out_file.close() - coord_out_file.close() - clnsig_out_file.close() - - allele_out_file = open("%s.allele" % base_fname, 'w') - if graph_index: - for database in database_list: - for line in open("%s.allele" % database): - allele_name = line.strip() - print >> allele_out_file, "%s\t%s" % (database.upper(), allele_name) - allele_out_file.close() - - partial_out_file = open("%s.partial" % base_fname, 'w') - if graph_index: - for database in database_list: - for line in open("%s.partial" % database): - allele_name = line.strip() - print >> partial_out_file, "%s\t%s" % (database.upper(), allele_name) - partial_out_file.close() - - if not graph_index: - shutil.copyfile("genome.fa", "%s.fa" % base_fname) - - # Index genotype_genome.fa - index_cmd = ["samtools", "faidx", "%s.fa" % base_fname] - subprocess.call(index_cmd) - - # Build indexes based on the above information - if graph_index: - assert aligner == "hisat2" - build_cmd = ["hisat2-build", - "-p", str(threads), - "--snp", "%s.index.snp" % base_fname, - "--haplotype", "%s.haplotype" % base_fname, - "%s.fa" % base_fname, - "%s" % base_fname] - else: - assert aligner in ["hisat2", "bowtie2"] - build_cmd = ["%s-build" % aligner, - "-p" if aligner == "hisat2" else "--threads", str(threads), - "%s.fa" % base_fname, - "%s" % base_fname] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) - - subprocess.call(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - - if aligner == "hisat2": - index_fnames = ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)] - else: - index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)] - index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)] - if not typing_common.check_files(index_fnames): - print >> sys.stderr, "Error: indexing failed! Perhaps, you may have forgotten to build %s executables?" % aligner - sys.exit(1) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Build genotype genome") - parser.add_argument("--base", "--base-fname", - dest="base_fname", - type=str, - default="genotype_genome", - help="base filename for genotype genome (default: genotype_genome)") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--database-list", - dest="database_list", - type=str, - default="", - help="A comma-separated list of databases (default: hla,codis,cyp)") - parser.add_argument("--commonvar", - dest="use_commonvar", - action="store_true", - help="Include common variants from dbSNP") - parser.add_argument("--clinvar", - dest="use_clinvar", - action="store_true", - help="Include variants from ClinVar database") - parser.add_argument("--inter-gap", - dest="inter_gap", - type=int, - default=30, - help="Maximum distance for variants to be in the same haplotype") - parser.add_argument("--intra-gap", - dest="intra_gap", - type=int, - default=50, - help="Break a haplotype into several haplotypes") - parser.add_argument("--aligner", - dest="aligner", - type=str, - default="hisat2", - help="Aligner (default: hisat2)") - parser.add_argument("--linear-index", - dest="graph_index", - action="store_false", - help="Build linear index") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.inter_gap > args.intra_gap: - print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap) - sys.exit(1) - - if args.database_list == "": - database_list = [] - else: - database_list = args.database_list.split(',') - - if args.use_clinvar and args.use_commonvar: - print >> sys.stderr, "Error: both --clinvar and --commonvar cannot be used together." - sys.exit(1) - - if args.aligner not in ["hisat2", "bowtie2"]: - print >> sys.stderr, "Error: --aligner should be either hisat2 or bowtie2." - sys.exit(1) - - build_genotype_genome(args.base_fname, - args.inter_gap, - args.intra_gap, - args.threads, - database_list, - args.use_clinvar, - args.use_commonvar, - args.aligner, - args.graph_index, - args.verbose) - diff --git a/hisatgenotype_extract_reads.py b/hisatgenotype_extract_reads.py deleted file mode 100755 index 98215655..00000000 --- a/hisatgenotype_extract_reads.py +++ /dev/null @@ -1,541 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re, resource -import inspect -import random -import glob -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -""" -def parallel_work(pids, - work, - fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose): - child = -1 - for i in range(len(pids)): - if pids[i] == 0: - child = i - break - - while child == -1: - status = os.waitpid(0, 0) - for i in range(len(pids)): - if status[0] == pids[i]: - child = i - pids[i] = 0 - break - - child_id = os.fork() - if child_id == 0: - work(fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose) - os._exit(os.EX_OK) - else: - # print >> sys.stderr, '\t\t>> thread %d: %d' % (child, child_id) - pids[child] = child_id - - -""" -""" -def wait_pids(pids): - for pid in pids: - if pid > 0: - os.waitpid(pid, 0) - - -""" -""" -def extract_reads(base_fname, - database_list, - read_dir, - out_dir, - suffix, - read_fname, - fastq, - paired, - simulation, - threads, - threads_aprocess, - max_sample, - job_range, - aligner, - block_size, - verbose): - if block_size > 0: - resource.setrlimit(resource.RLIMIT_NOFILE, (1000, 1000)) - resource.setrlimit(resource.RLIMIT_NPROC, (1000, 1000)) - - genotype_fnames = ["%s.fa" % base_fname, - "%s.locus" % base_fname, - "%s.snp" % base_fname, - "%s.haplotype" % base_fname, - "%s.link" % base_fname, - "%s.coord" % base_fname, - "%s.clnsig" % base_fname] - # graph index files - if aligner == "hisat2": - genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)] - else: - assert aligner == "bowtie2" - genotype_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)] - genotype_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)] - - if not typing_common.check_files(genotype_fnames): - print >> sys.stderr, "Error: %s related files do not exist as follows:" % base_fname - for fname in genotype_fnames: - print >> sys.stderr, "\t%s" % fname - sys.exit(1) - - filter_region = len(database_list) > 0 - ranges = [] - regions, region_loci = {}, {} - for line in open("%s.locus" % base_fname): - family, allele_name, chr, left, right = line.strip().split()[:5] - if filter_region and family.lower() not in database_list: - continue - region_name = "%s-%s" % (family, allele_name.split('*')[0]) - assert region_name not in regions - regions[region_name] = allele_name - left, right = int(left), int(right) - """ - exons = [] - for exon in exon_str.split(','): - exon_left, exon_right = exon.split('-') - exons.append([int(exon_left), int(exon_right)]) - """ - if chr not in region_loci: - region_loci[chr] = {} - region_loci[chr][region_name] = [allele_name, chr, left, right] - database_list.add(family.lower()) - - if out_dir != "" and not os.path.exists(out_dir): - os.mkdir(out_dir) - - # Extract reads - if len(read_fname) > 0: - if paired: - fq_fnames = [read_fname[0]] - fq_fnames2 = [read_fname[1]] - else: - fq_fnames = read_fname - else: - if paired: - fq_fnames = glob.glob("%s/*.1.%s" % (read_dir, suffix)) - else: - fq_fnames = glob.glob("%s/*.%s" % (read_dir, suffix)) - count = 0 - pids = [0 for i in range(threads)] - for file_i in range(len(fq_fnames)): - if file_i >= max_sample: - break - fq_fname = fq_fnames[file_i] - if job_range[1] > 1: - if job_range[0] != (file_i % job_range[1]): - continue - - fq_fname_base = fq_fname.split('/')[-1] - one_suffix = ".1." + suffix - if fq_fname_base.find(one_suffix) != -1: - fq_fname_base = fq_fname_base[:fq_fname_base.find(one_suffix)] - else: - fq_fname_base = fq_fname_base.split('.')[0] - - if paired: - if read_dir == "": - fq_fname2 = fq_fnames2[file_i] - else: - fq_fname2 = "%s/%s.2.%s" % (read_dir, fq_fname_base, suffix) - if not os.path.exists(fq_fname2): - print >> sys.stderr, "%s does not exist." % fq_fname2 - continue - else: - fq_fname2 = "" - - if paired: - if out_dir != "": - if os.path.exists("%s/%s.extracted.1.fq.gz" % (out_dir, fq_fname_base)): - continue - else: - if out_dir != "": - if os.path.exists("%s/%s.extracted.fq.gz" % (out_dir, fq_fname_base)): - continue - count += 1 - - print >> sys.stderr, "\t%d: Extracting reads from %s" % (count, fq_fname_base) - def work(fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose): - aligner_cmd = [aligner] - if threads_aprocess > 1: - aligner_cmd += ["-p", "%d" % threads_aprocess] - if not fastq: - aligner_cmd += ["-f"] - aligner_cmd += ["-x", base_fname] - if aligner == "hisat2": - aligner_cmd += ["--no-spliced-alignment"] - # aligner_cmd += ["--max-altstried", "64"] - aligner_cmd += ["-X", "1000"] - if paired: - aligner_cmd += ["-1", fq_fname, - "-2", fq_fname2] - else: - aligner_cmd += ["-U", fq_fname] - if verbose: - print >> sys.stderr, "\t\trunning", ' '.join(aligner_cmd) - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - gzip_dic = {} - out_dir_slash = out_dir - if out_dir != "": - out_dir_slash += "/" - for database in database_list: - if paired: - # LP6005041-DNA_A01.extracted.1.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'), - stderr=open("/dev/null", 'w')) - - # LP6005041-DNA_A01.extracted.2.fq.gz - gzip2_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'), - stderr=open("/dev/null", 'w')) - else: - # LP6005041-DNA_A01.extracted.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.extracted.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'), - stderr=open("/dev/null", 'w')) - gzip_dic[database] = [gzip1_proc, gzip2_proc if paired else None] - - whole_gzip_dic = {} - if block_size > 0: - mult = block_size / 1000000 - for chr_line in open("%s.fa.fai" % base_fname): - chr, length = chr_line.strip().split('\t')[:2] - length = int(length) - if chr not in [str(i+1) for i in range(22)] + ['X', 'Y', 'MT']: - continue - length = (length + block_size - 1) / block_size - assert chr not in whole_gzip_dic - whole_gzip_dic[chr] = [] - for region_i in range(length): - if paired: - # LP6005041-DNA_A01.extracted.1.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.%d_%dM.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'), - stderr=open("/dev/null", 'w')) - - # LP6005041-DNA_A01.extracted.2.fq.gz - gzip2_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.%d_%dM.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'), - stderr=open("/dev/null", 'w')) - else: - # LP6005041-DNA_A01.extracted.fq.gz - gzip1_proc = subprocess.Popen(["gzip"], - stdin=subprocess.PIPE, - stdout=open("%s%s.%s.%d_%dM.extracted.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'), - stderr=open("/dev/null", 'w')) - whole_gzip_dic[chr].append([gzip1_proc, gzip2_proc if paired else None]) - - - def write_read(gzip_proc, read_name, seq, qual): - if fastq: - gzip_proc.stdin.write("@%s\n" % read_name) - gzip_proc.stdin.write("%s\n" % seq) - gzip_proc.stdin.write("+\n") - gzip_proc.stdin.write("%s\n" % qual) - else: - gzip_proc.stdin.write(">%s\n" % prev_read_name) - gzip_proc.stdin.write("%s\n" % seq) - - prev_read_name, extract_read, whole_extract_read, read1, read2, read1_first, read2_first = "", set(), set(), [], [], True, True - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11] - flag, pos = int(flag), int(pos) - 1 - strand = '-' if flag & 0x10 else '+' - AS, XS, NH = "", "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - elif col.startswith("XS"): - XS = int(col[5:]) - elif col.startswith("NH"): - NH = int(col[5:]) - - if (not simulation and read_name != prev_read_name) or \ - (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]): - for region in extract_read: - write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1]) - - for chr_region_num in whole_extract_read: - region_chr, region_num = chr_region_num.split('-') - region_num = int(region_num) - if region_chr not in whole_gzip_dic: - continue - - assert region_num < len(whole_gzip_dic[region_chr]) - write_read(whole_gzip_dic[region_chr][region_num][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(whole_gzip_dic[region_chr][region_num][1], prev_read_name, read2[0], read2[1]) - - prev_read_name, extract_read, whole_extract_read, read1, read2, read1_first, read2_first = read_name, set(), set(), [], [], True, True - - if flag & 0x4 == 0 and \ - ((aligner == "hisat2" and NH == 1) or (aligner == "bowtie2" and AS > XS and read1_first if flag & 0x40 or not paired else read2_first)): - if chr in region_loci: - for region, loci in region_loci[chr].items(): - region = region.split('-')[0].lower() - _, _, loci_left, loci_right = loci - # there might be a different candidate region for each of left and right reads - if pos >= loci_left and pos < loci_right: - extract_read.add(region) - break - if block_size > 0: - chr_region_num = "%s-%d" % (chr, pos / block_size) - whole_extract_read.add(chr_region_num) - - if flag & 0x40 or not paired: # left read - read1_first = False - if not read1: - if flag & 0x10: # reverse complement - read1 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read1 = [read, qual] - else: - assert flag & 0x80 # right read - read2_first = False - if flag & 0x10: # reverse complement - read2 = [typing_common.reverse_complement(read), qual[::-1]] - else: - read2 = [read, qual] - - for region in extract_read: - write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1]) - - for chr_region_num in whole_extract_read: - region_chr, region_num = chr_region_num.split('-') - region_num = int(region_num) - if region_chr not in whole_gzip_dic: - continue - assert region_num < len(whole_gzip_dic[region_chr]) - write_read(whole_gzip_dic[region_chr][region_num][0], prev_read_name, read1[0], read1[1]) - if paired: - write_read(whole_gzip_dic[region_chr][region_num][1], prev_read_name, read2[0], read2[1]) - - for gzip1_proc, gzip2_proc in gzip_dic.values(): - gzip1_proc.stdin.close() - if paired: - gzip2_proc.stdin.close() - - for gzip_list in whole_gzip_dic.values(): - for gzip1_proc, gzip2_proc in gzip_list: - gzip1_proc.stdin.close() - if paired: - gzip2_proc.stdin.close() - - - if threads <= 1: - work(fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose) - else: - parallel_work(pids, - work, - fq_fname_base, - fq_fname, - fq_fname2, - ranges, - simulation, - verbose) - - if threads > 1: - wait_pids(pids) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='Extract reads') - parser.add_argument("--base", "--base-fname", - dest="base_fname", - type=str, - default="genotype_genome", - help="base filename for genotype genome") - parser.add_argument("--read-dir", - dest="read_dir", - type=str, - default="", - help="Directory name for read files") - parser.add_argument("--out-dir", - dest="out_dir", - type=str, - default="", - help="Directory name for extracted read files") - parser.add_argument("--suffix", - dest="suffix", - type=str, - default="fq.gz", - help="Read file suffix (Default: fq.gz)") - parser.add_argument('-f', '--fasta', - dest='fastq', - action='store_false', - help='FASTA format') - parser.add_argument("-U", - dest="read_fname_U", - type=str, - default="", - help="filename for single-end reads") - parser.add_argument("-1", - dest="read_fname_1", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("-2", - dest="read_fname_2", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("--database-list", - dest="database_list", - type=str, - default="", - help="A comma-separated list of database (default: empty)") - parser.add_argument('--simulation', - dest='simulation', - action='store_true', - help='Simulated reads (Default: False)') - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--pp", "--threads-aprocess", - dest="threads_aprocess", - type=int, - default=1, - help="Number of threads a process") - parser.add_argument("--max-sample", - dest="max_sample", - type=int, - default=sys.maxint, - help="Number of samples to be extracted (default: sys.maxint)") - parser.add_argument("--job-range", - dest="job_range", - type=str, - default="0,1", - help="two numbers (e.g. 1,3)") - parser.add_argument("--aligner", - dest="aligner", - type=str, - default="hisat2", - help="Aligner (default: hisat2)") - parser.add_argument("--extract-whole", - dest="extract_whole", - action='store_true', - help="Extract all reads") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - - args = parser.parse_args() - - database_list = set() - if args.database_list != "": - for region in args.database_list.split(','): - database_list.add(region) - if args.read_fname_U != "": - args.read_fname = [args.read_fname_U] - elif args.read_fname_1 != "" or args.read_fname_2 != "": - if args.read_fname_1 == "" or args.read_fname_2 == "": - print >> sys.stderr, "Error: please specify both -1 and -2." - sys.exit(1) - args.read_fname = [args.read_fname_1, args.read_fname_2] - else: - args.read_fname = [] - if len(args.read_fname) == 0: - if args.read_dir == "" or not os.path.exists(args.read_dir): - print >> sys.stderr, "Error: please specify --read-dir with an existing directory." - sys.exit(1) - if args.out_dir == "": - print >> sys.stderr, "Error: please specify --out-dir with a directory name." - sys.exit(1) - job_range = [] - for num in args.job_range.split(','): - job_range.append(int(num)) - - if args.aligner not in ["hisat2", "bowtie2"]: - print >> sys.stderr, "Error: --aligner should be either hisat2 or bowtie2." - sys.exit(1) - block_size = 20000000 if args.extract_whole else 0 - - extract_reads(args.base_fname, - database_list, - args.read_dir, - args.out_dir, - args.suffix, - args.read_fname, - args.fastq, - False if args.read_fname_U != "" else True, - args.simulation, - args.threads, - args.threads_aprocess, - args.max_sample, - job_range, - args.aligner, - block_size, - args.verbose) - diff --git a/hisatgenotype_extract_vars.py b/hisatgenotype_extract_vars.py deleted file mode 100755 index 4c673177..00000000 --- a/hisatgenotype_extract_vars.py +++ /dev/null @@ -1,1299 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2015, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import inspect -import glob -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -""" -Mapping from base pair to a location in MSF format -""" -def create_map(seq): - seq_map = {} - count = 0 - for i in range(len(seq)): - bp = seq[i] - if bp == '.': - continue - assert bp in "ACGT" - seq_map[count] = i - count += 1 - return seq_map - - -""" -""" -def create_consensus_seq(seqs, - seq_len, - min_var_freq, - remove_empty = True): - consensus_freq = [[0, 0, 0, 0, 0] for i in range(seq_len)] - for i in range(len(seqs)): - seq = seqs[i] - if len(seq) != seq_len: - continue - for j in range(seq_len): - nt = seq[j] - assert nt in "ACGT.E" - if nt == 'A': - consensus_freq[j][0] += 1 - elif nt == 'C': - consensus_freq[j][1] += 1 - elif nt == 'G': - consensus_freq[j][2] += 1 - elif nt == 'T': - consensus_freq[j][3] += 1 - else: - assert nt in ".E" - consensus_freq[j][4] += 1 - - for j in range(len(consensus_freq)): - for k in range(len(consensus_freq[j])): - consensus_freq[j][k] /= float(len(seqs)) - consensus_freq[j][k] *= 100.0 - - consensus_seq = "" - has_empty = False - for c in range(len(consensus_freq)): - freq = consensus_freq[c] - A, C, G, T, E = freq - # No alleles have bases at this particular location - if E >= 100.0: - has_empty = True - consensus_seq += 'E' - continue - if E >= 100.0 - min_var_freq: - idx = 4 - else: - idx = freq.index(max(freq[:4])) - assert idx < 5 - consensus_seq += "ACGT."[idx] - consensus_seq = ''.join(consensus_seq) - - # Remove dots (deletions) - skip_pos = set() - if has_empty and remove_empty: - for seq_i in range(len(seqs)): - seqs[seq_i] = list(seqs[seq_i]) - for i in range(len(consensus_seq)): - if consensus_seq[i] != 'E': - continue - skip_pos.add(i) - for seq_i in range(len(seqs)): - if i >= len(seqs[seq_i]): - continue - seqs[seq_i][i] = 'E' - for seq_i in range(len(seqs)): - seqs[seq_i] = ''.join(seqs[seq_i]) - seqs[seq_i] = seqs[seq_i].replace('E', '') - consensus_seq = consensus_seq.replace('E', '') - - # Convert a list form of consensus_freq to a dictionary form - temp_freq = [] - for j in range(len(consensus_freq)): - if j in skip_pos: - continue - freq_dic = {} - for k in range(len(consensus_freq[j])): - freq = consensus_freq[j][k] - if freq <= 0.0: - continue - nt = "ACGT."[k] - freq_dic[nt] = freq - temp_freq.append(freq_dic) - consensus_freq = temp_freq - - assert len(consensus_seq) == len(consensus_freq) - return consensus_seq, consensus_freq - - - -""" -Left-shift deletions if poissble -""" -def leftshift_deletions(backbone_seq, seq, debug = False): - if len(seq) != len(backbone_seq): - return seq - seq = list(seq) - seq_len = len(seq) - bp_i = 0 - # Skip the first deletion - while bp_i < seq_len: - if seq[bp_i] in "ACGT": - break - bp_i += 1 - - while bp_i < seq_len: - bp = seq[bp_i] - if bp != '.': - bp_i += 1 - continue - bp_j = bp_i + 1 - while bp_j < seq_len: - bp2 = seq[bp_j] - if bp2 != '.': - break - else: - bp_j += 1 - - if bp_j >= seq_len: - bp_i = bp_j - break - - if debug: - print >> sys.stderr, bp_i, bp_j, backbone_seq[bp_i-10:bp_i], backbone_seq[bp_i:bp_j], backbone_seq[bp_j:bp_j+10] - print >> sys.stderr, bp_i, bp_j, ''.join(seq[bp_i-10:bp_i]), ''.join(seq[bp_i:bp_j]), ''.join(seq[bp_j:bp_j+10]) - prev_i, prev_j = bp_i, bp_j - - while bp_i > 0 and seq[bp_i-1] in "ACGT" and backbone_seq[bp_j-1] in "ACGT": - if seq[bp_i-1] != backbone_seq[bp_j-1]: - break - seq[bp_j-1] = seq[bp_i-1] - seq[bp_i-1] = '.' - bp_i -= 1 - bp_j -= 1 - bp_i = bp_j - while bp_i < seq_len: - if seq[bp_i] in "ACGT": - break - bp_i += 1 - - # DK - debugging purposes - if debug: - print prev_i, prev_j, ''.join(seq[prev_i-10:prev_i]), ''.join(seq[prev_i:prev_j]), ''.join(seq[prev_j:prev_j+10]) - - return ''.join(seq) - - -""" -""" -def extract_vars(base_fname, - base_dname, - locus_list, - inter_gap, - intra_gap, - whole_haplotype, - min_var_freq, - ext_seq_len, - leftshift, - partial, - verbose): - base_fullpath_name = base_fname - if base_dname != "" and not os.path.exists(base_dname): - os.mkdir(base_dname) - base_fullpath_name = "%s/%s" % (base_dname, base_fname) - - # Download human genome and HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index() - - # Corresponding genomic loci found by HISAT2 (reference is GRCh38) - # e.g. hisat2 --no-unal --score-min C,0 -x grch38/genome -f hisatgenotype_db/HLA/fasta/A_gen.fasta - locus_file = open(base_fullpath_name + ".locus", 'w') - left_ext_seq_dic, right_ext_seq_dic = {}, {} - genes, gene_strand = {}, {} - - # Clone a git repository, hisatgenotype_db - if not os.path.exists("hisatgenotype_db"): - typing_common.clone_hisatgenotype_database() - fasta_dname = "hisatgenotype_db/%s/fasta" % base_fname.upper() - - # Check HLA genes - gene_names = [] - if base_fname == "hla": - fasta_fnames = glob.glob("%s/*_gen.fasta" % fasta_dname) - else: - assert base_fname in ["codis", "cyp"] - fasta_fnames = glob.glob("%s/*.fasta" % fasta_dname) - for gen_fname in fasta_fnames: - gene_name = gen_fname.split('/')[-1].split('_')[0] - if gene_name == "hla": - continue - gene_names.append(gene_name) - - if locus_list == []: - locus_list = gene_names - - cigar_re = re.compile('\d+\w') - remove_locus_list = [] - for gene in locus_list: - aligner_cmd = ["hisat2"] - if base_fname in ["hla", "coids"]: - aligner_cmd += ["--score-min", "C,0"] - aligner_cmd += ["--no-unal", - "-x", "grch38/genome", - "-f", "%s/%s_gen.fasta" % (fasta_dname, gene)] - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - allele_id = "" - best_chr, best_left, best_right, best_AS, best_strand = "", -1, -1, -sys.maxint, '' - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - temp_allele_id, flag, chr, left, _, cigar_str = cols[:6] - left = int(left) - 1 - right = left - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - if len(cigars) > 1 or cigars[0][0] != 'M': - continue - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MND": - right += length - - flag = int(flag) - strand = '-' if flag & 0x10 else '+' - AS = "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = col[5:] - assert AS != "" - AS = int(AS) - if AS > best_AS: - allele_id = temp_allele_id - best_chr, best_left, best_right, best_AS, best_strand = chr, left, right, AS, strand - - chr, left, right, strand = best_chr, best_left, best_right, best_strand - align_proc.communicate() - if allele_id == "": - remove_locus_list.append(gene) - continue - if base_fname == "hla": - allele_name = "" - for line in open("%s/%s_gen.fasta" % (fasta_dname, gene)): - line = line.strip() - if not line.startswith('>'): - continue - tmp_allele_id, tmp_allele_name = line[1:].split()[:2] - if allele_id == tmp_allele_id: - allele_name = tmp_allele_name - break - else: - allele_name = allele_id - assert allele_name != "" and strand != '' - genes[gene] = allele_name - gene_strand[gene] = strand - print >> sys.stderr, "%s-%s's reference allele is %s on '%s' strand of chromosome %s" % \ - (base_fname.upper(), gene, allele_name, strand, chr) - - assert chr != "" and left >= 0 and right > left - if ext_seq_len > 0: - left_ext_seq, right_ext_seq = "", "" - left1, left2 = max(1, left - ext_seq_len), max(1, left - 1) - if left2 > 0: - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, left1, left2)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - left_ext_seq += line - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + ext_seq_len - 1)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - right_ext_seq += line - - if strand == '-': - left_ext_seq, right_ext_seq = typing_common.reverse_complement(right_ext_seq), typing_common.reverse_complement(left_ext_seq) - left_ext_seq_dic[gene], right_ext_seq_dic[gene] = left_ext_seq, right_ext_seq - - - # Extract exon information from hla.data - gene_exons, gene_exon_counts = {}, {} - if base_fname == "hla": - skip, look_exon_num = False, False - for line in open("hisatgenotype_db/%s/hla.dat" % base_fname.upper()): - if line.startswith("DE"): - allele_name = line.split()[1][:-1] - if allele_name.startswith("HLA-"): - allele_name = allele_name[4:] - gene = allele_name.split('*')[0] - if not gene in genes: - skip = True - else: - skip = False - if skip: - continue - if not line.startswith("FT"): - continue - - if line.find("exon") != -1: - look_exon_num = True - if allele_name == genes[gene]: - exon_range = line.split()[2].split("..") - exon_left, exon_right = int(exon_range[0]) - 1, int(exon_range[1]) - 1 - assert exon_left >= 0 - assert exon_left < exon_right - if not gene in gene_exons: - gene_exons[gene] = [] - if gene in left_ext_seq_dic: - left_ext_seq_len = len(left_ext_seq_dic[gene]) - else: - left_ext_seq_len = 0 - gene_exons[gene].append([exon_left + left_ext_seq_len, exon_right + left_ext_seq_len]) - elif look_exon_num: - assert line.find("number") - look_exon_num = False - num = line.strip().split("number=")[1] - num = int(num[1:-1]) - 1 - if gene not in gene_exon_counts: - gene_exon_counts[gene] = {} - if num not in gene_exon_counts[gene]: - gene_exon_counts[gene][num] = 1 - else: - gene_exon_counts[gene][num] += 1 - - for gene, exon_counts in gene_exon_counts.items(): - print >> sys.stderr, "%s exon counts:" % gene, exon_counts - - tmp_locus_list = [] - for gene in locus_list: - if gene in remove_locus_list: - continue - if base_fname == "hla" and gene not in gene_exons: - continue - tmp_locus_list.append(gene) - locus_list = tmp_locus_list - for key in genes.keys(): - if key in locus_list: - continue - del genes[key] - del gene_strand[key] - - # Write the backbone sequences into a fasta file - backbone_file = open(base_fullpath_name + "_backbone.fa", 'w') - # variants w.r.t the backbone sequences into a SNP file - var_file = open(base_fullpath_name + ".snp", 'w') - var_index_file = open(base_fullpath_name + ".index.snp", 'w') - # variant frequence - var_freq_file = open(base_fullpath_name + ".snp.freq", 'w') - # haplotypes - haplotype_file = open(base_fullpath_name + ".haplotype", 'w') - # pairs of a variant and the corresponding HLA allels into a LINK file - link_file = open(base_fullpath_name + ".link", 'w') - # Write all the sequences with dots removed into a file - input_file = open(base_fullpath_name + "_sequences.fa", 'w') - # Write allele names into a file - allele_file = open("%s.allele" % base_fullpath_name, 'w') - # Read partial alleles from hla.data, and write them into a file - partial_file = open("%s.partial" % base_fullpath_name, 'w') - - num_vars, num_haplotypes = 0, 0 - full_alleles = {} - for gene, ref_gene in genes.items(): - strand = gene_strand[gene] - left_ext_seq, right_ext_seq = "", "" - if gene in left_ext_seq_dic: - left_ext_seq, right_ext_seq = left_ext_seq_dic[gene], right_ext_seq_dic[gene] - - def read_MSF_file(fname, left_ext_seq = "", right_ext_seq = ""): - names = {} # HLA allele names to numeric IDs - seqs = [] # HLA multiple alignment sequences - for line in open(fname): - line = line.strip() - if not line or \ - not line[0].isalnum(): - continue - - if line.startswith("MSF"): - continue - - if line.startswith("Name"): - try: - name = line.split('\t')[0] - name = name.split()[1] - except ValueError: - continue - - if name in names: - print >> sys.stderr, "Warning: %s is found more than once in Names" % (name) - continue - - names[name] = len(names) - else: - if len(seqs) == 0: - seqs = [left_ext_seq for i in range(len(names))] - try: - cols = line.split() - name = cols[0] - fives = cols[1:] - assert len(fives) > 0 - except ValueError: - continue - - if name not in names: - names[name] = len(names) - - id = names[name] - if id >= len(seqs): - assert id == len(seqs) - seqs.append(left_ext_seq) - - seqs[id] += ''.join(fives) - - # Add sub-names of the allele - sub_name = "" - for group in name.split(':')[:-1]: - if sub_name != "": - sub_name += ":" - sub_name += group - if sub_name not in full_alleles: - full_alleles[sub_name] = [name] - else: - full_alleles[sub_name].append(name) - - if len(right_ext_seq) > 0: - for i_ in range(len(seqs)): - seqs[i_] += right_ext_seq - - return names, seqs - - if base_fname == "hla": - MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene) - else: - MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene) - - if not os.path.exists(MSA_fname): - print >> sys.stderr, "Warning: %s does not exist" % MSA_fname - continue - - names, seqs = read_MSF_file(MSA_fname, left_ext_seq, right_ext_seq) - full_allele_names = set(names.keys()) - - # Identify a consensus sequence - assert len(seqs) > 0 - - # Check sequences are of equal length - def find_seq_len(seqs): - seq_lens = {} - for s in range(len(seqs)): - seq_len = len(seqs[s]) - if seq_len not in seq_lens: - seq_lens[seq_len] = 1 - else: - seq_lens[seq_len] += 1 - - max_seq_count = 0 - for tmp_seq_len, tmp_seq_count in seq_lens.items(): - if tmp_seq_count > max_seq_count: - seq_len = tmp_seq_len - max_seq_count = tmp_seq_count - return seq_len - - seq_len = find_seq_len(seqs) - backbone_name = "%s*BACKBONE" % gene - backbone_seq, backbone_freq = create_consensus_seq(seqs, - seq_len, - min_var_freq, - not partial) # Remove empty sequences? - # Allele sequences can shrink, so readjust the sequence length - if not partial: - seq_len = find_seq_len(seqs) - - if partial and base_fname == "hla": - partial_MSA_fname = "hisatgenotype_db/HLA/msf/%s_nuc.msf" % gene - if not os.path.exists(partial_MSA_fname): - print >> sys.stderr, "Warning: %s does not exist" % partial_MSA_fname - continue - partial_names, partial_seqs = read_MSF_file(partial_MSA_fname) - - # DK - debugging purposes - # Partial alleles vs. Full alleles - """ - counts = [0, 0, 0, 0] - for partial_name in partial_names.keys(): - if partial_name in names: - continue - name_group = partial_name.split(':') - for group_i in [3, 2, 1, 0]: - if group_i == 0: - counts[group_i] += 1 - if group_i > len(name_group): - continue - sub_name = ':'.join(name_group[:group_i]) - if sub_name in full_alleles: - print partial_name, sub_name, full_alleles[sub_name][:5] - counts[group_i] += 1 - break - print "DK: counts:", counts - sys.exit(1) - """ - - ref_seq = seqs[names[ref_gene]] - ref_seq_map = create_map(ref_seq) - ref_partial_seq = partial_seqs[partial_names[ref_gene]] - ref_partial_seq_map = create_map(ref_partial_seq) - exons = gene_exons[gene] - exon_len = 0 - ref_exons = [] # converted exons to MSF file (e.g. A_gen.msf) - ref_partial_exons = [] # converted exons to MSF file (e.g. A_nuc.msf) - - complete = True - for exon in exons: - left, right = exon - ref_exons.append([ref_seq_map[left], ref_seq_map[right]]) - next_exon_len = right - left + exon_len - if next_exon_len >= len(ref_partial_seq_map): - print >> sys.stderr, "Warning: partial sequences (%s) seem to be incomplete" % gene - complete = False - break - ref_partial_exons.append([ref_partial_seq_map[exon_len], ref_partial_seq_map[next_exon_len]]) - exon_len += (right - left + 1) - # Make sure two MSF files (e.g. A_gen.msf and A_nuc.msf) share the same MSF lengths in the exonic sequences - ref_exon_len = ref_exons[-1][1] - ref_exons[-1][0] + 1 - ref_partial_exon_len = ref_partial_exons[-1][1] - ref_partial_exons[-1][0] + 1 - assert ref_exon_len == ref_partial_exon_len - - if complete: - partial_seq_len = find_seq_len(partial_seqs) - partial_backbone_seq, partial_backbone_freq = create_consensus_seq(partial_seqs, - partial_seq_len, - min_var_freq, - False) # Remove empty sequences? - for name, seq_id in partial_names.items(): - if name in names: - continue - seq = partial_seqs[seq_id] - new_seq = "" - right = 0 - for e in range(len(exons)): - ref_exon = ref_exons[e] - ref_partial_exon = ref_partial_exons[e] - new_seq += backbone_seq[right:ref_exon[0]] - exon_seq = seq[ref_partial_exon[0]:ref_partial_exon[1] + 1] - nt_exon_seq = exon_seq.replace('.', '') - if len(nt_exon_seq) == 0: - exon_seq = partial_backbone_seq[ref_partial_exon[0]:ref_partial_exon[1] + 1] - new_seq += exon_seq - right = ref_exon[1] + 1 - new_seq += backbone_seq[right:] - names[name] = len(seqs) - seqs.append(new_seq) - - backbone_seq, backbone_freq = create_consensus_seq(seqs, - seq_len, - min_var_freq, - True) # Remove empty sequences? - seq_len = find_seq_len(seqs) - - if min_var_freq <= 0.0: - assert '.' not in backbone_seq and 'E' not in backbone_seq - - # Reverse complement MSF if this gene is on '-' strand - if strand == '-': - # Reverse exons - ref_seq = seqs[names[ref_gene]] - ref_seq = ref_seq.replace('.', '') - ref_seq_len = len(ref_seq) - if base_fname == "hla": - exons = [] - for left, right in reversed(gene_exons[gene]): - left, right = ref_seq_len - right - 1, ref_seq_len - left - 1 - exons.append([left, right]) - gene_exons[gene] = exons - exon_counts = {} - for exon_i, count in gene_exon_counts[gene].items(): - exon_counts[len(gene_exons[gene]) - exon_i - 1] = count - gene_exon_counts[gene] = exon_counts - - for i in range(len(seqs)): - seqs[i] = typing_common.reverse_complement(seqs[i]) - backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True) - - if leftshift: - for seq_i in range(len(seqs)): - seqs[seq_i] = leftshift_deletions(backbone_seq, seqs[seq_i]) - backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True) - seq_len = find_seq_len(seqs) - - print >> sys.stderr, "%s: number of HLA alleles is %d." % (gene, len(names)) - - Vars = {} - for cmp_name, id in names.items(): - if cmp_name == backbone_name: - continue - assert id < len(seqs) - cmp_seq = seqs[id] - if len(cmp_seq) != seq_len: - print >> sys.stderr, "Warning: the length of %s (%d) is different from %d" % \ - (cmp_name, len(cmp_seq), seq_len) - continue - - # DK - debugging purposes - """ - if cmp_name == "A*03:01:07": - print cmp_name - cmp_seq2 = seqs[names["A*32:29"]] - for s in range(0, seq_len, 100): - print s, backbone_seq[s:s+100] - print s, cmp_seq2[s:s+100] - print s, cmp_seq[s:s+100] - # sys.exit(1) - """ - def insertVar(type, info): - pos, backbone_pos, data = info - if type in "MI": - varKey = "%d-%s-%s" % (pos, type, data) - else: - varKey = "%d-%s-%d" % (pos, type, data) - - if varKey not in Vars: - if type == 'M': - assert backbone_pos < backbone_freq - assert data in backbone_freq[backbone_pos] - freq = backbone_freq[backbone_pos][data] - elif type == 'D': - del_len = int(data) - freq = 100.0 - assert backbone_pos + del_len <= backbone_freq - for d in range(del_len): - assert '.' in backbone_freq[backbone_pos + d] - freq2 = backbone_freq[backbone_pos + d]['.'] - if freq2 < freq: - freq = freq2 - else: - assert type == 'I' - ins_len = len(data) - freq = 100.0 - assert backbone_pos + ins_len <= backbone_freq - for i in range(ins_len): - nt = data[i] - assert nt in backbone_freq[backbone_pos + i] - freq2 = backbone_freq[backbone_pos + i][nt] - if freq2 < freq: - freq = freq2 - assert freq <= min_var_freq - - Vars[varKey] = [freq, [cmp_name]] - else: - Vars[varKey][1].append(cmp_name) - - insertion, deletion = [], [] - ndots = 0 - for s in range(seq_len): - assert not (insertion and deletion) - bc = backbone_seq[s] - cc = cmp_seq[s] - if bc != '.' and cc != '.': - if insertion: - insertVar('I', insertion) - insertion = [] - elif deletion: - insertVar('D', deletion) - deletion = [] - if bc != cc: - mismatch = [s - ndots, s, cc] - insertVar('M', mismatch) - elif bc == '.' and cc != '.': - if deletion: - insertVar('D', deletion) - deletion = [] - if insertion: - insertion[2] += cc - else: - insertion = [s - ndots, s, cc] - elif bc != '.' and cc == '.': - if insertion: - insertVar('I', insertion) - insertion = [] - if deletion: - deletion[2] += 1 - else: - deletion = [s - ndots, s, 1] - - if bc == '.': - ndots += 1 - - """ - if backbone_seq[s] != cmp_seq[s]: - print "%s is different %s at %d: %s vs. %s" % \ - (backbone_name, cmp_name, s+1, backbone_seq[s], cmp_seq[s]) - """ - - if insertion: - insertVar('I', insertion) - elif deletion: - insertVar('D', deletion) - - - print >> sys.stderr, "Number of variants is %d." % (len(Vars.keys())) - - # Compare variants - def cmp_varKey(a, b): - a_locus, a_type, a_data = a.split('-') - b_locus, b_type, b_data = b.split('-') - a_locus, b_locus = int(a_locus), int(b_locus) - if a_locus != b_locus: - return a_locus - b_locus - if a_type != b_type: - if a_type == 'I': - return -1 - elif b_type == 'I': - return 1 - elif a_type == 'M': - return -1 - else: - assert b_type == 'M' - return 1 - assert a_data != b_data - if a_type in "MI": - if a_data < b_data: - return -1 - else: - return 1 - else: - assert a_type == 'D' - return int(a_data) - int(b_data) - - Vars_ = {} - for key, values in Vars.items(): - freq, names_ = values - for name in names_: - if not name in Vars_: - Vars_[name] = [key] - else: - Vars_[name].append(key) - for name, vars in Vars_.items(): - Vars_[name] = sorted(vars, cmp=cmp_varKey) - - # Sanity check - - # (1) Reconstruct the other sequences from the backbone sequence and variants and - # (2) Confirm these constructed sequences are the same as those input sequences. - for cmp_name, id in names.items(): - if cmp_name == backbone_name: - continue - - constr_seq = backbone_seq.replace('.', '') - constr_seq = list(constr_seq) - locus_diff = 0 - - if cmp_name not in Vars_: - continue - - for var in Vars_[cmp_name]: - try: - locus, type, data = var.split('-') - locus = int(locus) - except ValueError: - continue - - if type == 'M': - assert len(data) == 1 - constr_seq[locus + locus_diff] = data[0] - elif type == 'I': - assert locus + locus_diff >= 0 - assert locus + locus_diff <= len(constr_seq) - constr_seq = constr_seq[:locus + locus_diff] + list(data) + constr_seq[locus + locus_diff:] - locus_diff += len(data) - else: - assert type == 'D' - assert locus + locus_diff + len(data) <= len(constr_seq) - assert locus + locus_diff >= 0 - del_len = int(data) - constr_seq = constr_seq[:locus + locus_diff] + constr_seq[locus + locus_diff + del_len:] - locus_diff -= del_len - - constr_seq = "".join(constr_seq) - assert id < len(seqs) - cmp_seq = seqs[id].replace('.', '') - if len(constr_seq) != len(cmp_seq): - print >> sys.stderr, "Error: reconstruction fails (%s)! Lengths different: %d vs. %d" % \ - (cmp_name, len(constr_seq), len(cmp_seq)) - assert False - - # Sanity check - for s in range(len(constr_seq)): - if constr_seq[s] != cmp_seq[s]: - print >> sys.stderr, "Differ at %d: %s vs. %s (reconstruction vs. original)" % \ - (s, constr_seq[s], cmp_seq[s]) - print "%s:%s vs. %s:%s" % \ - (constr_seq[s-10:s], constr_seq[s:s+10], cmp_seq[s-10:s], cmp_seq[s:s+10]) - - if constr_seq != cmp_seq.replace('.', ''): - print >> sys.stderr, "Error: reconstruction fails for %s" % (cmp_name) - assert False - - # Write the backbone sequences into a fasta file - print >> backbone_file, ">%s" % (backbone_name) - backbone_seq_ = backbone_seq.replace('.', '') - for s in range(0, len(backbone_seq_), 60): - print >> backbone_file, backbone_seq_[s:s+60] - - # Remap the backbone allele, which is sometimes slighly different from - # fasta version - ref_backbone_id = names[ref_gene] - ref_backbone_seq = seqs[ref_backbone_id] - aligner_cmd = ["hisat2"] - if base_fname == "hla": - aligner_cmd += ["--score-min", "C,0"] - aligner_cmd += ["--no-unal", - "-x", "grch38/genome", - "-f", - "-c", "%s" % ref_backbone_seq.replace('.', '')] - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - best_chr, best_left, best_right, best_AS = "", 0, 0, -sys.maxint - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - allele_id, flag, chr, left, mapQ, cigar_str = cols[:6] - flag = int(flag) - assert flag & 0x10 == 0 - left = int(left) - 1 - right = left - AS = "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = col[5:] - AS = int(AS) - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MND": - right += length - if AS > best_AS: - best_chr, best_left, best_right, best_AS = chr, left, right, AS - - chr, left, right = best_chr, best_left, best_right - align_proc.communicate() - if left == right: - print >> sys.stderr, "Warning: %s (%s) is not remapped" % (gene, ref_gene) - continue - assert left < right - - base_locus = 0 - ref_seq = seqs[names[ref_gene]] - ref_seq_map = create_map(ref_seq) - - del_count = [] - for nt in backbone_seq: - assert nt in "ACGT." - add = 1 if nt == '.' else 0 - if len(del_count) == 0: - del_count.append(add) - else: - del_count.append(del_count[-1] + add) - - if base_fname == "hla": - exon_str = "" - for exon_i in range(len(gene_exons[gene])): - exon_left, exon_right = gene_exons[gene][exon_i] - exon_left, exon_right = ref_seq_map[exon_left], ref_seq_map[exon_right] - exon_left -= del_count[exon_left] - exon_right -= del_count[exon_right] - if exon_str != "": - exon_str += ',' - primary = gene_exon_counts[gene][exon_i] == max(gene_exon_counts[gene].values()) - exon_str += ("%d-%d%s" % (exon_left, exon_right, 'p' if primary else '')) - - # Sanity check for exonic sequence - sanity_check = True - if sanity_check and \ - os.path.exists("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene): - exons_ = [] - for exon in exon_str.split(','): - if exon.endswith('p'): - exon = exon[:-1] - exon_left, exon_right = exon.split('-') - exon_left, exon_right = int(exon_left), int(exon_right) - exons_.append([exon_left, exon_right]) - - backbone_seq_ = backbone_seq.replace('.', '') - if ref_gene in Vars_: - vars_ = Vars_[ref_gene] - else: - vars_ = [] - seq_ = list(backbone_seq_) - has_insertion = False - for var_ in vars_: - var_pos, var_type, var_data = var_.split('-') - var_pos = int(var_pos) - assert var_pos >= 0 and var_pos < len(backbone_seq_) - if var_type == 'M': - seq_[var_pos] = var_data - elif var_type == 'D': - del_len = int(var_data) - assert var_pos + del_len <= len(ref_seq) - seq_[var_pos:var_pos + del_len] = ['.'] * del_len - else: - assert var_type == 'I' - has_insertion = True - - seq_ = ''.join(seq_) - exon_seq_ = "" - for exon_left, exon_right in exons_: - exon_seq_ += seq_[exon_left:exon_right+1] - exon_seq_ = exon_seq_.replace('.', '') - if gene_strand[gene] == '-': - exon_seq_ = typing_common.reverse_complement(exon_seq_) - - cmp_exon_seq_, allele_name_ = "", "" - for line in open("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene): - if line.startswith(">"): - if allele_name_ == ref_gene: - break - allele_name_ = line.strip().split()[1] - cmp_exon_seq_ = "" - else: - cmp_exon_seq_ += line.strip() - """ - print "Has insertions:", has_insertion - print "constructed:", len(exon_seq_) - for p in range(0, len(exon_seq_), 60): - print exon_seq_[p:p+60] - print "true:", len(cmp_exon_seq_) - for p in range(0, len(cmp_exon_seq_), 60): - print cmp_exon_seq_[p:p+60] - """ - if exon_seq_ != cmp_exon_seq_: - print >> sys.stderr, "Waring: exonic sequences do not match (%s)" % gene - else: - exon_str = "%d-%d" % (left, right - 1) - - print >> locus_file, "%s\t%s\t%d\t%d\t%d\t%s\t%s" % \ - (backbone_name, chr, left, right - 1, len(backbone_seq.replace('.', '')), exon_str, gene_strand[gene]) - - # Write - # (1) variants w.r.t the backbone sequences into a SNP file - # (2) pairs of a variant and the corresponding HLA allels into a LINK file - keys = sorted(Vars.keys(), cmp=cmp_varKey) - var2ID = {} - for k in range(len(keys)): - locus, type, data = keys[k].split('-') - locus = int(locus) - if type == 'M': - type_str = "single" - elif type == 'I': - type_str = "insertion" - else: - assert type == 'D' - type_str = "deletion" - - freq, names_ = Vars[keys[k]] - names_ = sorted(names_) - varID = "hv%d" % (num_vars) - tmp_backbone_name = backbone_name - print >> var_file, "%s\t%s\t%s\t%d\t%s" % \ - (varID, type_str, tmp_backbone_name, base_locus + locus, data) - if freq >= min_var_freq: - print >> var_index_file, "%s\t%s\t%s\t%d\t%s" % \ - (varID, type_str, tmp_backbone_name, base_locus + locus, data) - print >> var_freq_file, "%s\t%.2f" % (varID, freq) - print >> link_file, "%s\t%s" % (varID, ' '.join(names_)) - var2ID[keys[k]] = num_vars - num_vars += 1 - - add_seq_len = 0 - # Write haplotypes - excluded_vars = set() - var_leftmost, var_rightmost = sys.maxint, -1 - for k in range(len(keys)): - key = keys[k] - if Vars[key][0] < min_var_freq: - excluded_vars.add(key) - - # Update leftmost and rightmost of Vars - locus, type, data = key.split('-') - left = right = int(locus) - if type == 'D': - right = left + int(data) - 1 - if k == 0: - var_leftmost = left - if var_rightmost < right: - var_rightmost = right - - i = 0 - while i < len(keys): - key_i = keys[i] - locus, type, data = key_i.split('-') - locus = int(locus) - if type == 'D': - locus += (int(data) - 1) - prev_locus = locus - if whole_haplotype: - j = len(keys) - else: - j = i + 1 - while j < len(keys): - key_j = keys[j] - locus2, type2, data2 = key_j.split('-') - locus2 = int(locus2) - if prev_locus + inter_gap < locus2: - break - prev_locus = locus2 - if type == 'D': - prev_locus += (int(data) - 1) - j += 1 - - alleles = set() - for k in range(i, j): - key_k = keys[k] - freq, names_ = Vars[key_k] - if freq < min_var_freq: - continue - add_alleles = set(names_) - alleles |= add_alleles - - haplotypes = set() - cur_vars = set(keys[i:j]) - excluded_vars - for allele in alleles: - allele_vars = set(Vars_[allele]) - excluded_vars - allele_cur_vars = '#'.join(sorted(list(cur_vars & allele_vars), cmp=cmp_varKey)) - haplotypes.add(allele_cur_vars) - - # Split some haplotypes that include large gaps inside - def split_haplotypes(haplotypes): - split_haplotypes = set() - for haplotype in haplotypes: - haplotype = haplotype.split('#') - assert len(haplotype) > 0 - if len(haplotype) == 1: - split_haplotypes.add(haplotype[0]) - continue - prev_s, s = 0, 1 - while s < len(haplotype): - prev_locus, prev_type, prev_data = haplotype[s-1].split('-') - locus, type, data = haplotype[s].split('-') - prev_locus, locus = int(prev_locus), int(locus) - if prev_type == 'D': - prev_locus += (int(prev_data) - 1) - if prev_locus + intra_gap < locus: - split_haplotypes.add('#'.join(haplotype[prev_s:s])) - prev_s = s - s += 1 - if s == len(haplotype): - split_haplotypes.add('#'.join(haplotype[prev_s:s])) - return split_haplotypes - - if not whole_haplotype: - haplotypes = split_haplotypes(haplotypes) - - def cmp_haplotype(a, b): - a = a.split('#') - a1_locus, _, _ = a[0].split('-') - a2_locus, a2_type, a2_data = a[-1].split('-') - a_begin, a_end = int(a1_locus), int(a2_locus) - if a2_type == 'D': - a_end += (int(a2_data) - 1) - b = b.split('#') - b1_locus, _, _ = b[0].split('-') - b2_locus, b2_type, b2_data = b[-1].split('-') - b_begin, b_end = int(b1_locus), int(b2_locus) - if b2_type == 'D': - b_end += (int(b2_data) - 1) - if a_begin != b_begin: - return a_begin - b_begin - return a_end - b_end - - haplotypes = sorted(list(haplotypes), cmp=cmp_haplotype) - - # DK - for debugging purposes - """ - dis = prev_locus - locus - print "\n[%d, %d]: %d haplotypes" % (i, j, len(haplotypes)), dis - if len(cur_vars) in range(0, 1000): - # print "vars:", sorted(list(cur_vars), cmp=cmp_varKey - print "num:", len(haplotypes) - for haplotype in haplotypes: - print haplotype.split('#') - print "\nnum:", len(haplotypes2) - for haplotype in haplotypes2: - print haplotype.split('#') - """ - - # Write haplotypes - sanity_vars = set() - for h_i in range(len(haplotypes)): - h = haplotypes[h_i].split('#') - varIDs = [] - for var in h: - varIDs.append("hv%s" % var2ID[var]) - # DK - for debugging purposes - # varIDs.append(var) - sanity_vars.add(var2ID[var]) - if whole_haplotype: - h_begin, h_end = var_leftmost, var_rightmost - else: - h1_locus, _, _ = h[0].split('-') - h2_locus, h2_type, h2_data = h[-1].split('-') - h_begin, h_end = int(h1_locus), int(h2_locus) - if h2_type == 'D': - h_end += (int(h2_data) - 1) - assert h_begin <= h_end - h_new_begin = h_begin - for h_j in reversed(range(0, h_i)): - hc = haplotypes[h_j].split('#') - hc_begin, hc_type, hc_data = hc[-1].split('-') - hc_begin = int(hc_begin) - hc_end = hc_begin - if hc_type == 'D': - hc_end += (int(hc_data) - 1) - if hc_end + inter_gap < h_begin: - break - if h_new_begin > hc_end: - h_new_begin = hc_end - assert h_new_begin <= h_begin - h_begin = h_new_begin - tmp_backbone_name = backbone_name - print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (num_haplotypes, tmp_backbone_name, base_locus + h_begin, base_locus + h_end, ','.join(varIDs)) - num_haplotypes += 1 - add_seq_len += (h_end - h_begin + 1) - assert len(sanity_vars) == len(cur_vars) - - i = j - - print >> sys.stderr, "Length of additional sequences for haplotypes:", add_seq_len - - # Write all the sequences with dots removed into a file - for name, ID in names.items(): - print >> input_file, ">%s" % (name) - assert ID < len(seqs) - seq = seqs[ID].replace('.', '') - for s in range(0, len(seq), 60): - print >> input_file, seq[s:s+60] - print >> allele_file, name - - - # Write partial allele names - for name in names: - if name not in full_allele_names: - print >> partial_file, name - - backbone_file.close() - locus_file.close() - var_file.close() - var_index_file.close() - var_freq_file.close() - haplotype_file.close() - link_file.close() - input_file.close() - allele_file.close() - partial_file.close() - - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Extract variants from multiple sequence alignments") - parser.add_argument("-b", "--base", - dest="base_fname", - type=str, - default="hla", - help="base filename for backbone sequence, variants, and linking info (Default: hla)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="A comma-separated list of gene names (default: empty, all genes)") - parser.add_argument("--inter-gap", - dest="inter_gap", - type=int, - default=30, - help="Maximum distance for variants to be in the same haplotype (default: 30)") - parser.add_argument("--intra-gap", - dest="intra_gap", - type=int, - default=50, - help="Break a haplotype into several haplotypes (default: 50)") - parser.add_argument("--whole-haplotype", - dest="whole_haplotype", - action="store_true", - help="Include partial alleles (e.g. A_nuc.fasta)") - parser.add_argument("--min-var-freq", - dest="min_var_freq", - type=float, - default=0.0, - help="Exclude variants whose freq is below than this value in percentage (Default: 0.0)") - parser.add_argument("--ext-seq", - dest="ext_seq_len", - type=int, - default=0, - help="Length of extra sequences flanking backbone sequences (Default: 0)") - parser.add_argument("--leftshift", - dest="leftshift", - action="store_true", - help="Shift deletions to the leftmost") - parser.add_argument("--no-partial", - dest="partial", - action="store_false", - help="Exclude partial alleles, exon-only sequences in HLA") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.locus_list == "": - locus_list = [] - else: - locus_list = args.locus_list.split(',') - if args.inter_gap > args.intra_gap: - print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap) - sys.exit(1) - - if args.base_fname.find('/') != -1: - elems = args.base_fname.split('/') - base_fname = elems[-1] - base_dname = '/'.join(elems[:-1]) - else: - base_fname = args.base_fname - base_dname = "" - - extract_vars(base_fname, - base_dname, - locus_list, - args.inter_gap, - args.intra_gap, - args.whole_haplotype, - args.min_var_freq, - args.ext_seq_len, - args.leftshift, - args.partial, - args.verbose) - diff --git a/hisatgenotype_hla_cyp.py b/hisatgenotype_hla_cyp.py deleted file mode 100755 index cd97eea9..00000000 --- a/hisatgenotype_hla_cyp.py +++ /dev/null @@ -1,1671 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2015, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import sys, os, subprocess, re -import inspect, random -import math -from argparse import ArgumentParser, FileType - -""" -""" -def simulate_reads(HLAs, - test_HLA_list, - simulate_interval): - HLA_reads_1, HLA_reads_2 = [], [] - for test_HLA_names in test_HLA_list: - gene = test_HLA_names[0].split('*')[0] - # ref_allele = refHLAs[gene] - # ref_seq = HLAs[gene][ref_allele] - - # Simulate reads from two HLA alleles - def simulate_reads_impl(seq, simulate_interval = 1, frag_len = 250, read_len = 100): - comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} - reads_1, reads_2 = [], [] - for i in range(0, len(seq) - frag_len + 1, simulate_interval): - reads_1.append(seq[i:i+read_len]) - tmp_read_2 = reversed(seq[i+frag_len-read_len:i+frag_len]) - read_2 = "" - for s in tmp_read_2: - if s in comp_table: - read_2 += comp_table[s] - else: - read_2 += s - reads_2.append(read_2) - return reads_1, reads_2 - - for test_HLA_name in test_HLA_names: - HLA_seq = HLAs[gene][test_HLA_name] - tmp_reads_1, tmp_reads_2 = simulate_reads_impl(HLA_seq, simulate_interval) - HLA_reads_1 += tmp_reads_1 - HLA_reads_2 += tmp_reads_2 - - # Write reads into a fasta read file - def write_reads(reads, idx): - read_file = open('hla_input_%d.fa' % idx, 'w') - for read_i in range(len(reads)): - print >> read_file, ">%d" % (read_i + 1) - print >> read_file, reads[read_i] - read_file.close() - write_reads(HLA_reads_1, 1) - write_reads(HLA_reads_2, 2) - - -""" -Align reads, and sort the alignments into a BAM file -""" -def align_reads(ex_path, - base_fname, - aligner, - index_type, - read_fname, - fastq, - threads, - verbose): - if aligner == "hisat2": - hisat2 = os.path.join(ex_path, "hisat2") - aligner_cmd = [hisat2, - "--no-unal", - "--mm"] - if index_type == "linear": - aligner_cmd += ["-k", "10"] - aligner_cmd += ["-x", "%s.%s" % (base_fname, index_type)] - elif aligner == "bowtie2": - aligner_cmd = [aligner, - "--no-unal", - "-k", "10", - "-x", base_fname] - else: - assert False - assert len(read_fname) in [1,2] - aligner_cmd += ["-p", str(threads)] - if not fastq: - aligner_cmd += ["-f"] - if len(read_fname) == 1: - aligner_cmd += ["-U", read_fname[0]] - else: - aligner_cmd += ["-1", "%s" % read_fname[0], - "-2", "%s" % read_fname[1]] - - if verbose: - print >> sys.stderr, ' '.join(aligner_cmd) - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sambam_cmd = ["samtools", - "view", - "-bS", - "-"] - sambam_proc = subprocess.Popen(sambam_cmd, - stdin=align_proc.stdout, - stdout=open("hla_input_unsorted.bam", 'w'), - stderr=open("/dev/null", 'w')) - sambam_proc.communicate() - if index_type == "graph": - bamsort_cmd = ["samtools", - "sort", - "hla_input_unsorted.bam", - "-o", "hla_input.bam"] - bamsort_proc = subprocess.Popen(bamsort_cmd, - stderr=open("/dev/null", 'w')) - bamsort_proc.communicate() - - bamindex_cmd = ["samtools", - "index", - "hla_input.bam"] - bamindex_proc = subprocess.Popen(bamindex_cmd, - stderr=open("/dev/null", 'w')) - bamindex_proc.communicate() - - os.system("rm hla_input_unsorted.bam") - else: - os.system("mv hla_input_unsorted.bam hla_input.bam") - - -""" -""" -def normalize(prob): - total = sum(prob.values()) - for allele, mass in prob.items(): - prob[allele] = mass / total - - -""" -""" -def prob_diff(prob1, prob2): - diff = 0.0 - for allele in prob1.keys(): - if allele in prob2: - diff += abs(prob1[allele] - prob2[allele]) - else: - diff += prob1[allele] - return diff - - -""" -""" -def HLA_prob_cmp(a, b): - if a[1] != b[1]: - if a[1] < b[1]: - return 1 - else: - return -1 - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - - -""" -""" -def single_abundance(HLA_cmpt, - HLA_length): - def normalize2(prob, length): - total = 0 - for allele, mass in prob.items(): - assert allele in length - total += (mass / length[allele]) - for allele, mass in prob.items(): - assert allele in length - prob[allele] = mass / length[allele] / total - - HLA_prob, HLA_prob_next = {}, {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - for allele in alleles: - if allele not in HLA_prob: - HLA_prob[allele] = 0.0 - HLA_prob[allele] += (float(count) / len(alleles)) - - # normalize2(HLA_prob, HLA_length) - normalize(HLA_prob) - def next_prob(HLA_cmpt, HLA_prob, HLA_length): - HLA_prob_next = {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - alleles_prob = 0.0 - for allele in alleles: - assert allele in HLA_prob - alleles_prob += HLA_prob[allele] - for allele in alleles: - if allele not in HLA_prob_next: - HLA_prob_next[allele] = 0.0 - HLA_prob_next[allele] += (float(count) * HLA_prob[allele] / alleles_prob) - # normalize2(HLA_prob_next, HLA_length) - normalize(HLA_prob_next) - return HLA_prob_next - - diff, iter = 1.0, 0 - while diff > 0.0001 and iter < 1000: - HLA_prob_next = next_prob(HLA_cmpt, HLA_prob, HLA_length) - diff = prob_diff(HLA_prob, HLA_prob_next) - HLA_prob = HLA_prob_next - iter += 1 - for allele, prob in HLA_prob.items(): - allele_len = HLA_length[allele] - HLA_prob[allele] /= float(allele_len) - normalize(HLA_prob) - HLA_prob = [[allele, prob] for allele, prob in HLA_prob.items()] - HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp) - return HLA_prob - - -""" -""" -def joint_abundance(HLA_cmpt, - HLA_length): - allele_names = set() - for cmpt in HLA_cmpt.keys(): - allele_names |= set(cmpt.split('-')) - - HLA_prob, HLA_prob_next = {}, {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - for allele1 in alleles: - for allele2 in allele_names: - if allele1 < allele2: - allele_pair = "%s-%s" % (allele1, allele2) - else: - allele_pair = "%s-%s" % (allele2, allele1) - if not allele_pair in HLA_prob: - HLA_prob[allele_pair] = 0.0 - HLA_prob[allele_pair] += (float(count) / len(alleles)) - - if len(HLA_prob) <= 0: - return HLA_prob - - # Choose top allele pairs - def choose_top_alleles(HLA_prob): - HLA_prob_list = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()] - HLA_prob_list = sorted(HLA_prob_list, cmp=HLA_prob_cmp) - HLA_prob = {} - best_prob = HLA_prob_list[0][1] - for i in range(len(HLA_prob_list)): - allele_pair, prob = HLA_prob_list[i] - if prob * 2 <= best_prob: - break - HLA_prob[allele_pair] = prob - normalize(HLA_prob) - return HLA_prob - HLA_prob = choose_top_alleles(HLA_prob) - - def next_prob(HLA_cmpt, HLA_prob): - HLA_prob_next = {} - for cmpt, count in HLA_cmpt.items(): - alleles = cmpt.split('-') - prob = 0.0 - for allele in alleles: - for allele_pair in HLA_prob.keys(): - if allele in allele_pair: - prob += HLA_prob[allele_pair] - for allele in alleles: - for allele_pair in HLA_prob.keys(): - if not allele in allele_pair: - continue - if allele_pair not in HLA_prob_next: - HLA_prob_next[allele_pair] = 0.0 - HLA_prob_next[allele_pair] += (float(count) * HLA_prob[allele_pair] / prob) - normalize(HLA_prob_next) - return HLA_prob_next - - diff, iter = 1.0, 0 - while diff > 0.0001 and iter < 1000: - HLA_prob_next = next_prob(HLA_cmpt, HLA_prob) - diff = prob_diff(HLA_prob, HLA_prob_next) - HLA_prob = HLA_prob_next - HLA_prob = choose_top_alleles(HLA_prob) - iter += 1 - - HLA_prob = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()] - HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp) - return HLA_prob - - -""" -""" -def HLA_typing(ex_path, - base_fname, - simulation, - reference_type, - hla_list, - partial, - refHLAs, - HLAs, - HLA_names, - HLA_lengths, - refHLA_loci, - Vars, - Var_list, - Links, - exclude_allele_list, - aligners, - num_mismatch, - fastq, - read_fname, - alignment_fname, - threads, - enable_coverage, - best_alleles, - verbose): - - def lower_bound(Var_list, pos): - low, high = 0, len(Var_list) - while low < high: - m = (low + high) / 2 - m_pos = Var_list[m][0] - if m_pos < pos: - low = m + 1 - elif m_pos > pos: - high = m - else: - assert m_pos == pos - while m > 0: - if Var_list[m-1][0] < pos: - break - m -= 1 - return m - return low - - if simulation: - test_passed = {} - for aligner, index_type in aligners: - if index_type == "graph": - print >> sys.stderr, "\n\t\t%s %s on %s" % (aligner, index_type, reference_type) - else: - print >> sys.stderr, "\n\t\t%s %s" % (aligner, index_type) - - if alignment_fname == "": - # Align reads, and sort the alignments into a BAM file - align_reads(ex_path, - base_fname, - aligner, - index_type, - read_fname, - fastq, - threads, - verbose) - - for test_HLA_names in hla_list: - if simulation: - gene = test_HLA_names[0].split('*')[0] - else: - gene = test_HLA_names - - ref_allele = refHLAs[gene] - ref_seq = HLAs[gene][ref_allele] - ref_exons = refHLA_loci[gene][-1] - - # Read alignments - alignview_cmd = ["samtools", - "view"] - if alignment_fname == "": - alignview_cmd += ["hla_input.bam"] - else: - if not os.path.exists(alignment_fname + ".bai"): - os.system("samtools index %s" % alignment_fname) - alignview_cmd += [alignment_fname] - base_locus = 0 - if index_type == "graph": - if reference_type == "gene": - alignview_cmd += ["%s" % ref_allele] - else: - assert reference_type in ["chromosome", "genome"] - _, chr, left, right, _ = refHLA_loci[gene] - base_locus = left - alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)] - - bamview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sort_read_cmd = ["sort", "-k", "1", "-n"] - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - else: - alignview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - # Count alleles - HLA_counts, HLA_cmpt = {}, {} - coverage = [0 for i in range(len(ref_seq) + 1)] - num_reads, total_read_len = 0, 0 - prev_read_id = None - prev_exon = False - if index_type == "graph": - # Cigar regular expression - cigar_re = re.compile('\d+\w') - for line in alignview_proc.stdout: - cols = line.strip().split() - read_id, flag, chr, pos, mapQ, cigar_str = cols[:6] - read_seq, qual = cols[9], cols[10] - num_reads += 1 - total_read_len += len(read_seq) - flag, pos = int(flag), int(pos) - pos -= (base_locus + 1) - if pos < 0: - continue - - if flag & 0x4 != 0: - continue - - NM, Zs, MD = "", "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("Zs"): - Zs = col[5:] - elif col.startswith("MD"): - MD = col[5:] - elif col.startswith("NM"): - NM = int(col[5:]) - - if NM > num_mismatch: - continue - - # daehwan - for debugging purposes - debug = False - if read_id in ["2339"] and False: - debug = True - print "read_id: %s)" % read_id, pos, cigar_str, "NM:", NM, MD, Zs - print " ", read_seq - - vars = [] - if Zs: - vars = Zs.split(',') - - assert MD != "" - MD_str_pos, MD_len = 0, 0 - read_pos, left_pos = 0, pos - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - cmp_list = [] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op == 'M': - # Update coverage - if enable_coverage: - if right_pos + length < len(coverage): - coverage[right_pos] += 1 - coverage[right_pos + length] -= 1 - elif right_pos < len(coverage): - coverage[right_pos] += 1 - coverage[-1] -= 1 - - first = True - MD_len_used = 0 - while True: - if not first or MD_len == 0: - if MD[MD_str_pos].isdigit(): - num = int(MD[MD_str_pos]) - MD_str_pos += 1 - while MD_str_pos < len(MD): - if MD[MD_str_pos].isdigit(): - num = num * 10 + int(MD[MD_str_pos]) - MD_str_pos += 1 - else: - break - MD_len += num - # Insertion or full match followed - if MD_len >= length: - MD_len -= length - cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used]) - break - first = False - read_base = read_seq[read_pos + MD_len] - MD_ref_base = MD[MD_str_pos] - MD_str_pos += 1 - assert MD_ref_base in "ACGT" - cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used]) - cmp_list.append(["mismatch", right_pos + MD_len, 1]) - MD_len_used = MD_len + 1 - MD_len += 1 - # Full match - if MD_len == length: - MD_len = 0 - break - elif cigar_op == 'I': - cmp_list.append(["insertion", right_pos, length]) - elif cigar_op == 'D': - if MD[MD_str_pos] == '0': - MD_str_pos += 1 - assert MD[MD_str_pos] == '^' - MD_str_pos += 1 - while MD_str_pos < len(MD): - if not MD[MD_str_pos] in "ACGT": - break - MD_str_pos += 1 - cmp_list.append(["deletion", right_pos, length]) - elif cigar_op == 'S': - cmp_list.append(["soft", right_pos, length]) - else: - assert cigar_op == 'N' - cmp_list.append(["intron", right_pos, length]) - - if cigar_op in "MND": - right_pos += length - - if cigar_op in "MIS": - read_pos += length - - exon = False - for exon in ref_exons: - exon_left, exon_right = exon - if right_pos <= exon_left or pos > exon_right: - continue - else: - exon = True - break - - if right_pos > len(ref_seq): - continue - - def add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, exon = True): - max_count = max(HLA_count_per_read.values()) - cur_cmpt = set() - for allele, count in HLA_count_per_read.items(): - if count < max_count: - continue - if allele in exclude_allele_list: - continue - cur_cmpt.add(allele) - if not allele in HLA_counts: - HLA_counts[allele] = 1 - else: - HLA_counts[allele] += 1 - - if len(cur_cmpt) == 0: - return - - # daehwan - for debugging purposes - alleles = ["", ""] - # alleles = ["B*40:304", "B*40:02:01"] - allele1_found, allele2_found = False, False - for allele, count in HLA_count_per_read.items(): - if count < max_count: - continue - if allele == alleles[0]: - allele1_found = True - elif allele == alleles[1]: - allele2_found = True - if allele1_found != allele2_found: - print alleles[0], HLA_count_per_read[alleles[0]] - print alleles[1], HLA_count_per_read[alleles[1]] - if allele1_found: - print ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, HLA_count_per_read[alleles[1]])) - else: - print ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, HLA_count_per_read[alleles[0]])) - print read_seq - - cur_cmpt = sorted(list(cur_cmpt)) - cur_cmpt = '-'.join(cur_cmpt) - add = 1 - if partial and not exon: - add *= 0.2 - if not cur_cmpt in HLA_cmpt: - HLA_cmpt[cur_cmpt] = add - else: - HLA_cmpt[cur_cmpt] += add - - if read_id != prev_read_id: - if prev_read_id != None: - add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, prev_exon) - - HLA_count_per_read = {} - for HLA_name in HLA_names[gene]: - if HLA_name.find("BACKBONE") != -1: - continue - HLA_count_per_read[HLA_name] = 0 - - def add_count(var_id, add): - assert var_id in Links - alleles = Links[var_id] - for allele in alleles: - if allele.find("BACKBONE") != -1: - continue - HLA_count_per_read[allele] += add - # daehwan - for debugging purposes - if debug: - if allele in ["DQA1*05:05:01:01", "DQA1*05:05:01:02"]: - print allele, add, var_id - - # Decide which allele(s) a read most likely came from - # also sanity check - read length, cigar string, and MD string - for var_id, data in Vars[gene].items(): - var_type, var_pos, var_data = data - if var_type != "deletion": - continue - if left_pos >= var_pos and right_pos <= var_pos + int(var_data): - add_count(var_id, -1) - ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", "" - cigar_match_len, MD_match_len = 0, 0 - for cmp in cmp_list: - type = cmp[0] - length = cmp[2] - if type == "match": - var_idx = lower_bound(Var_list[gene], ref_pos) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if ref_pos + length <= var_pos: - break - if ref_pos <= var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "insertion": - if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data): - add_count(var_id, -1) - # daehwan - for debugging purposes - if debug: - print cmp, var_id, Links[var_id] - elif var_type == "deletion": - del_len = int(var_data) - if ref_pos < var_pos and ref_pos + length > var_pos + del_len: - # daehwan - for debugging purposes - if debug: - print cmp, var_id, Links[var_id], -1, Vars[gene][var_id] - # Check if this might be one of the two tandem repeats (the same left coordinate) - cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2] - test1_seq1 = ref_seq[cmp_left:cmp_right] - test1_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos + del_len:cmp_right + del_len] - # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02) - cmp_left -= read_pos - cmp_right += (len(read_seq) - read_pos - cmp[2]) - test2_seq1 = ref_seq[cmp_left+int(var_data):cmp_right] - test2_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos+int(var_data):cmp_right] - if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2: - add_count(var_id, -1) - else: - if debug: - print cmp, var_id, Links[var_id], -1 - add_count(var_id, -1) - var_idx += 1 - - read_pos += length - ref_pos += length - cigar_match_len += length - MD_match_len += length - elif type == "mismatch": - read_base = read_seq[read_pos] - var_idx = lower_bound(Var_list[gene], ref_pos) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if ref_pos < var_pos: - break - if ref_pos == var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "single": - if var_data == read_base: - # daehwan - for debugging purposes - if debug: - print cmp, var_id, 1, var_data, read_base, Links[var_id] - - # daehwan - for debugging purposes - if False: - read_qual = ord(qual[read_pos]) - add_count(var_id, (read_qual - 60) / 60.0) - else: - add_count(var_id, 1) - # daehwan - check out if this routine is appropriate - # else: - # add_count(var_id, -1) - var_idx += 1 - - cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos])) - MD_match_len = 0 - cigar_match_len += 1 - read_pos += 1 - ref_pos += 1 - elif type == "insertion": - ins_seq = read_seq[read_pos:read_pos+length] - var_idx = lower_bound(Var_list[gene], ref_pos) - # daehwan - for debugging purposes - if debug: - print left_pos, cigar_str, MD, vars - print ref_pos, ins_seq, Var_list[gene][var_idx], Vars[gene][Var_list[gene][var_idx][1]] - # sys.exit(1) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if ref_pos < var_pos: - break - if ref_pos == var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "insertion": - if var_data == ins_seq: - # daehwan - for debugging purposes - if debug: - print cmp, var_id, 1, Links[var_id] - add_count(var_id, 1) - var_idx += 1 - - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - read_pos += length - cmp_cigar_str += ("%dI" % length) - elif type == "deletion": - del_len = length - # Deletions can be shifted bidirectionally - temp_ref_pos = ref_pos - while temp_ref_pos > 0: - last_bp = ref_seq[temp_ref_pos + del_len - 1] - prev_bp = ref_seq[temp_ref_pos - 1] - if last_bp != prev_bp: - break - temp_ref_pos -= 1 - var_idx = lower_bound(Var_list[gene], temp_ref_pos) - while var_idx < len(Var_list[gene]): - var_pos, var_id = Var_list[gene][var_idx] - if temp_ref_pos < var_pos: - first_bp = ref_seq[temp_ref_pos] - next_bp = ref_seq[temp_ref_pos + del_len] - if first_bp == next_bp: - temp_ref_pos += 1 - continue - else: - break - if temp_ref_pos == var_pos: - var_type, _, var_data = Vars[gene][var_id] - if var_type == "deletion": - var_len = int(var_data) - if var_len == length: - if debug: - print cmp, var_id, 1, Links[var_id] - print ref_seq[var_pos - 10:var_pos], ref_seq[var_pos:var_pos+int(var_data)], ref_seq[var_pos+int(var_data):var_pos+int(var_data)+10] - add_count(var_id, 1) - var_idx += 1 - - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - cmp_MD += ("%d" % MD_match_len) - MD_match_len = 0 - cmp_cigar_str += ("%dD" % length) - cmp_MD += ("^%s" % ref_seq[ref_pos:ref_pos+length]) - ref_pos += length - elif type == "soft": - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - read_pos += length - cmp_cigar_str += ("%dS" % length) - else: - assert type == "intron" - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cigar_match_len = 0 - cmp_cigar_str += ("%dN" % length) - ref_pos += length - if cigar_match_len > 0: - cmp_cigar_str += ("%dM" % cigar_match_len) - cmp_MD += ("%d" % MD_match_len) - if read_pos != len(read_seq) or \ - cmp_cigar_str != cigar_str or \ - cmp_MD != MD: - print >> sys.stderr, "Error:", cigar_str, MD - print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD - print >> sys.stderr, "\tcmp list:", cmp_list - assert False - - prev_read_id = read_id - prev_exon = exon - - if num_reads <= 0: - continue - - if prev_read_id != None: - add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read) - - # Coverage - # it is not used by the default - if enable_coverage: - assert num_reads > 0 - read_len = int(total_read_len / float(num_reads)) - coverage_sum = 0 - for i in range(len(coverage)): - if i > 0: - coverage[i] += coverage[i-1] - coverage_sum += coverage[i] - coverage_avg = coverage_sum / float(len(coverage)) - assert len(ref_seq) < len(coverage) - for i in range(len(ref_seq)): - coverage_threshold = 1.0 * coverage_avg - if i < read_len: - coverage_threshold *= ((i+1) / float(read_len)) - elif i + read_len > len(ref_seq): - coverage_threshold *= ((len(ref_seq) - i) / float(read_len)) - if coverage[i] >= coverage_threshold: - continue - pseudo_num_reads = (coverage_threshold - coverage[i]) / read_len - var_idx = lower_bound(Var_list[gene], i + 1) - if var_idx >= len(Var_list[gene]): - var_idx = len(Var_list[gene]) - 1 - cur_cmpt = set() - while var_idx >= 0: - var_pos, var_id = Var_list[gene][var_idx] - var_type, _, var_data = Vars[gene][var_id] - if var_type == "deletion": - del_len = int(var_data) - if i < var_pos: - break - if i + read_len < var_pos + int(var_data): - assert var_id in Links - cur_cmpt = cur_cmpt.union(set(Links[var_id])) - var_idx -= 1 - if cur_cmpt: - cur_cmpt = '-'.join(list(cur_cmpt)) - if not cur_cmpt in HLA_cmpt: - HLA_cmpt[cur_cmpt] = 0 - HLA_cmpt[cur_cmpt] += pseudo_num_reads - else: - assert index_type == "linear" - def add_alleles(alleles): - if not allele in HLA_counts: - HLA_counts[allele] = 1 - else: - HLA_counts[allele] += 1 - - cur_cmpt = sorted(list(alleles)) - cur_cmpt = '-'.join(cur_cmpt) - if not cur_cmpt in HLA_cmpt: - HLA_cmpt[cur_cmpt] = 1 - else: - HLA_cmpt[cur_cmpt] += 1 - - prev_read_id, prev_AS = None, None - alleles = set() - for line in alignview_proc.stdout: - cols = line[:-1].split() - read_id, flag, allele = cols[:3] - flag = int(flag) - if flag & 0x4 != 0: - continue - if not allele.startswith(gene): - continue - if allele.find("BACKBONE") != -1: - continue - - AS = None - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - assert AS != None - if read_id != prev_read_id: - if alleles: - if aligner == "hisat2" or \ - (aligner == "bowtie2" and len(alleles) < 10): - add_alleles(alleles) - alleles = set() - prev_AS = None - if prev_AS != None and AS < prev_AS: - continue - prev_read_id = read_id - prev_AS = AS - alleles.add(allele) - - if alleles: - add_alleles(alleles) - - HLA_counts = [[allele, count] for allele, count in HLA_counts.items()] - def HLA_count_cmp(a, b): - if a[1] != b[1]: - return b[1] - a[1] - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - HLA_counts = sorted(HLA_counts, cmp=HLA_count_cmp) - for count_i in range(len(HLA_counts)): - count = HLA_counts[count_i] - if simulation: - found = False - for test_HLA_name in test_HLA_names: - if count[0] == test_HLA_name: - print >> sys.stderr, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_HLA_name, count[1]) - found = True - """ - if count_i > 0 and HLA_counts[0][1] > count[1]: - print >> sys.stderr, "Warning: %s ranked first (count: %d)" % (HLA_counts[0][0], HLA_counts[0][1]) - assert False - else: - test_passed += 1 - """ - if count_i < 5 and not found: - print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - else: - print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - if count_i >= 9: - break - print >> sys.stderr - - HLA_prob = single_abundance(HLA_cmpt, HLA_lengths[gene]) - - success = [False for i in range(len(test_HLA_names))] - found_list = [False for i in range(len(test_HLA_names))] - for prob_i in range(len(HLA_prob)): - prob = HLA_prob[prob_i] - found = False - if simulation: - for name_i in range(len(test_HLA_names)): - test_HLA_name = test_HLA_names[name_i] - if prob[0] == test_HLA_name: - rank_i = prob_i - while rank_i > 0: - if prob == HLA_prob[rank_i - 1][1]: - rank_i -= 1 - else: - break - print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_HLA_name, prob[1] * 100.0) - if rank_i < len(success): - success[rank_i] = True - found_list[name_i] = True - found = True - if not False in found_list: - break - if not found: - print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, prob[0], prob[1] * 100.0) - if best_alleles and prob_i < 2: - print >> sys.stdout, "SingleModel %s (abundance: %.2f%%)" % (prob[0], prob[1] * 100.0) - if not simulation and prob_i >= 9: - break - print >> sys.stderr - - if len(test_HLA_names) == 2 or not simulation: - HLA_prob = joint_abundance(HLA_cmpt, HLA_lengths[gene]) - if len(HLA_prob) <= 0: - continue - success = [False] - for prob_i in range(len(HLA_prob)): - allele_pair, prob = HLA_prob[prob_i] - allele1, allele2 = allele_pair.split('-') - if best_alleles and prob_i < 1: - print >> sys.stdout, "PairModel %s (abundance: %.2f%%)" % (allele_pair, prob * 100.0) - if simulation: - if allele1 in test_HLA_names and allele2 in test_HLA_names: - rank_i = prob_i - while rank_i > 0: - if HLA_prob[rank_i-1][1] == prob: - rank_i -= 1 - else: - break - print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, allele_pair, prob * 100.0) - if rank_i == 0: - success[0] = True - break - print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, allele_pair, prob * 100.0) - if not simulation and prob_i >= 9: - break - print >> sys.stderr - - # Li's method - """ - li_hla = os.path.join(ex_path, "li_hla/hla") - if os.path.exists(li_hla): - li_hla_cmd = [li_hla, - "hla", - "hla_input.bam", - "-b", "%s*BACKBONE" % gene] - li_hla_proc = subprocess.Popen(li_hla_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - # read in the result of Li's hla - for line in li_hla_proc.stdout: - allele1, allele2, score = line.strip().split() - score = float(score) - if simulation: - if allele1 in test_HLA_names and allele2 in test_HLA_names: - print >> sys.stderr, "\t\t\t*** 1 ranked %s-%s (score: %.2f)" % (allele1, allele2, score) - success[0] = True - else: - print >> sys.stderr, "\t\t\tLiModel fails" - if best_alleles: - print >> sys.stdout, "LiModel %s-%s (score: %.2f)" % (allele1, allele2, score) - li_hla_proc.communicate() - """ - - if simulation and not False in success: - aligner_type = "%s %s" % (aligner, index_type) - if not aligner_type in test_passed: - test_passed[aligner_type] = 1 - else: - test_passed[aligner_type] += 1 - - if simulation: - return test_passed - - -def read_HLA_alleles(fname, HLAs): - for line in open(fname): - if line.startswith(">"): - HLA_name = line.strip().split()[0][1:] - HLA_gene = HLA_name.split('*')[0] - if not HLA_gene in HLAs: - HLAs[HLA_gene] = {} - if not HLA_name in HLAs[HLA_gene]: - HLAs[HLA_gene][HLA_name] = "" - else: - HLAs[HLA_gene][HLA_name] += line.strip() - return HLAs - -""" -""" -def genotyping(base_fname, - reference_type, - hla_list, - partial, - aligners, - read_fname, - alignment_fname, - threads, - simulate_interval, - enable_coverage, - best_alleles, - exclude_allele_list, - default_allele_list, - num_mismatch, - verbose, - daehwan_debug): - # Current script directory - curr_script = os.path.realpath(inspect.getsourcefile(genotyping)) - ex_path = os.path.dirname(curr_script) - - # Clone a git repository, IMGTHLA - if not os.path.exists("IMGTHLA"): - os.system("git clone https://github.com/jrob119/IMGTHLA.git") - - # Clone hisat2 genotype database, hisat_genotype_db - """ - if not os.path.exists("hisat_genotype_db"): - os.system("git clone https://github.com/infphilo/hisat_genotype_db.git") - """ - - simulation = (read_fname == [] and alignment_fname == "") - - def check_files(fnames): - for fname in fnames: - if not os.path.exists(fname): - return False - return True - - # Download HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not check_files(HISAT2_fnames): - os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz") - hisat2_inspect = os.path.join(ex_path, "hisat2-inspect") - os.system("%s grch38/genome > genome.fa" % hisat2_inspect) - os.system("samtools faidx genome.fa") - - # Check if the pre-existing files (hla*) are compatible with the current parameter setting - if os.path.exists("%s.ref" % base_fname): - left = 0 - HLA_genes = set() - BACKBONE = False - for line in open("%s.ref" % base_fname): - HLA_name = line.strip().split()[0] - if HLA_name.find("BACKBONE") != -1: - BACKBONE = True - HLA_gene = HLA_name.split('*')[0] - HLA_genes.add(HLA_gene) - delete_hla_files = False - if reference_type == "gene": - if not BACKBONE: - delete_hla_files = True - elif reference_type in ["chromosome", "genome"]: - if BACKBONE: - delete_hla_files = True - else: - assert False - if not set(hla_list).issubset(HLA_genes): - delete_hla_files = True - if base_fname == "hla": - if delete_hla_files: - os.system("rm %s*" % base_fname) - - # Extract HLA variants, backbone sequence, and other sequeces - HLA_fnames = [base_fname+"_backbone.fa", - base_fname+"_sequences.fa", - base_fname+".ref", - base_fname+".snp", - base_fname+".haplotype", - base_fname+".link", - base_fname+"_alleles_excluded.txt"] - - # Check if excluded alleles in current files match - excluded_alleles_match = False - if(os.path.exists(HLA_fnames[6])): - afile = open(HLA_fnames[6],'r') - afile.readline() - lines = afile.read().split() - excluded_alleles_match = (set(exclude_allele_list) == set(lines)) - afile.close() - elif len(exclude_allele_list) == 0: - excluded_alleles_match = True - try: - temp_name = HLA_fnames[6] - HLA_fnames.remove(HLA_fnames[6]) - os.remove(temp_name) - except OSError: - pass - - if not excluded_alleles_match: - print("Creating Allele Exclusion File.\n") - afile = open(HLA_fnames[6],'w') - afile.write("Alleles excluded:\n") - afile.write("\n".join(exclude_allele_list)) - afile.close() - - if (not check_files(HLA_fnames)) or (not excluded_alleles_match) : - extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py") - extract_cmd = [extract_hla_script, - "--base", base_fname, - "--reference-type", reference_type] - - if base_fname == "hla": - extract_cmd += ["--hla-list", ','.join(hla_list)] - - if len(exclude_allele_list) > 0: - print exclude_allele_list - extract_cmd += ["--exclude-allele-list", ",".join(exclude_allele_list)] - - if len(base_fname) > 3: - extract_cmd += ["--base", base_fname] - - if partial: - extract_cmd += ["--partial"] - extract_cmd += ["--inter-gap", "30", - "--intra-gap", "50"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - - if not check_files(HLA_fnames): - print >> sys.stderr, "Error: extract_HLA_vars failed!" - sys.exit(1) - - for aligner, index_type in aligners: - # Build HISAT2 graph indexes based on the above information - if aligner == "hisat2" and index_type == "graph": - HLA_hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base_fname, i+1) for i in range(8)] - if not check_files(HLA_hisat2_graph_index_fnames) or (not excluded_alleles_match): - hisat2_build = os.path.join(ex_path, "hisat2-build") - build_cmd = [hisat2_build, - "-p", str(threads), - "--snp", HLA_fnames[3], - "--haplotype", HLA_fnames[4] , - HLA_fnames[0], - "%s.graph" % base_fname] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(HLA_hisat2_graph_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed! Perhaps, you may have forgotten to build hisat2 executables?" - sys.exit(1) - - # Build HISAT2 linear indexes based on the above information - elif aligner == "hisat2" and index_type == "linear": - HLA_hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base_fname, i+1) for i in range(8)] - if reference_type == "gene" and (not check_files(HLA_hisat2_linear_index_fnames) or (not excluded_alleles_match)): - hisat2_build = os.path.join(ex_path, "hisat2-build") - build_cmd = [hisat2_build, - "%s,%s"%(HLA_fnames[0],HLA_fnames[1]), - "%s.linear" % base_fname] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(HLA_hisat2_linear_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - - # Build Bowtie2 indexes based on the above information - else: - assert aligner == "bowtie2" and index_type == "linear" - HLA_bowtie2_index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)] - HLA_bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)] - if reference_type == "gene" and (not check_files(HLA_bowtie2_index_fnames) or (not excluded_alleles_match)): - build_cmd = ["bowtie2-build", - "%s,%s"%(HLA_fnames[0],HLA_fnames[1]), - base_fname] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w')) - proc.communicate() - if not check_files(HLA_bowtie2_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - - # Read partial alleles from hla.data (temporary) - partial_alleles = set() - if base_fname == "hla": - for line in open("IMGTHLA/hla.dat"): - if not line.startswith("DE"): - continue - allele_name = line.split()[1][4:-1] - gene = allele_name.split('*')[0] - if line.find("partial") != -1: - partial_alleles.add(allele_name) - - if len(default_allele_list) != 0: - #print os.getcwd() - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - #current_path = os.getcwd() - try: - os.mkdir("./Default-HLA") - except: - pass - #os.chdir(current_path + "/Default-HLA") - - extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py") - extract_cmd = [extract_hla_script, - "--reference-type", reference_type, - "--hla-list", ','.join(hla_list), - "--base", "./Default-HLA/hla"] - - if partial: - extract_cmd += ["--partial"] - extract_cmd += ["--inter-gap", "30", - "--intra-gap", "50"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!" - sys.exit(1) - - # Read HLA alleles (names and sequences) - refHLAs, refHLA_loci = {}, {} - for line in open("%s.ref" % base_fname): - HLA_name, chr, left, right, length, exon_str = line.strip().split() - HLA_gene = HLA_name.split('*')[0] - assert not HLA_gene in refHLAs - refHLAs[HLA_gene] = HLA_name - left, right = int(left), int(right) - exons = [] - for exon in exon_str.split(','): - exon_left, exon_right = exon.split('-') - exons.append([int(exon_left), int(exon_right)]) - refHLA_loci[HLA_gene] = [HLA_name, chr, left, right, exons] - - HLAs = {} - if reference_type == "gene": - read_HLA_alleles(HLA_fnames[0], HLAs) - read_HLA_alleles(HLA_fnames[1], HLAs) - - # HLA gene alleles - HLA_names = {} - for HLA_gene, data in HLAs.items(): - HLA_names[HLA_gene] = list(data.keys()) - - # HLA gene allele lengths - HLA_lengths = {} - for HLA_gene, HLA_alleles in HLAs.items(): - HLA_lengths[HLA_gene] = {} - for allele_name, seq in HLA_alleles.items(): - HLA_lengths[HLA_gene][allele_name] = len(seq) - - # Construct excluded alleles (Via default backbone data) - custom_allele_check = False - if len(default_allele_list) > 0: - custom_allele_check = True - HLAs_default = {} - read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default) - read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default) - - for HLA_gene, HLA_alleles in HLAs_default.items(): - for allele_name, seq in HLA_alleles.items(): - if allele_name in default_allele_list: - HLA_lengths[HLA_gene][allele_name] = len(seq) - - # Read HLA variants, and link information - Vars, Var_list = {}, {} - for line in open("%s.snp" % base_fname): - var_id, var_type, allele, pos, data = line.strip().split('\t') - pos = int(pos) - if reference_type != "gene": - allele, dist = None, 0 - for tmp_gene, values in refHLA_loci.items(): - allele_name, chr, left, right, exons = values - if allele == None: - allele = allele_name - dist = abs(pos - left) - else: - if dist > abs(pos - left): - allele = allele_name - dist = abs(pos - left) - - gene = allele.split('*')[0] - if not gene in Vars: - Vars[gene] = {} - assert not gene in Var_list - Var_list[gene] = [] - - assert not var_id in Vars[gene] - left = 0 - if reference_type != "gene": - _, _, left, _, _ = refHLA_loci[gene] - Vars[gene][var_id] = [var_type, pos - left, data] - Var_list[gene].append([pos - left, var_id]) - - for gene, in_var_list in Var_list.items(): - Var_list[gene] = sorted(in_var_list) - - Links = {} - for line in open("%s.link" % base_fname): - var_id, alleles = line.strip().split('\t') - alleles = alleles.split() - assert not var_id in Links - Links[var_id] = alleles - - # Test HLA typing - test_list = [] - if simulation: - basic_test, pair_test = True, False - if daehwan_debug: - if "basic_test" in daehwan_debug: - basic_test, pair_test = True, False - else: - basic_test, pair_test = False, True - - test_passed = {} - test_list = [] - if base_fname == "hla": - genes = list(set(hla_list) & set(HLA_names.keys())) - else: - genes = HLA_names.keys() - - if basic_test: - for gene in genes: - HLA_gene_alleles = HLA_names[gene] - for HLA_name in HLA_gene_alleles: - if HLA_name.find("BACKBONE") != -1: - continue - test_list.append([[HLA_name]]) - if pair_test: - test_size = 500 - allele_count = 2 - for test_i in range(test_size): - test_pairs = [] - for gene in genes: - HLA_gene_alleles = [] - for allele in HLA_names[gene]: - if allele.find("BACKBONE") != -1: - continue - HLA_gene_alleles.append(allele) - - # DK - temporary - if len(HLA_gene_alleles) < 2: - continue - - nums = [i for i in range(len(HLA_gene_alleles))] - random.shuffle(nums) - test_pairs.append(sorted([HLA_gene_alleles[nums[i]] for i in range(allele_count)])) - test_list.append(test_pairs) - - for test_i in range(len(test_list)): - if "test_id" in daehwan_debug: - daehwan_test_ids = daehwan_debug["test_id"].split('-') - if str(test_i + 1) not in daehwan_test_ids: - continue - - print >> sys.stderr, "Test %d" % (test_i + 1) - test_HLA_list = test_list[test_i] - - # daehwan - for debugging purposes - # test_HLA_list = [["A*11:50Q", "A*11:01:01:01", "A*01:01:01:01"]] - for test_HLA_names in test_HLA_list: - for test_HLA_name in test_HLA_names: - if custom_allele_check: - gene = test_HLA_name.split('*')[0] - test_HLA_seq = HLAs_default[gene][test_HLA_name] - seq_type = "partial" if test_HLA_name in partial_alleles else "full" - print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type) - continue - gene = test_HLA_name.split('*')[0] - test_HLA_seq = HLAs[gene][test_HLA_name] - seq_type = "partial" if test_HLA_name in partial_alleles else "full" - print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type) - if custom_allele_check: - simulate_reads(HLAs_default, test_HLA_list, simulate_interval) - else: - simulate_reads(HLAs, test_HLA_list, simulate_interval) - - if "test_id" in daehwan_debug: - read_fname = ["hla_input_1.fa"] - else: - read_fname = ["hla_input_1.fa", "hla_input_2.fa"] - - fastq = False - - tmp_test_passed = HLA_typing(ex_path, - base_fname, - simulation, - reference_type, - test_HLA_list, - partial, - refHLAs, - HLAs, - HLA_names, - HLA_lengths, - refHLA_loci, - Vars, - Var_list, - Links, - exclude_allele_list, - aligners, - num_mismatch, - fastq, - read_fname, - alignment_fname, - threads, - enable_coverage, - best_alleles, - verbose) - - for aligner_type, passed in tmp_test_passed.items(): - if aligner_type in test_passed: - test_passed[aligner_type] += passed - else: - test_passed[aligner_type] = passed - - print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1))) - - - for aligner_type, passed in test_passed.items(): - print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list)) - - else: # With real reads or BAMs - if base_fname == "hla": - gene_list = hla_list - else: - gene_list = Vars.keys() - print >> sys.stderr, "\t", ' '.join(gene_list) - - fastq = True - HLA_typing(ex_path, - base_fname, - simulation, - reference_type, - gene_list, - partial, - refHLAs, - HLAs, - HLA_names, - HLA_lengths, - refHLA_loci, - Vars, - Var_list, - Links, - exclude_allele_list, - aligners, - num_mismatch, - fastq, - read_fname, - alignment_fname, - threads, - enable_coverage, - best_alleles, - verbose) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='genotyping') - parser.add_argument("--base", - dest="base_fname", - type=str, - default="hla", - help="base filename for backbone HLA sequence, HLA variants, and HLA linking info") - parser.add_argument("--default-list", - dest = "default_allele_list", - type=str, - default="", - help="A comma-separated list of HLA alleles to be tested. Alleles are retrieved from default backbone data (all alleles included in backbone).") - parser.add_argument("--reference-type", - dest="reference_type", - type=str, - default="gene", - help="Reference type: gene, chromosome, and genome (default: gene)") - parser.add_argument("--hla-list", - dest="hla_list", - type=str, - default="A,B,C,DQA1,DQB1,DRB1", - help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)") - parser.add_argument('--partial', - dest='partial', - action='store_true', - help='Include partial alleles (e.g. A_nuc.fasta)') - parser.add_argument("--aligner-list", - dest="aligners", - type=str, - default="hisat2.graph,hisat2.linear,bowtie2.linear", - help="A comma-separated list of aligners (default: hisat2.graph,hisat2.linear,bowtie2.linear)") - parser.add_argument("--reads", - dest="read_fname", - type=str, - default="", - help="Fastq read file name") - parser.add_argument("--alignment", - dest="alignment_fname", - type=str, - default="", - help="BAM file name") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument("--simulate-interval", - dest="simulate_interval", - type=int, - default=1, - help="Reads simulated at every these base pairs (default: 1)") - parser.add_argument("--coverage", - dest="coverage", - action='store_true', - help="Experimental purpose (assign reads based on coverage)") - parser.add_argument("--best-alleles", - dest="best_alleles", - action='store_true', - help="") - parser.add_argument("--exclude-allele-list", - dest="exclude_allele_list", - type=str, - default="", - help="A comma-separated list of alleles to be excluded. Enter a number N to randomly select N alleles for exclusion and N non-excluded alleles for testing (2N tested in total).") - parser.add_argument("--num-mismatch", - dest="num_mismatch", - type=int, - default=0, - help="Maximum number of mismatches per read alignment to be considered (default: 0)") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument("--debug", - dest="debug", - type=str, - default="", - help="e.g., test_id:10,read_id:10000,basic_test") - parser.add_argument("--novel_allele_detection", - dest="novel_allele_detection", - action='store_true', - help="Change test to detection of new alleles. Report sensitivity and specificity rate at the end.") - - - args = parser.parse_args() - if not args.reference_type in ["gene", "chromosome", "genome"]: - print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) - sys.exit(1) - args.hla_list = args.hla_list.split(',') - if args.aligners == "": - print >> sys.stderr, "Error: --aligners must be non-empty." - sys.exit(1) - args.aligners = args.aligners.split(',') - for i in range(len(args.aligners)): - args.aligners[i] = args.aligners[i].split('.') - if args.read_fname: - args.read_fname = args.read_fname.split(',') - else: - args.read_fname = [] - if args.alignment_fname != "" and \ - not os.path.exists(args.alignment_fname): - print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname - sys.exit(1) - - if len(args.default_allele_list) > 0: - args.default_allele_list = args.default_allele_list.split(',') - - if len(args.exclude_allele_list) > 0: - if args.exclude_allele_list.strip().isdigit(): - num_alleles = int(args.exclude_allele_list) - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - try: - os.mkdir("./Default-HLA") - except: - pass - - extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py") - extract_cmd = [extract_hla_script, - "--reference-type", reference_type, - "--hla-list", ','.join(hla_list), - "--base", "./Default-HLA/hla"] - if partial: - extract_cmd += ["--partial"] - extract_cmd += ["--inter-gap", "30", - "--intra-gap", "50"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not os.path.exists("./Default-HLA/hla_backbone.fa"): - print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!" - sys.exit(1) - - HLAs_default = {} - #read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default) - read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default) - - allele_names = list(HLAs_default['A'].keys()) - random.shuffle(allele_names) - args.exclude_allele_list = allele_names[0:num_alleles] - args.default_allele_list = allele_names[num_alleles:2*num_alleles] - - args.default_allele_list = args.default_allele_list + args.exclude_allele_list - else: - args.exclude_allele_list = args.exclude_allele_list.split(',') - - debug = {} - if args.debug != "": - for item in args.debug.split(','): - if ':' in item: - key, value = item.split(':') - debug[key] = value - else: - debug[item] = 1 - - random.seed(1) - genotyping(args.base_fname, - args.reference_type, - args.hla_list, - args.partial, - args.aligners, - args.read_fname, - args.alignment_fname, - args.threads, - args.simulate_interval, - args.coverage, - args.best_alleles, - args.exclude_allele_list, - args.default_allele_list, - args.num_mismatch, - args.verbose, - debug) - - diff --git a/hisatgenotype_locus.py b/hisatgenotype_locus.py deleted file mode 100755 index 4d958058..00000000 --- a/hisatgenotype_locus.py +++ /dev/null @@ -1,2631 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re -import inspect, random -import math -from datetime import datetime, date, time -from argparse import ArgumentParser, FileType -from copy import deepcopy -import hisatgenotype_typing_common as typing_common, hisatgenotype_assembly_graph as assembly_graph - - -""" - var: ['single', 3300, 'G'] - exons: [[301, 373], [504, 822], [1084, 1417], [2019, 2301], [2404, 2520], [2965, 2997], [3140, 3187], [3357, 3361]] -""" -def var_in_exon(var, exons): - exonic = False - var_type, var_left, var_data = var - var_right = var_left - if var_type == "deletion": - var_right = var_left + int(var_data) - 1 - for exon_left, exon_right in exons: - if var_left >= exon_left and var_right <= exon_right: - return True - return False - - -""" -Report variant IDs whose var is within exonic regions -""" -def get_exonic_vars(Vars, exons): - vars = set() - for var_id, var in Vars.items(): - var_type, var_left, var_data = var - var_right = var_left - if var_type == "deletion": - var_right = var_left + int(var_data) - 1 - for exon_left, exon_right in exons: - if var_left >= exon_left and var_right <= exon_right: - vars.add(var_id) - - return vars - - -""" -Get representative alleles among those that share the same exonic sequences -""" -def get_rep_alleles(Links, exon_vars, in_alleles = None): - allele_vars = {} - for var, alleles in Links.items(): - if var not in exon_vars: - continue - for allele in alleles: - if in_alleles != None and allele not in in_alleles: - continue - if allele not in allele_vars: - allele_vars[allele] = set() - allele_vars[allele].add(var) - - allele_groups = {} - for allele, vars in allele_vars.items(): - vars = '-'.join(vars) - if vars not in allele_groups: - allele_groups[vars] = [] - allele_groups[vars].append(allele) - - allele_reps = {} # allele representatives - allele_rep_groups = {} # allele groups by allele representatives - for allele_members in allele_groups.values(): - assert len(allele_members) > 0 - allele_rep = allele_members[0] - allele_rep_groups[allele_rep] = allele_members - for allele_member in allele_members: - assert allele_member not in allele_reps - allele_reps[allele_member] = allele_rep - - return allele_reps, allele_rep_groups - - -""" -""" -def error_correct(ref_seq, - read_seq, - read_pos, - mpileup, - Vars, - Var_list, - cmp_list, - debug = False): - if debug: - print >> sys.stderr, cmp_list - print >> sys.stderr, read_seq - - num_correction = 0 - i = 0 - while i < len(cmp_list): - type, left, length = cmp_list[i][:3] - assert length > 0 - if left >= len(ref_seq): - break - if type == "match": - middle_cmp_list = [] - last_j = 0 - for j in range(length): - if read_pos + j >= len(read_seq) or \ - left + j >= len(ref_seq): - continue - - read_bp, ref_bp = read_seq[read_pos + j], ref_seq[left + j] - assert left + j < len(mpileup) - nt_set = mpileup[left + j][0] - if len(nt_set) > 0 and read_bp not in nt_set: - read_bp = 'N' if len(nt_set) > 1 else nt_set[0] - read_seq = read_seq[:read_pos + j] + read_bp + read_seq[read_pos + j + 1:] - assert read_bp != ref_bp - new_cmp = ["mismatch", left + j, 1, "unknown"] - num_correction += 1 - if read_bp != 'N': - var_idx = typing_common.lower_bound(Var_list, left + j) - while var_idx < len(Var_list): - var_pos, var_id = Var_list[var_idx] - if var_pos > left + j: - break - if var_pos == left + j: - var_type, _, var_data = Vars[var_id] - if var_type == "single" and read_bp == var_data: - new_cmp[3] = var_id - break - var_idx += 1 - if j > last_j: - middle_cmp_list.append(["match", left + last_j, j- last_j]) - middle_cmp_list.append(new_cmp) - last_j = j + 1 - if last_j < length: - middle_cmp_list.append(["match", left + last_j, length - last_j]) - - assert len(middle_cmp_list) > 0 - cmp_list = cmp_list[:i] + middle_cmp_list + cmp_list[i+1:] - i += (len(middle_cmp_list) - 1) - else: - assert type == "mismatch" - read_bp, ref_bp = read_seq[read_pos], ref_seq[left] - assert left < len(mpileup) - nt_set = mpileup[left][0] - - if debug: - print >> sys.stderr, left, read_bp, ref_bp, mpileup[left] - - if len(nt_set) > 0 and read_bp not in nt_set: - read_bp = 'N' if len(nt_set) > 1 else nt_set[0] - read_seq = read_seq[:read_pos] + read_bp + read_seq[read_pos+1:] - if read_bp == 'N': - cmp_list[i][3] = "unknown" - elif read_bp == ref_bp: - cmp_list[i] = ["match", left, 1] - num_correction += 1 - else: - cmp_list[i][3] = "unknown" - var_idx = typing_common.lower_bound(Var_list, left) - while var_idx < len(Var_list): - var_pos, var_id = Var_list[var_idx] - if var_pos > left: - break - if var_pos == left: - var_type, _, var_data = Vars[var_id] - if var_type == "single" and read_bp == var_data: - cmp_list[i][3] = var_id - break - var_idx += 1 - - if debug: - print >> sys.stderr, left, read_bp, ref_bp, mpileup[left] - print >> sys.stderr, cmp_list[i] - - read_pos += length - i += 1 - - # Combine matches - i = 0 - while i < len(cmp_list): - type, left, length = cmp_list[i][:3] - if type == "match" and i + 1 < len(cmp_list): - type2, left2, length2 = cmp_list[i+1][:3] - if type2 == "match": - cmp_list[i] = [type, left, length + length2] - cmp_list = cmp_list[:i+1] + cmp_list[i+2:] - continue - i += 1 - - if debug: - print >> sys.stderr, cmp_list - print >> sys.stderr, read_seq - - return cmp_list, read_seq, num_correction - - -""" -""" -def typing(simulation, - base_fname, - locus_list, - genotype_genome, - partial, - partial_alleles, - refGenes, - Genes, - Gene_names, - Gene_lengths, - refGene_loci, - Vars, - Var_list, - Links, - aligners, - num_editdist, - assembly, - output_base, - error_correction, - keep_alignment, - allow_discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - fastq, - read_fname, - alignment_fname, - num_frag_list, - read_len, - fragment_len, - threads, - best_alleles, - verbose, - assembly_verbose): - if simulation: - test_passed = {} - report_file = open(output_base + ".report", 'w') - for aligner, index_type in aligners: - for f_ in [sys.stderr, report_file]: - if index_type == "graph": - print >> f_, "\n\t\t%s %s" % (aligner, index_type) - else: - print >> f_, "\n\t\t%s %s" % (aligner, index_type) - - remove_alignment_file = False - if alignment_fname == "": - # Align reads, and sort the alignments into a BAM file - remove_alignment_file = True - if simulation: - alignment_fname = "%s_output.bam" % base_fname - else: - alignment_fname = read_fname[0].split('/')[-1] - alignment_fname = "%s.bam" % '.'.join(alignment_fname.split('.')[:2]) - - typing_common.align_reads(aligner, - simulation, - genotype_genome if genotype_genome != "" else (base_fname + "." + index_type), - index_type, - base_fname, - read_fname, - fastq, - threads, - alignment_fname, - verbose) - - for test_Gene_names in locus_list: - if base_fname == "genome": - if simulation: - region_chr, region_left, region_right = test_Gene_names[0] - else: - region_chr, region_left, region_right = test_Gene_names - gene = "%s:%d-%d" % (region_chr, region_left, region_right) - else: - if simulation: - gene = test_Gene_names[0].split('*')[0] - else: - gene = test_Gene_names - - ref_allele = refGenes[gene] - ref_seq = Genes[gene][ref_allele] - ref_locus = refGene_loci[gene] - ref_exons, ref_primary_exons = ref_locus[-2], ref_locus[-1] - novel_var_count = 0 - gene_vars, gene_var_list = deepcopy(Vars[gene]), deepcopy(Var_list[gene]) - cur_maxright = -1 - gene_var_maxrights = {} - for var_pos, var_id in gene_var_list: - var_type, var_pos, var_data = gene_vars[var_id] - if var_type == "deletion": - var_pos = var_pos + int(var_data) - 1 - cur_maxright = max(cur_maxright, var_pos) - gene_var_maxrights[var_id] = cur_maxright - - var_count = {} - def add_novel_var(gene_vars, - gene_var_list, - novel_var_count, - var_type, - var_pos, - var_data): - var_idx = typing_common.lower_bound(gene_var_list, var_pos) - while var_idx < len(gene_var_list): - pos_, id_ = gene_var_list[var_idx] - if pos_ > var_pos: - break - if pos_ == var_pos: - type_, _, data_ = gene_vars[id_] - assert type_ != var_type or data_ != var_data - if type_ != var_type: - if var_type == "insertion": - break - elif var_type == "single" and type_ == "deletion": - break - else: - if var_data < data_: - break - var_idx += 1 - var_id = "nv%d" % novel_var_count - assert var_id not in gene_vars - gene_vars[var_id] = [var_type, var_pos, var_data] - gene_var_list.insert(var_idx, [var_pos, var_id]) - return var_id, novel_var_count + 1 - - if not os.path.exists(alignment_fname + ".bai"): - os.system("samtools index %s" % alignment_fname) - # Read alignments - alignview_cmd = ["samtools", - "view", - alignment_fname] - base_locus = 0 - if genotype_genome != "": - _, chr, left, right = ref_locus[:4] - alignview_cmd += ["%s:%d-%d" % (chr, left+1, right+1)] - base_locus = left - - if index_type == "graph": - alignview_cmd += [ref_allele] - mpileup = typing_common.get_mpileup(alignview_cmd, - ref_seq, - base_locus, - gene_vars, - allow_discordant) - - if base_fname == "codis": - pair_interdist = typing_common.get_pair_interdist(alignview_cmd, - simulation, - verbose) - else: - pair_interdist = None - - bamview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - else: - alignview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - # List of nodes that represent alleles - allele_vars = {} - for _, var_id in gene_var_list: - if var_id not in Links: - continue - allele_list = Links[var_id] - for allele_id in allele_list: - if allele_id not in Genes[gene]: - continue - if allele_id not in allele_vars: - allele_vars[allele_id] = [var_id] - else: - allele_vars[allele_id].append(var_id) - - # Extract variants that are within exons - exon_vars = get_exonic_vars(gene_vars, ref_exons) - primary_exon_vars = get_exonic_vars(gene_vars, ref_primary_exons) - - # Store nodes that represent alleles - allele_nodes = {} - def create_allele_node(allele_name): - if allele_name in allele_nodes: - return allele_nodes[allele_name] - if allele_name in allele_vars: - var_ids = allele_vars[allele_name] - else: - var_ids = [] - seq = list(ref_seq) # sequence that node represents - var = ["" for i in range(len(ref_seq))] # how sequence is related to backbone - for var_id in var_ids: - assert var_id in gene_vars - var_type, var_pos, var_data = gene_vars[var_id] - assert var_pos >= 0 and var_pos < len(ref_seq) - if var_type == "single": - seq[var_pos] = var_data - var[var_pos] = var_id - elif var_type == "deletion": - del_len = int(var_data) - assert var_pos + del_len <= len(ref_seq) - seq[var_pos:var_pos + del_len] = ['D'] * del_len - var[var_pos:var_pos + del_len] = [var_id] * del_len - else: - # DK - to be implemented for insertions - assert var_type == "insertion" - - qual = ' ' * len(seq) - allele_node = assembly_graph.Node(allele_name, - 0, - seq, - qual, - var, - ref_seq, - gene_vars, - mpileup, - simulation) - allele_nodes[allele_name] = allele_node - return allele_node - - true_allele_nodes = {} - if simulation: - for allele_name in test_Gene_names: - true_allele_nodes[allele_name] = create_allele_node(allele_name) - - display_allele_nodes = {} - for display_allele in display_alleles: - display_allele_nodes[display_allele] = create_allele_node(display_allele) - - # Assembly graph - asm_graph = assembly_graph.Graph(ref_seq, - gene_vars, - ref_exons, - ref_primary_exons, - partial_alleles, - true_allele_nodes, - {}, # predicted_allele_nodes, which is empty for now - display_allele_nodes, - simulation) - - # Choose allele representives from those that share the same exonic sequences - allele_reps, allele_rep_groups = get_rep_alleles(Links, exon_vars) - allele_rep_set = set(allele_reps.values()) - - # Choose allele representives from those that share the primary exonic sequences - primary_exon_allele_reps, primary_exon_allele_rep_groups = get_rep_alleles(Links, primary_exon_vars, allele_rep_set) - primary_exon_allele_rep_set = set(primary_exon_allele_reps.values()) - - # Sanity check - for exon_allele in primary_exon_allele_reps.keys(): - # DK - debugging purposes - if exon_allele not in allele_rep_set: - print exon_allele, allele_reps[exon_allele], exon_allele in primary_exon_allele_reps.keys() - - assert exon_allele in allele_rep_set - - # For checking alternative alignments near the ends of alignments - Alts_left, Alts_right = typing_common.get_alternatives(ref_seq, - allele_vars, - gene_vars, - gene_var_list, - verbose >= 2) - - def haplotype_alts_list(haplotype_alts, left = True): - haplotype_list = [] - for haplotype in haplotype_alts.keys(): - if left: - pos = int(haplotype.split('-')[-1]) - else: - pos = int(haplotype.split('-')[0]) - haplotype_list.append([pos, haplotype]) - return sorted(haplotype_list, cmp = lambda a, b: a[0] - b[0]) - - Alts_left_list, Alts_right_list = haplotype_alts_list(Alts_left, True), haplotype_alts_list(Alts_right, False) - - # Count alleles - Gene_primary_exons_counts, Gene_primary_exons_cmpt = {}, {} - Gene_exons_counts, Gene_exons_cmpt = {}, {} - Gene_counts, Gene_cmpt = {}, {} - num_reads, num_pairs = 0, 0 - - # For debugging purposes - debug_allele_names = set(test_Gene_names) if simulation and verbose >= 2 else set() - - # Read information - prev_read_id = None - prev_right_pos = 0 - prev_lines = [] - left_read_ids, right_read_ids = set(), set() - if index_type == "graph": - # nodes for reads - read_nodes = [] - read_vars_list = [] - - # - def add_count(count_per_read, ht, add): - if base_fname == "genome" and len(count_per_read) == 1: - for allele in count_per_read.keys(): - count_per_read[allele] = add - return - - orig_ht = ht - ht = ht.split('-') - - assert len(ht) >= 2 - left, right = int(ht[0]), int(ht[-1]) - assert left <= right - - ht = ht[1:-1] - alleles = set(Genes[gene].keys()) - set([ref_allele]) - for i in range(len(ht)): - var_id = ht[i] - if var_id.startswith("nv") or \ - var_id not in Links: - continue - alleles &= set(Links[var_id]) - ht = set(ht) - - tmp_alleles = set() - var_idx = typing_common.lower_bound(gene_var_list, right + 1) - var_idx = min(var_idx, len(gene_var_list) - 1) - while var_idx >= 0: - _, var_id = gene_var_list[var_idx] - if var_id.startswith("nv") or \ - var_id in ht or \ - var_id not in Links: - var_idx -= 1 - continue - if var_id in gene_var_maxrights and gene_var_maxrights[var_id] < left: - break - var_type, var_left, var_data = gene_vars[var_id] - var_right = var_left - if var_type == "deletion": - var_right = var_left + int(var_data) - 1 - if (var_left >= left and var_left <= right) or \ - (var_right >= left and var_right <= right): - tmp_alleles |= set(Links[var_id]) - var_idx -= 1 - alleles -= tmp_alleles - alleles &= set(count_per_read.keys()) - - for allele in alleles: - count_per_read[allele] += add - - return len(alleles) - - # Identify best pairs - def choose_pairs(left_positive_hts, right_positive_hts): - if len(left_positive_hts) > 0 and \ - len(right_positive_hts) > 0 and \ - max(len(left_positive_hts), len(right_positive_hts)) >= 2: - expected_inter_dist = pair_interdist - """ - if simulation: - expected_inter_dist = fragment_len - read_len * 2 - """ - - best_diff = sys.maxint - picked = [] - for left_ht_str in left_positive_hts: - left_ht = left_ht_str.split('-') - l_left, l_right = int(left_ht[0]), int(left_ht[-1]) - for right_ht_str in right_positive_hts: - right_ht = right_ht_str.split('-') - r_left, r_right = int(right_ht[0]), int(right_ht[-1]) - if l_right < r_right: - inter_dist = r_left - l_right - 1 - else: - inter_dist = l_left - r_right - 1 - - cur_diff = abs(expected_inter_dist - inter_dist) - if best_diff > cur_diff: - best_diff = cur_diff - picked = [[left_ht_str, right_ht_str]] - elif best_diff == cur_diff: - picked.append([left_ht_str, right_ht_str]) - - assert len(picked) > 0 - - left_positive_hts, right_positive_hts = set(), set() - for left_ht_str, right_ht_str in picked: - left_positive_hts.add(left_ht_str) - right_positive_hts.add(right_ht_str) - - return left_positive_hts, right_positive_hts - - def get_exon_haplotypes(ht, exons): - if len(exons) <= 0: - return [] - - debug_ht = deepcopy(ht) - ht = ht.split('-') - assert len(ht) >= 2 - ht[0], ht[-1] = int(ht[0]), int(ht[-1]) - exon_hts = [] - for e_left, e_right in exons: - assert len(ht) >= 2 - ht_left, ht_right = ht[0], ht[-1] - if e_left > ht_right or e_right < ht_left: - continue - - new_ht = deepcopy(ht) - if ht_left < e_left: - split = False - for i in range(1, len(new_ht) - 1): - var_id = new_ht[i] - type, left, data = gene_vars[var_id] - if (type != "deletion" and left >= e_left) or \ - (type == "deletion" and left - 1 >= e_left): - ht_left = e_left - new_ht = [ht_left] + new_ht[i:] - split = True - break - if type == "deletion": - right = left + int(data) - if right >= e_left: - ht_left = right - new_ht = [right] + new_ht[i+1:] - split = True - break - if not split: - ht_left = e_left - new_ht = [ht_left, ht_right] - assert ht_left >= e_left - if ht_right > e_right: - split = False - for i in reversed(range(1, len(new_ht) - 1)): - var_id = new_ht[i] - type, right, data = gene_vars[var_id] - if type == "deletion": - right = right + int(data) - 1 - if (type != "deletion" and right <= e_right) or \ - (type == "deletion" and right + 1 <= e_right): - ht_right = e_right - new_ht = new_ht[:i+1] + [ht_right] - split = True - break - if type == "deletion": - left = right - int(data) - if left <= e_right: - ht_right = left - new_ht = new_ht[:i] + [ht_right] - split = True - break - if not split: - ht_right = e_right - new_ht = [ht_left, ht_right] - - if len(new_ht) == 2: - new_ht = "%d-%d" % (new_ht[0], new_ht[-1]) - else: - assert len(new_ht) > 2 - new_ht = "%d-%s-%d" % (new_ht[0], '-'.join(new_ht[1:-1]), new_ht[-1]) - assert ht_left <= ht_right - exon_hts.append(new_ht) - - return exon_hts - - # Positive evidence for left and right reads - left_positive_hts, right_positive_hts = set(), set() - - # Cigar regular expression - cigar_re = re.compile('\d+\w') - for line in alignview_proc.stdout: - line = line.strip() - cols = line.split() - read_id, flag, chr, pos, mapQ, cigar_str = cols[:6] - node_read_id = orig_read_id = read_id - if simulation: - read_id = read_id.split('|')[0] - read_seq, read_qual = cols[9], cols[10] - flag, pos = int(flag), int(pos) - pos -= (base_locus + 1) - if pos < 0: - continue - - # Unalined? - if flag & 0x4 != 0: - if simulation and verbose >= 2: - print "Unaligned" - print "\t", line - continue - - # Concordantly mapped? - if flag & 0x2 != 0: - concordant = True - else: - concordant = False - - NM, Zs, MD, NH = "", "", "", "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("Zs"): - Zs = col[5:] - elif col.startswith("MD"): - MD = col[5:] - elif col.startswith("NM"): - NM = int(col[5:]) - elif col.startswith("NH"): - NH = int(col[5:]) - - if NM > num_editdist: - continue - - # Only consider unique alignment - if NH > 1: - continue - - # Concordantly aligned mate pairs - if not allow_discordant and not concordant: - continue - - # Left read? - is_left_read = flag & 0x40 != 0 - if is_left_read: - if read_id in left_read_ids: - continue - left_read_ids.add(read_id) - if not simulation: - node_read_id += '|L' - else: # Right read? - assert flag & 0x80 != 0 - if read_id in right_read_ids: - continue - right_read_ids.add(read_id) - if not simulation: - node_read_id += '|R' - - if Zs: - Zs_str = Zs - Zs = Zs.split(',') - - assert MD != "" - MD_str_pos, MD_len = 0, 0 - Zs_pos, Zs_i = 0, 0 - for _i in range(len(Zs)): - Zs[_i] = Zs[_i].split('|') - Zs[_i][0] = int(Zs[_i][0]) - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - read_pos, left_pos = 0, pos - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - cmp_list = [] - num_error_correction = 0 - likely_misalignment = False - - # Extract variants w.r.t backbone from CIGAR string - softclip = [0, 0] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op == 'M': - first = True - MD_len_used = 0 - cmp_list_i = len(cmp_list) - while True: - if not first or MD_len == 0: - if MD[MD_str_pos].isdigit(): - num = int(MD[MD_str_pos]) - MD_str_pos += 1 - while MD_str_pos < len(MD): - if MD[MD_str_pos].isdigit(): - num = num * 10 + int(MD[MD_str_pos]) - MD_str_pos += 1 - else: - break - MD_len += num - # Insertion or full match followed - if MD_len >= length: - MD_len -= length - if length > MD_len_used: - cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used]) - break - first = False - read_base = read_seq[read_pos + MD_len] - MD_ref_base = MD[MD_str_pos] - MD_str_pos += 1 - assert MD_ref_base in "ACGT" - if MD_len > MD_len_used: - cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used]) - - _var_id = "unknown" - if read_pos + MD_len == Zs_pos and Zs_i < len(Zs): - assert Zs[Zs_i][1] == 'S' - _var_id = Zs[Zs_i][2] - Zs_i += 1 - Zs_pos += 1 - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - else: - # Search for a known (yet not indexed) variant or a novel variant - ref_pos = right_pos + MD_len - var_idx = typing_common.lower_bound(gene_var_list, ref_pos) - while var_idx < len(gene_var_list): - var_pos, var_id = gene_var_list[var_idx] - if var_pos > ref_pos: - break - if var_pos == ref_pos: - var_type, _, var_data = gene_vars[var_id] - if var_type == "single" and var_data == read_base: - _var_id = var_id - break - var_idx += 1 - - cmp_list.append(["mismatch", right_pos + MD_len, 1, _var_id]) - MD_len_used = MD_len + 1 - MD_len += 1 - # Full match - if MD_len == length: - MD_len = 0 - break - - # Correction for sequencing errors and update for cmp_list - if error_correction: - assert cmp_list_i < len(cmp_list) - new_cmp_list, read_seq, _num_error_correction = error_correct(ref_seq, - read_seq, - read_pos, - mpileup, - gene_vars, - gene_var_list, - cmp_list[cmp_list_i:], - node_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583|L") - cmp_list = cmp_list[:cmp_list_i] + new_cmp_list - num_error_correction += _num_error_correction - - elif cigar_op == 'I': - _var_id = "unknown" - if read_pos == Zs_pos and Zs_i < len(Zs): - assert Zs[Zs_i][1] == 'I' - _var_id = Zs[Zs_i][2] - Zs_i += 1 - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - else: - # Search for a known (yet not indexed) variant or a novel variant - var_idx = typing_common.lower_bound(gene_var_list, right_pos) - while var_idx < len(gene_var_list): - var_pos, var_id = gene_var_list[var_idx] - if var_pos > right_pos: - break - if var_pos == right_pos: - var_type, _, var_data = gene_vars[var_id] - if var_type == "insertion" and len(var_data) == length: - _var_id = var_id - break - var_idx += 1 - cmp_list.append(["insertion", right_pos, length, _var_id]) - if 'N' in read_seq[read_pos:read_pos+length]: - likely_misalignment = True - - elif cigar_op == 'D': - if MD[MD_str_pos] == '0': - MD_str_pos += 1 - assert MD[MD_str_pos] == '^' - MD_str_pos += 1 - while MD_str_pos < len(MD): - if not MD[MD_str_pos] in "ACGT": - break - MD_str_pos += 1 - _var_id = "unknown" - if read_pos == Zs_pos and \ - Zs_i < len(Zs) and \ - Zs[Zs_i][1] == 'D': - _var_id = Zs[Zs_i][2] - Zs_i += 1 - if Zs_i < len(Zs): - Zs_pos += Zs[Zs_i][0] - else: - # Search for a known (yet not indexed) variant or a novel variant - var_idx = typing_common.lower_bound(gene_var_list, right_pos) - while var_idx < len(gene_var_list): - var_pos, var_id = gene_var_list[var_idx] - if var_pos > right_pos: - break - if var_pos == right_pos: - var_type, _, var_data = gene_vars[var_id] - if var_type == "deletion" and int(var_data) == length: - _var_id = var_id - break - var_idx += 1 - - cmp_list.append(["deletion", right_pos, length, _var_id]) - - # Check if this deletion is artificial alignment - if right_pos < len(mpileup): - del_count, nt_count = 0, 0 - for nt, value in mpileup[right_pos][1].items(): - count = value[0] - if nt == 'D': - del_count += count - else: - nt_count += count - - # DK - debugging purposes - if base_fname == "hla": - if del_count * 6 < nt_count: # and nt_count >= 15: - likely_misalignment = True - - elif cigar_op == 'S': - if i == 0: - softclip[0] = length - Zs_pos += length - else: - assert i + 1 == len(cigars) - softclip[1] = length - else: - assert cigar_op == 'N' - assert False - cmp_list.append(["intron", right_pos, length]) - - if cigar_op in "MND": - right_pos += length - - if cigar_op in "MIS": - read_pos += length - - # Remove softclip in cigar and modify read_seq and read_qual accordingly - if sum(softclip) > 0: - if softclip[0] > 0: - cigars = cigars[1:] - read_seq = read_seq[softclip[0]:] - read_qual = read_qual[softclip[0]:] - if softclip[1] > 0: - cigars = cigars[:-1] - read_seq = read_seq[:-softclip[1]] - read_qual = read_qual[:-softclip[1]] - - cigar_str = "" - for type, length in cigars: - cigar_str += str(length) - cigar_str += type - - if sum(softclip) > 0: - continue - - if right_pos > len(ref_seq): - continue - - if num_error_correction > max(1, num_editdist): - continue - - if likely_misalignment: - continue - - # Add novel variants - read_pos = 0 - for cmp_i in range(len(cmp_list)): - type_, pos_, length_ = cmp_list[cmp_i][:3] - if type_ != "match": - var_id_ = cmp_list[cmp_i][3] - if var_id_ == "unknown": - add = True - if type_ == "mismatch": - data_ = read_seq[read_pos] - if data_ == 'N': - add = False - elif type_ == "deletion": - data_ = str(length_) - else: - assert type_ == "insertion" - data_ = read_seq[read_pos:read_pos + length_] - if add: - var_id_, novel_var_count = add_novel_var(gene_vars, - gene_var_list, - novel_var_count, - type_ if type_ != "mismatch" else "single", - pos_, - data_) - cmp_list[cmp_i][3] = var_id_ - if var_id_ != "unknown": - if var_id_ not in var_count: - var_count[var_id_] = 1 - else: - var_count[var_id_] += 1 - - if type_ != "deletion": - read_pos += length_ - - # Count the number of reads aligned uniquely with some constraints - num_reads += 1 - - def add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, include_alleles = set()): - if len(Gene_count_per_read) <= 0: - return "" - max_count = max(Gene_count_per_read.values()) - cur_cmpt = set() - for allele, count in Gene_count_per_read.items(): - if count < max_count: - continue - if len(include_alleles) > 0 and allele not in include_alleles: - continue - - cur_cmpt.add(allele) - if allele not in Gene_counts: - Gene_counts[allele] = 1 - else: - Gene_counts[allele] += 1 - - if len(cur_cmpt) == 0: - return "" - - if verbose >= 2: - alleles = ["", ""] - allele1_found, allele2_found = False, False - if alleles[0] != "": - for allele, count in Gene_count_per_read.items(): - if count < max_count: - continue - if allele == alleles[0]: - allele1_found = True - elif allele == alleles[1]: - allele2_found = True - if allele1_found != allele2_found: - print >> sys.stderr, alleles[0], Gene_count_per_read[alleles[0]] - print >> sys.stderr, alleles[1], Gene_count_per_read[alleles[1]] - if allele1_found: - print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, Gene_count_per_read[alleles[1]])) - else: - print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, Gene_count_per_read[alleles[0]])) - - cur_cmpt = sorted(list(cur_cmpt)) - cur_cmpt = '-'.join(cur_cmpt) - if not cur_cmpt in Gene_cmpt: - Gene_cmpt[cur_cmpt] = 1 - else: - Gene_cmpt[cur_cmpt] += 1 - - return cur_cmpt - - if read_id != prev_read_id: - if prev_read_id != None: - num_pairs += 1 - # DK - needs more test - # Several alleles go over 100 bps - """ - if base_fname == "codis" and gene == "D18S51": - left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts) - """ - - for positive_ht in left_positive_hts | right_positive_hts: - primary_exon_hts = get_exon_haplotypes(positive_ht, ref_primary_exons) - for exon_ht in primary_exon_hts: - add_count(Gene_primary_exons_count_per_read, exon_ht, 1) - exon_hts = get_exon_haplotypes(positive_ht, ref_exons) - for exon_ht in exon_hts: - add_count(Gene_exons_count_per_read, exon_ht, 1) - add_count(Gene_count_per_read, positive_ht, 1) - - # DK - debugging purposes - if prev_read_id.startswith("NS500497:33:HY32TBGXX:3:13511:0:56517876") and False: - print prev_read_id, left_positive_hts, right_positive_hts - max_count = max(Gene_primary_exons_count_per_read.values()) - for allele, count in Gene_primary_exons_count_per_read.items(): - if allele not in primary_exon_allele_rep_set: - continue - if count < max_count: - continue - print allele, count - - # DK - debugging purposes - """ - debug_allele_id = "TH01*10" - assert debug_allele_id in Gene_gen_count_per_read - debug_max_read_count = max(Gene_gen_count_per_read.values()) - debug_read_count = Gene_gen_count_per_read[debug_allele_id] - if debug_read_count < debug_max_read_count: - print prev_read_id, debug_read_count, debug_max_read_count, Gene_gen_count_per_read - print "\t", left_positive_hts, right_positive_hts - None - if prev_read_id == "HSQ1008:175:C0JVFACXX:5:1109:17665:21583": - for line in prev_lines: - print line - print "left_positive_hts :", left_positive_hts - print "right_positive_hts:", right_positive_hts - print "exon:", debug_read_count, "max:", debug_max_read_count - print "gen:", Gene_gen_count_per_read[debug_allele_id], "max:", max(Gene_gen_count_per_read.values()) - - for allele_id, count in Gene_count_per_read.items(): - if count == debug_max_read_count: - None - # print "allele max:", allele_id, count - # sys.exit(1) - None - """ - - cur_cmpt, cur_cmpt_gen = "", "" - if base_fname == "hla": - cur_primary_exons_cmpt = add_stat(Gene_primary_exons_cmpt, Gene_primary_exons_counts, Gene_primary_exons_count_per_read, primary_exon_allele_rep_set) - - # DK - debugging purposes - # for cmpt, count in Gene_primary_exons_count_per_read.items(): - if cur_primary_exons_cmpt.find("A*24:145") != -1 and cur_primary_exons_cmpt.find("A*24:02:01") == -1: - print prev_read_id - print cur_primary_exons_cmpt - - - - cur_exons_cmpt = add_stat(Gene_exons_cmpt, Gene_exons_counts, Gene_exons_count_per_read, allele_rep_set) - cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read) - else: - cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read) - for read_id_, read_id_i, read_node in read_nodes: - asm_graph.add_node(read_id_, - read_id_i, - read_node, - simulation) - read_nodes, read_var_list = [], [] - if simulation and \ - verbose >= 2 and \ - base_fname in ["hla", "codis"]: - cur_cmpt = cur_cmpt.split('-') if cur_cmpt != "" else set() - cur_cmpt_gen = cur_cmpt_gen.split('-') if cur_cmpt_gen != "" else set() - show_debug = (partial and cur_cmpt != "" and not set(cur_cmpt) & set(test_Gene_names)) or \ - (not partial and cur_cmpt_gen != "" and not set(cur_cmpt_gen) & set(test_Gene_names)) - - if show_debug: - print "%s are chosen instead of %s" % (cur_cmpt if partial else cur_cmpt_gen, '-'.join(test_Gene_names)) - for prev_line in prev_lines: - print "\t", prev_line - - prev_lines = [] - - left_positive_hts, right_positive_hts = set(), set() - Gene_primary_exons_count_per_read, Gene_exons_count_per_read, Gene_count_per_read = {}, {}, {} - for allele in Gene_names[gene]: - if allele.find("BACKBONE") != -1: - continue - if base_fname == "genome" and allele.find("GRCh38") != -1: - continue - if allele in primary_exon_allele_rep_set: - Gene_primary_exons_count_per_read[allele] = 0 - if allele in allele_rep_set: - Gene_exons_count_per_read[allele] = 0 - Gene_count_per_read[allele] = 0 - - prev_lines.append(line) - - # Remove mismatches due to unknown or novel variants - cmp_list2 = [] - for cmp in cmp_list: - cmp = deepcopy(cmp) - type, pos, length = cmp[:3] - if type == "match": - if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match": - cmp_list2[-1][2] += length - else: - cmp_list2.append(cmp) - elif type == "mismatch" and \ - (cmp[3] == "unknown" or cmp[3].startswith("nv")): - if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match": - cmp_list2[-1][2] += 1 - else: - cmp_list2.append(["match", pos, 1]) - else: - cmp_list2.append(cmp) - - cmp_list_left, cmp_list_right, cmp_left_alts, cmp_right_alts = \ - typing_common.identify_ambigious_diffs(ref_seq, - gene_vars, - Alts_left, - Alts_right, - Alts_left_list, - Alts_right_list, - cmp_list2, - verbose, - orig_read_id.startswith("HSQ1009:126:D0UUYACXX:4:2212:9787:80992#")) # debug? - - mid_ht = [] - for cmp in cmp_list2[cmp_list_left:cmp_list_right+1]: - type = cmp[0] - if type not in ["mismatch", "deletion", "insertion"]: - continue - var_id = cmp[3] - mid_ht.append(var_id) - - for l in range(len(cmp_left_alts)): - left_ht = cmp_left_alts[l].split('-') - left_ht += mid_ht - for r in range(len(cmp_right_alts)): - right_ht = cmp_right_alts[r].split('-') - ht = left_ht + right_ht - if len(ht) <= 0: - continue - ht_str = '-'.join(ht) - if is_left_read: - left_positive_hts.add(ht_str) - else: - right_positive_hts.add(ht_str) - - # DK - debugging purposes - DK_debug = False - if orig_read_id.startswith("30|R!"): - DK_debug = True - print line - print cmp_list - print "positive hts:", left_positive_hts, right_positive_hts - print "cmp_list [%d, %d]" % (cmp_list_left, cmp_list_right) - - if assembly: - # Construct multiple candidate realignments for CODIS - cmp_llist = [] - hts = left_positive_hts if is_left_read else right_positive_hts - assert len(hts) > 0 - for ht in hts: - cmp_list = [] - read_pos = 0 - vars_ = ht.split('-') - left_ = int(vars_[0]) - vars_ = vars_[1:] - for var_i in range(len(vars_)): - var_id = vars_[var_i] - # ref_seq, read_seq - if var_i == len(vars_) - 1: - right_ = int(var_id) - else: - var_type, var_pos, var_data = gene_vars[var_id] - right_ = var_pos - 1 - - for pos in range(left_, right_ + 1): - if read_seq[read_pos] != ref_seq[pos]: - if left_ < pos: - cmp_list.append(["match", left_, pos - left_]) - cmp_list.append(["mismatch", pos, 1, "unknown"]) - left_ = pos + 1 - read_pos += 1 - if left_ <= right_: - cmp_list.append(["match", left_, right_ - left_ + 1]) - - if var_i == len(vars_) - 1: - left_ = right_ + 1 - break - - if var_type == "single": - cmp_list.append(["mismatch", var_pos, 1, var_id]) - left_ = var_pos + 1 - read_pos += 1 - elif var_type == "deletion": - del_len = int(var_data) - cmp_list.append(["deletion", var_pos, del_len, var_id]) - left_ = var_pos + del_len - else: - assert var_type == "insertion" - cmp_list.append(["insertion", var_pos, len(var_data), var_id]) - left_ = var_pos - read_pos += len(var_data) - - assert len(cmp_list) > 0 - cmp_llist.append(cmp_list) - - for cmp_list_i in range(len(cmp_llist)): - # Node - cmp_list = cmp_llist[cmp_list_i] - read_node_pos, read_node_seq, read_node_qual, read_node_var = -1, [], [], [] - read_vars = [] - ref_pos, read_pos = cmp_list[0][1], 0 - cmp_i = 0 - while cmp_i < len(cmp_list): - cmp = cmp_list[cmp_i] - type, length = cmp[0], cmp[2] - if type in ["match", "mismatch"]: - if read_node_pos < 0: - read_node_pos = ref_pos - if type == "match": - read_node_seq += list(read_seq[read_pos:read_pos+length]) - read_node_qual += list(read_qual[read_pos:read_pos+length]) - read_node_var += ([''] * length) - read_pos += length - elif type == "mismatch": - var_id = cmp[3] - read_base, qual = read_seq[read_pos], read_qual[read_pos] - read_node_seq += [read_base] - read_node_qual += [qual] - read_node_var.append(var_id) - read_pos += 1 - elif type == "deletion": - var_id = cmp[3] - del_len = length - read_node_seq += (['D'] * del_len) - read_node_qual += ([''] * del_len) - if len(read_node_seq) > len(read_node_var): - assert len(read_node_seq) == len(read_node_var) + del_len - read_node_var += ([var_id] * del_len) - elif type == "insertion": - var_id = cmp[3] - ins_len = length - ins_seq = read_seq[read_pos:read_pos+ins_len] - read_node_seq += ["I%s" % nt for nt in ins_seq] - read_node_qual += list(read_qual[read_pos:read_pos+ins_len]) - read_node_var += ([var_id] * ins_len) - read_pos += length - else: - assert type == "intron" - cmp_i += 1 - - read_nodes.append([node_read_id, - cmp_list_i, - assembly_graph.Node(node_read_id, - read_node_pos, - read_node_seq, - read_node_qual, - read_node_var, - ref_seq, - gene_vars, - mpileup, - simulation)]) - - prev_read_id = read_id - prev_right_pos = right_pos - - if prev_read_id != None: - num_pairs += 1 - if base_fname == "codis" and gene == "D18S51": - left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts) - for positive_ht in left_positive_hts | right_positive_hts: - primary_exon_hts = get_exon_haplotypes(positive_ht, ref_primary_exons) - for exon_ht in primary_exon_hts: - add_count(Gene_primary_exons_count_per_read, exon_ht, 1) - exon_hts = get_exon_haplotypes(positive_ht, ref_exons) - for exon_ht in exon_hts: - add_count(Gene_exons_count_per_read, exon_ht, 1) - add_count(Gene_count_per_read, positive_ht, 1) - - if base_fname == "hla": - add_stat(Gene_primary_exons_cmpt, Gene_primary_exons_counts, Gene_primary_exons_count_per_read, primary_exon_allele_rep_set) - add_stat(Gene_exons_cmpt, Gene_exons_counts, Gene_exons_count_per_read, allele_rep_set) - add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read) - for read_id_, read_id_i, read_node in read_nodes: - asm_graph.add_node(read_id_, - read_id_i, - read_node, - simulation) - read_nodes, read_var_list = [], [] - - if num_reads <= 0: - continue - - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t%d reads and %d pairs are aligned" % (num_reads, num_pairs) - - else: - assert index_type == "linear" - def add_alleles(alleles): - if not allele in Gene_counts: - Gene_counts[allele] = 1 - else: - Gene_counts[allele] += 1 - - cur_cmpt = sorted(list(alleles)) - cur_cmpt = '-'.join(cur_cmpt) - if not cur_cmpt in Gene_cmpt: - Gene_cmpt[cur_cmpt] = 1 - else: - Gene_cmpt[cur_cmpt] += 1 - - prev_read_id, prev_AS = None, None - alleles = set() - for line in alignview_proc.stdout: - cols = line[:-1].split() - read_id, flag, allele = cols[:3] - flag = int(flag) - if flag & 0x4 != 0: - continue - if not allele.startswith(gene): - continue - if allele.find("BACKBONE") != -1: - continue - - AS = None - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("AS"): - AS = int(col[5:]) - assert AS != None - if read_id != prev_read_id: - if alleles: - if aligner == "hisat2" or \ - (aligner == "bowtie2" and len(alleles) < 10): - add_alleles(alleles) - alleles = set() - prev_AS = None - if prev_AS != None and AS < prev_AS: - continue - prev_read_id = read_id - prev_AS = AS - alleles.add(allele) - - if alleles: - add_alleles(alleles) - - Gene_counts = [[allele, count] for allele, count in Gene_counts.items()] - def Gene_count_cmp(a, b): - if a[1] != b[1]: - return b[1] - a[1] - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - Gene_counts = sorted(Gene_counts, cmp=Gene_count_cmp) - for count_i in range(len(Gene_counts)): - count = Gene_counts[count_i] - if simulation: - found = False - for test_Gene_name in test_Gene_names: - if count[0] == test_Gene_name: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_Gene_name, count[1]) - found = True - if count_i < 5 and not found: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - else: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1]) - if count_i >= 9: - break - for f_ in [sys.stderr, report_file]: - print >> f_ - - # Calculate the abundance of representative alleles on exonic sequences - if base_fname == "hla": - perform_typing_primary_exon = False - # Incorporate representive alleles for primary exons (experimental feature) - if perform_typing_primary_exon: - Gene_prob = primary_exon_prob = typing_common.single_abundance(Gene_primary_exons_cmpt) - primary_exon_alleles = set() - primary_exon_prob_sum = 0.0 - for prob_i in range(len(primary_exon_prob)): - allele, prob = primary_exon_prob[prob_i][:2] - if len(primary_exon_allele_rep_groups[allele]) <= 1: - continue - primary_exon_prob_sum += prob - primary_exon_alleles |= set(primary_exon_allele_rep_groups[allele]) - - # Incorporate representative alleles for exons - if len(primary_exon_alleles) > 0: - Gene_exons_cmpt2 = {} - for cmpt, value in Gene_exons_cmpt.items(): - cmpt2 = [] - for allele in cmpt.split('-'): - if allele in primary_exon_alleles: - cmpt2.append(allele) - if len(cmpt2) == 0: - continue - cmpt2 = '-'.join(cmpt2) - if cmpt2 not in Gene_exons_cmpt2: - Gene_exons_cmpt2[cmpt2] = value - else: - Gene_exons_cmpt2[cmpt2] += value - exon_prob = typing_common.single_abundance(Gene_exons_cmpt2, - remove_low_abundance_alleles) - exon_prob2 = {} - for allele, prob in primary_exon_prob: - if allele not in primary_exon_alleles: - exon_prob2[allele] = prob - for allele, prob in exon_prob: - exon_prob2[allele] = prob * primary_exon_prob_sum - exon_prob = [[allele, prob] for allele, prob in exon_prob2.items()] - Gene_prob = exon_prob = sorted(exon_prob, cmp=typing_common.Gene_prob_cmp) - else: - # Incorporate representative alleles for exons - Gene_prob = exon_prob = typing_common.single_abundance(Gene_exons_cmpt, - remove_low_abundance_alleles) - - exon_alleles = set() - exon_prob_sum = 0.0 - for prob_i in range(len(exon_prob)): - allele, prob = exon_prob[prob_i][:2] - if prob_i >= 10 and prob < 0.03: - break - if len(allele_rep_groups[allele]) <= 1: - continue - - exon_prob_sum += prob - exon_alleles |= set(allele_rep_groups[allele]) - - # Incorporate full-length alleles, non-representative alleles - if len(exon_alleles) > 0: - Gene_cmpt2 = {} - for cmpt, value in Gene_cmpt.items(): - cmpt2 = [] - for allele in cmpt.split('-'): - if allele in exon_alleles: - cmpt2.append(allele) - if len(cmpt2) == 0: - continue - cmpt2 = '-'.join(cmpt2) - if cmpt2 not in Gene_cmpt2: - Gene_cmpt2[cmpt2] = value - else: - Gene_cmpt2[cmpt2] += value - Gene_cmpt = Gene_cmpt2 - Gene_prob = typing_common.single_abundance(Gene_cmpt, - True, - Gene_lengths[gene]) - - Gene_combined_prob = {} - for allele, prob in exon_prob: - if allele not in exon_alleles: - Gene_combined_prob[allele] = prob - - for allele, prob in Gene_prob: - Gene_combined_prob[allele] = prob * exon_prob_sum - - Gene_prob = [[allele, prob] for allele, prob in Gene_combined_prob.items()] - Gene_prob = sorted(Gene_prob, cmp=typing_common.Gene_prob_cmp) - else: - if len(Gene_cmpt.keys()) <= 1: - Gene_prob = [] - if len(Gene_cmpt.keys()) == 1: - Gene_prob = [[Gene_cmpt.keys()[0], 1.0]] - else: - Gene_prob = typing_common.single_abundance(Gene_cmpt) - - if index_type == "graph" and assembly: - allele_node_order = [] - predicted_allele_nodes = {} - for allele_name, prob in Gene_prob: - if prob < 0.1: # abundance of 10% - break - predicted_allele_nodes[allele_name] = create_allele_node(allele_name) - allele_node_order.append([allele_name, prob]) - if len(predicted_allele_nodes) >= 2: - break - asm_graph.predicted_allele_nodes = predicted_allele_nodes - asm_graph.allele_node_order = allele_node_order - asm_graph.calculate_coverage() - - # Start drawing assembly graph - asm_graph.begin_draw("%s.%s.%s" % (output_base, base_fname, gene)) - - # Draw assembly graph - begin_y = asm_graph.draw(0, "a. Read alignment") - begin_y += 200 - - # Apply De Bruijn graph - asm_graph.guided_DeBruijn(assembly_verbose) - - # Draw assembly graph - begin_y = asm_graph.draw(begin_y, "b. Asssembly") - begin_y += 200 - - # Draw assembly graph - asm_graph.nodes = asm_graph.nodes2 - asm_graph.to_node, asm_graph.from_node = {}, {} - begin_y = asm_graph.draw(begin_y, "c. Assembly with known alleles") - - # End drawing assembly graph - asm_graph.end_draw() - - # Compare two alleles - if simulation and len(test_Gene_names) == 2: - allele_name1, allele_name2 = test_Gene_names - print >> sys.stderr, allele_name1, "vs.", allele_name2 - asm_graph.print_node_comparison(asm_graph.true_allele_nodes) - - def compare_alleles(vars1, vars2, print_output = True): - skip = True - var_i, var_j = 0, 0 - exon_i = 0 - allele_seq, mismatches = list(ref_seq), 0 - while var_i < len(vars1) and var_j < len(vars2): - cmp_var_id, node_var_id = vars1[var_i], vars2[var_j] - cmp_var, node_var = gene_vars[cmp_var_id], gene_vars[node_var_id] - - min_pos = min(cmp_var[1], node_var[1]) - cmp_var_in_exon, node_var_in_exon = False, False - while exon_i < len(ref_exons): - exon_left, exon_right = ref_exons[exon_i] - if min_pos <= exon_right: - if cmp_var[1] >= exon_left and cmp_var[1] <= exon_right: - cmp_var_in_exon = True - else: - cmp_var_in_exon = False - if node_var[1] >= exon_left and node_var[1] <= exon_right: - node_var_in_exon = True - else: - node_var_in_exon = False - break - exon_i += 1 - - if cmp_var_id == node_var_id: - skip = False - if print_output: - if cmp_var_in_exon: - print >> sys.stderr, "\033[94mexon%d\033[00m" % (exon_i + 1), - print >> sys.stderr, cmp_var_id, cmp_var, "\t\t\t", mpileup[cmp_var[1]] - var_i += 1; var_j += 1 - - var_type, var_pos, var_data = cmp_var - if var_type == "single": - allele_seq[var_pos] = var_data - elif var_type == "deletion": - allele_seq[var_pos:var_pos+int(var_data)] = '.' * int(var_data) - else: - assert var_type == "insertion" - continue - if cmp_var[1] <= node_var[1]: - if not skip: - if (var_i > 0 and var_i + 1 < len(vars1)) or cmp_var[0] != "deletion": - if print_output: - if cmp_var_in_exon: - for f_ in [sys.stderr, report_file]: - print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1), - for f_ in [sys.stderr, report_file]: - print >> f_, "***", cmp_var_id, cmp_var, "==", "\t\t\t", mpileup[cmp_var[1]] - mismatches += 1 - var_i += 1 - else: - if print_output: - if node_var_in_exon: - for f_ in [sys.stderr, report_file]: - print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1), - for f_ in [sys.stderr, report_file]: - print >> f_, "*** ==", node_var_id, node_var, "\t\t\t", mpileup[node_var[1]] - mismatches += 1 - var_j += 1 - - allele_exons = ref_exons[:] - allele_seq = ''.join(allele_seq) - del_counts = [] - for del_i in range(len(allele_seq)): - del_count = 0 if del_i == 0 else del_counts[-1] - if allele_seq[del_i] == '.': - del_count += 1 - del_counts.append(del_count) - for exon_i in range(len(allele_exons)): - exon_left, exon_right = allele_exons[exon_i] - exon_left -= del_counts[exon_left] - exon_right -= del_counts[exon_right] - allele_exons[exon_i] = [exon_left, exon_right] - - allele_seq = allele_seq.replace('.', '') - return allele_seq, allele_exons, mismatches - - tmp_nodes = asm_graph.nodes - print >> sys.stderr, "Number of tmp nodes:", len(tmp_nodes) - count = 0 - for id, node in tmp_nodes.items(): - count += 1 - if count > 10: - break - node_vars = node.get_var_ids() - node.print_info(); print >> sys.stderr - if node.id in asm_graph.to_node: - for id2, at in asm_graph.to_node[node.id]: - print >> sys.stderr, "\tat %d ==> %s" % (at, id2) - - if simulation: - cmp_Gene_names = test_Gene_names - else: - cmp_Gene_names = [allele_name for allele_name, _ in allele_node_order] - - alleles, cmp_vars, max_common = [], [], -sys.maxint - for cmp_Gene_name in cmp_Gene_names: - tmp_vars = allele_nodes[cmp_Gene_name].get_var_ids(node.left, node.right) - tmp_common = len(set(node_vars) & set(tmp_vars)) - tmp_common -= len(set(node_vars) | set(tmp_vars)) - if max_common < tmp_common: - max_common = tmp_common - alleles = [[cmp_Gene_name, tmp_vars]] - elif max_common == tmp_common: - alleles.append([cmp_Gene_name, tmp_vars]) - - for allele_name, cmp_vars in alleles: - for f_ in [sys.stderr, report_file]: - print >> f_, "vs.", allele_name - allele_seq, allele_exons, allele_mm = compare_alleles(cmp_vars, node_vars) - print >> f_, "\t\tallele sequence (%d bps):" % len(allele_seq), allele_seq - print >> f_, "\t\texons (zero-based offset):", allele_exons - - print >> sys.stderr - print >> sys.stderr - - - # Identify alleles that perfectly or closesly match assembled alleles - for node_name, node in asm_graph.nodes.items(): - vars = set(node.get_var_ids()) - - max_allele_names, max_common = [], -sys.maxint - for allele_name, vars2 in allele_vars.items(): - vars2 = set(vars2) - tmp_common = len(vars & vars2) - len(vars | vars2) - if tmp_common > max_common: - max_common = tmp_common - max_allele_names = [allele_name] - elif tmp_common == max_common: - max_allele_names.append(allele_name) - - for f_ in [sys.stderr, report_file]: - print >> f_, "Genomic:", node_name - node_vars = node.get_var_ids() - min_mismatches = sys.maxint - for max_allele_name in max_allele_names: - cmp_vars = allele_vars[max_allele_name] - cmp_vars = sorted(cmp_vars, cmp=lambda a, b: int(a[2:]) - int(b[2:])) - print_output = False - _, _, tmp_mismatches = compare_alleles(cmp_vars, node_vars, print_output) - print >> f_, "\t\t%s:" % max_allele_name, max_common, tmp_mismatches - if tmp_mismatches < min_mismatches: - min_mismatches = tmp_mismatches - if min_mismatches > 0: - print >> f_, "Novel allele" - else: - print >> f_, "Known allele" - - """ - allele_exon_vars = {} - for allele_name, vars in allele_vars.items(): - allele_exon_vars[allele_name] = set(vars) & exon_vars - - for node_name, node in asm_graph.nodes.items(): - vars = [] - for left, right in ref_exons: - vars += node.get_var_ids(left, right) - vars = set(vars) & exon_vars - - max_allele_names, max_common = [], -sys.maxint - for allele_name, vars2 in allele_exon_vars.items(): - tmp_common = len(vars & vars2) - len(vars | vars2) - if tmp_common > max_common: - max_common = tmp_common - max_allele_names = [allele_name] - elif tmp_common == max_common: - max_allele_names.append(allele_name) - - for f_ in [sys.stderr, report_file]: - print >> f_, "Exonic:", node_name - for max_allele_name in max_allele_names: - print >> f_, "\t\t%s:" % max_allele_name, max_common - """ - - if simulation: - success = [False for i in range(len(test_Gene_names))] - found_list = [False for i in range(len(test_Gene_names))] - for prob_i in range(len(Gene_prob)): - prob = Gene_prob[prob_i] - if prob[1] < 0.01: - break - found = False - _allele_rep = prob[0] - """ - if partial and exonic_only: - _fields = _allele_rep.split(':') - if len(_fields) == 4: - _allele_rep = ':'.join(_fields[:-1]) - """ - if simulation: - for name_i in range(len(test_Gene_names)): - test_Gene_name = test_Gene_names[name_i] - if prob[0] == test_Gene_name: - rank_i = prob_i - while rank_i > 0: - if prob == Gene_prob[rank_i - 1][1]: - rank_i -= 1 - else: - break - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_Gene_name, prob[1] * 100.0) - if rank_i < len(success): - success[rank_i] = True - found_list[name_i] = True - found = True - # DK - for debugging purposes - if not False in found_list and prob_i >= 10: - break - if not found: - for f_ in [sys.stderr, report_file]: - print >> f_, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, _allele_rep, prob[1] * 100.0) - - if best_alleles and prob_i < 2: - for f_ in [sys.stderr, report_file]: - print >> f_, "SingleModel %s (abundance: %.2f%%)" % (_allele_rep, prob[1] * 100.0) - - # DK - debugging purposes - """ - # ref_allele_node_ = create_allele_node("A*03:01:01:01") - ref_allele_node_ = create_allele_node("DQA1*01:02:01:01") - cmp_node_ = create_allele_node(_allele_rep) - count_ = 0 - for i_ in range(len(ref_allele_node_.seq)): - if assembly_graph.get_major_nt(ref_allele_node_.seq[i_]) != assembly_graph.get_major_nt(cmp_node_.seq[i_]): - count_ += 1 - print "\t\t\t\t\tDK:", count_, len(ref_allele_node_.seq) - vars1, vars2 = allele_vars["DQA1*01:02:01:01"], allele_vars[_allele_rep] - print "\t\t\t\t\tDK:", set(vars1) - set(vars2), set(vars2) - set(vars1) - """ - - if not simulation and prob_i >= 9: - break - if prob_i >= 19: - break - print >> sys.stderr - - if simulation and not False in success: - aligner_type = "%s %s" % (aligner, index_type) - if not aligner_type in test_passed: - test_passed[aligner_type] = 1 - else: - test_passed[aligner_type] += 1 - - if not keep_alignment and remove_alignment_file: - os.system("rm %s*" % (alignment_fname)) - - report_file.close() - if simulation: - return test_passed - - -""" -""" -def read_backbone_alleles(genotype_genome, refGene_loci, Genes): - for gene_name in refGene_loci: - allele_name, chr, left, right = refGene_loci[gene_name][:4] - seq_extract_cmd = ["samtools", - "faidx", - "%s.fa" % genotype_genome, - "%s:%d-%d" % (chr, left+1, right+1)] - - length = right - left + 1 - proc = subprocess.Popen(seq_extract_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) - seq = "" - for line in proc.stdout: - line = line.strip() - if line.startswith('>'): - continue - seq += line - assert len(seq) == length - assert gene_name not in Genes - Genes[gene_name] = {} - Genes[gene_name][allele_name] = seq - - -""" -""" -def read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes): - for gene_name in Genes: - # Assert there is only one allele per gene, which is a backbone allele - assert len(Genes[gene_name]) == 1 - backbone_allele_name, backbone_seq = Genes[gene_name].items()[0] - gene_vars, gene_var_list = Vars[gene_name], Var_list[gene_name] - allele_vars = {} - for _, var_id in gene_var_list: - if var_id not in Links: - continue - for allele_name in Links[var_id]: - if allele_name not in allele_vars: - allele_vars[allele_name] = [] - allele_vars[allele_name].append(var_id) - - for allele_name, vars in allele_vars.items(): - seq = "" - prev_pos = 0 - for var_id in vars: - type, pos, data = gene_vars[var_id] - assert prev_pos <= pos - if pos > prev_pos: - seq += backbone_seq[prev_pos:pos] - if type == "single": - prev_pos = pos + 1 - seq += data - elif type == "deletion": - prev_pos = pos + int(data) - else: - assert type == "insertion" - seq += data - prev_pos = pos - if prev_pos < len(backbone_seq): - seq += backbone_seq[prev_pos:] - Genes[gene_name][allele_name] = seq - - if len(Genes[gene_name]) <= 1: - Genes[gene_name]["%s*GRCh38" % gene_name] = backbone_seq - - -""" -""" -def read_Gene_alleles(fname, Genes): - for line in open(fname): - if line.startswith(">"): - allele_name = line.strip().split()[0][1:] - gene_name = allele_name.split('*')[0] - if not gene_name in Genes: - Genes[gene_name] = {} - if not allele_name in Genes[gene_name]: - Genes[gene_name][allele_name] = "" - else: - Genes[gene_name][allele_name] += line.strip() - return Genes - - -""" -""" -def read_Gene_vars(fname): - Vars, Var_list = {}, {} - for line in open(fname): - var_id, var_type, allele, pos, data = line.strip().split('\t') - pos = int(pos) - gene = allele.split('*')[0] - if not gene in Vars: - Vars[gene] = {} - assert not gene in Var_list - Var_list[gene] = [] - - assert not var_id in Vars[gene] - Vars[gene][var_id] = [var_type, pos, data] - Var_list[gene].append([pos, var_id]) - - for gene, in_var_list in Var_list.items(): - Var_list[gene] = sorted(in_var_list) - - return Vars, Var_list - - -""" -""" -def read_Gene_vars_genotype_genome(fname, refGene_loci): - loci = {} - for gene, values in refGene_loci.items(): - allele_name, chr, left, right = values[:4] - if chr not in loci: - loci[chr] = [] - loci[chr].append([allele_name, left, right]) - - Vars, Var_list = {}, {} - for line in open(fname): - var_id, var_type, var_chr, pos, data = line.strip().split('\t') - if var_chr not in loci: - continue - pos = int(pos) - found = False - for allele_name, left, right in loci[var_chr]: - if pos >= left and pos <= right: - found = True - break - if not found: - continue - - gene = allele_name.split('*')[0] - if not gene in Vars: - Vars[gene] = {} - assert not gene in Var_list - Var_list[gene] = [] - - assert not var_id in Vars[gene] - Vars[gene][var_id] = [var_type, pos - left, data] - Var_list[gene].append([pos - left, var_id]) - - for gene, in_var_list in Var_list.items(): - Var_list[gene] = sorted(in_var_list) - - return Vars, Var_list - - -""" -""" -def read_Gene_links(fname): - Links = {} - for line in open(fname): - var_id, alleles = line.strip().split('\t') - alleles = alleles.split() - assert not var_id in Links - Links[var_id] = alleles - - return Links - - -""" -""" -def genotyping_locus(base_fname, - locus_list, - genotype_genome, - only_locus_list, - partial, - aligners, - read_fname, - fastq, - alignment_fname, - threads, - simulate_interval, - read_len, - fragment_len, - best_alleles, - num_editdist, - perbase_errorrate, - perbase_snprate, - skip_fragment_regions, - assembly, - output_base, - error_correction, - keep_alignment, - discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - verbose, - assembly_verbose, - debug_instr): - simulation = (read_fname == [] and alignment_fname == "") - if genotype_genome == "": - if not os.path.exists("hisatgenotype_db"): - typing_common.clone_hisatgenotype_database() - - # Download human genome and HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index() - - # Check if the pre-existing files (hla*) are compatible with the current parameter setting - if genotype_genome != "": - if os.path.exists("%s.locus" % base_fname): - left = 0 - Gene_genes = [] - BACKBONE = False - for line in open("%s.locus" % base_fname): - Gene_name = line.strip().split()[0] - if Gene_name.find("BACKBONE") != -1: - BACKBONE = True - Gene_gene = Gene_name.split('*')[0] - Gene_genes.append(Gene_gene) - delete_hla_files = False - if not BACKBONE: - delete_hla_files = True - if len(locus_list) == 0: - locus_list = Gene_genes - if not set(locus_list).issubset(set(Gene_genes)): - delete_hla_files = True - if delete_hla_files: - os.system("rm %s*" % base_fname) - - # Extract variants, backbone sequence, and other sequeces - if genotype_genome != "": - genome_fnames = [genotype_genome + ".fa", - genotype_genome + ".fa.fai", - genotype_genome + ".locus", - genotype_genome + ".snp", - genotype_genome + ".index.snp", - genotype_genome + ".haplotype", - genotype_genome + ".link", - genotype_genome + ".clnsig", - genotype_genome + ".coord", - genotype_genome + ".allele", - genotype_genome + ".partial"] - for i in range(8): - genome_fnames.append(genotype_genome + ".%d.ht2" % (i+1)) - - if not typing_common.check_files(genome_fnames): - print >> sys.stderr, "Error: some of the following files are not available:", ' '.join(genome_fnames) - sys.exit(1) - else: - typing_common.extract_database_if_not_exists(base_fname, - only_locus_list, - 30, # inter_gap - 50, # intra_gap - partial, - verbose >= 1) - for aligner, index_type in aligners: - typing_common.build_index_if_not_exists(base_fname, - aligner, - index_type, - threads, - verbose >= 1) - - # Read alleles - alleles = set() - if genotype_genome != "": - for line in open("%s.allele" % genotype_genome): - family, allele_name = line.strip().split('\t') - if family == base_fname: - alleles.add(allele_name) - else: - for line in open("%s.allele" % base_fname): - alleles.add(line.strip()) - - # Read partial alleles - partial_alleles = set() - if genotype_genome != "": - for line in open("%s.partial" % genotype_genome): - family, allele_name = line.strip().split('\t') - if family == base_fname: - partial_alleles.add(allele_name) - - else: - for line in open("%s.partial" % base_fname): - partial_alleles.add(line.strip()) - - # Read alleles (names and sequences) - refGenes, refGene_loci = {}, {} - if base_fname == "genome": - for chr, left, right in locus_list: - region_name = "%s:%d-%d" % (chr, left, right) - refGenes[region_name] = region_name - refGene_loci[region_name] = [region_name, chr, left, right, []] - else: - for line in open("%s.locus" % (genotype_genome if genotype_genome != "" else base_fname)): - fields = line.strip().split() - if genotype_genome != "" and base_fname != fields[0].lower(): - continue - if genotype_genome != "": - _, Gene_name, chr, left, right, exon_str, strand = fields - else: - Gene_name, chr, left, right, _, exon_str, strand = fields - Gene_gene = Gene_name.split('*')[0] - assert not Gene_gene in refGenes - refGenes[Gene_gene] = Gene_name - left, right = int(left), int(right) - exons, primary_exons = [], [] - for exon in exon_str.split(','): - primary = exon.endswith('p') - if primary: - exon = exon[:-1] - exon_left, exon_right = exon.split('-') - exon_left, exon_right = int(exon_left), int(exon_right) - exons.append([exon_left, exon_right]) - if primary: - primary_exons.append([exon_left, exon_right]) - refGene_loci[Gene_gene] = [Gene_name, chr, left, right, exons, primary_exons] - Genes = {} - if len(locus_list) == 0: - locus_list = refGene_loci.keys() - - # Read variants, and link information - if genotype_genome: - Vars, Var_list = read_Gene_vars_genotype_genome("%s.snp" % genotype_genome, refGene_loci) - Links = read_Gene_links("%s.link" % genotype_genome) - else: - Vars, Var_list = read_Gene_vars("%s.snp" % base_fname) - Links = read_Gene_links("%s.link" % base_fname) - - # Some loci may have only one allele such as AMELX and AMELY - for gene_name in refGene_loci.keys(): - if gene_name in Vars: - continue - Vars[gene_name], Var_list[gene_name], Links[gene_name] = {}, [], {} - - # Read allele sequences - if genotype_genome != "": - read_backbone_alleles(genotype_genome, refGene_loci, Genes) - read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes) - else: - read_Gene_alleles(base_fname + "_backbone.fa", Genes) - read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes) - - # alleles corresponding to backbones - for allele in alleles: - locus = allele.split('*')[0] - assert locus in Genes - if allele not in Genes[locus]: - Genes[locus][allele] = Genes[locus]["%s*BACKBONE" % locus] - - # Sanity Check - if os.path.exists(base_fname + "_backbone.fa") and \ - os.path.exists(base_fname + "_sequences.fa"): - Genes2 = {} - read_Gene_alleles(base_fname + "_backbone.fa", Genes2) - read_Gene_alleles(base_fname + "_sequences.fa", Genes2) - for gene_name, alleles in Genes.items(): - assert gene_name in Genes2 - for allele_name, allele_seq in alleles.items(): - assert allele_name in Genes2[gene_name] - allele_seq2 = Genes2[gene_name][allele_name] - assert allele_seq == allele_seq2 - - # alleles names - Gene_names = {} - for Gene_gene, data in Genes.items(): - Gene_names[Gene_gene] = list(data.keys()) - - # allele lengths - Gene_lengths = {} - for Gene_gene, Gene_alleles in Genes.items(): - Gene_lengths[Gene_gene] = {} - for allele_name, seq in Gene_alleles.items(): - Gene_lengths[Gene_gene][allele_name] = len(seq) - - # Test typing - test_list = [] - if simulation: - basic_test, pair_test = True, False - if debug_instr and "pair" in debug_instr: - basic_test, pair_test = False, True - - test_passed = {} - test_list = [] - genes = list(set(locus_list) & set(Gene_names.keys())) - if basic_test: - for gene in genes: - Gene_gene_alleles = Gene_names[gene] - for allele in Gene_gene_alleles: - if allele.find("BACKBONE") != -1: - continue - test_list.append([[allele]]) - random.shuffle(test_list) - if pair_test: - test_size = 200 - allele_count = 2 - for test_i in range(test_size): - test_pairs = [] - for gene in genes: - Gene_gene_alleles = [] - - for allele in Gene_names[gene]: - if allele.find("BACKBONE") != -1: - continue - - if "full" in debug: - if allele in partial_alleles: - continue - - Gene_gene_alleles.append(allele) - nums = [i for i in range(len(Gene_gene_alleles))] - random.shuffle(nums) - test_pairs.append(sorted([Gene_gene_alleles[nums[i]] for i in range(allele_count)])) - test_list.append(test_pairs) - - if "test_list" in debug_instr: - test_list = [[debug_instr["test_list"].split('-')]] - - for test_i in range(len(test_list)): - if "test_id" in debug_instr: - test_ids = debug_instr["test_id"].split('-') - if str(test_i + 1) not in test_ids: - continue - - print >> sys.stderr, "Test %d" % (test_i + 1), str(datetime.now()) - test_locus_list = test_list[test_i] - num_frag_list = typing_common.simulate_reads(Genes, - base_fname, - test_locus_list, - Vars, - Links, - simulate_interval, - read_len, - fragment_len, - perbase_errorrate, - perbase_snprate, - skip_fragment_regions) - - assert len(num_frag_list) == len(test_locus_list) - for i_ in range(len(test_locus_list)): - test_Gene_names = test_locus_list[i_] - num_frag_list_i = num_frag_list[i_] - assert len(num_frag_list_i) == len(test_Gene_names) - for j_ in range(len(test_Gene_names)): - test_Gene_name = test_Gene_names[j_] - gene = test_Gene_name.split('*')[0] - test_Gene_seq = Genes[gene][test_Gene_name] - seq_type = "partial" if test_Gene_name in partial_alleles else "full" - print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_Gene_name, len(test_Gene_seq), seq_type, num_frag_list_i[j_]) - - if "single-end" in debug_instr: - read_fname = ["%s_input_1.fa" % base_fname] - else: - read_fname = ["%s_input_1.fa" % base_fname, "%s_input_2.fa" % base_fname] - - fastq = False - tmp_test_passed = typing(simulation, - base_fname, - test_locus_list, - genotype_genome, - partial, - partial_alleles, - refGenes, - Genes, - Gene_names, - Gene_lengths, - refGene_loci, - Vars, - Var_list, - Links, - aligners, - num_editdist, - assembly, - output_base, - error_correction, - keep_alignment, - discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - fastq, - read_fname, - alignment_fname, - num_frag_list, - read_len, - fragment_len, - threads, - best_alleles, - verbose, - assembly_verbose) - - for aligner_type, passed in tmp_test_passed.items(): - if aligner_type in test_passed: - test_passed[aligner_type] += passed - else: - test_passed[aligner_type] = passed - - print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1))) - - - for aligner_type, passed in test_passed.items(): - print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list)) - - else: # With real reads or BAMs - if base_fname == "genome": - print >> sys.stderr, "\t", locus_list - else: - print >> sys.stderr, "\t", ' '.join(locus_list) - typing(simulation, - base_fname, - locus_list, - genotype_genome, - partial, - partial_alleles, - refGenes, - Genes, - Gene_names, - Gene_lengths, - refGene_loci, - Vars, - Var_list, - Links, - aligners, - num_editdist, - assembly, - output_base, - error_correction, - keep_alignment, - discordant, - type_primary_exons, - remove_low_abundance_alleles, - display_alleles, - fastq, - read_fname, - alignment_fname, - [], - read_len, - fragment_len, - threads, - best_alleles, - verbose, - assembly_verbose) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='hisatgenotype_locus') - parser.add_argument("--base", "--base-fname", - dest="base_fname", - type=str, - default="hla", - help="base filename for backbone sequence, variants, and linking info (default: hla)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="A comma-separated list of genes (default: empty, all genes)") - parser.add_argument("--genotype-genome", - dest="genotype_genome", - type=str, - default="", - help="Base name for genotype genome, which the program will use instead of region-based small indexes (default: empty)") - parser.add_argument("-f", "--fasta", - dest='fastq', - action='store_false', - help='FASTA format') - parser.add_argument("-U", - dest="read_fname_U", - type=str, - default="", - help="filename for single-end reads") - parser.add_argument("-1", - dest="read_fname_1", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("-2", - dest="read_fname_2", - type=str, - default="", - help="filename for paired-end reads") - parser.add_argument("--alignment", - dest="alignment_fname", - type=str, - default="", - help="BAM file name") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument('--no-partial', - dest='partial', - action='store_false', - help='Include partial alleles (e.g. A_nuc.fasta)') - parser.add_argument("--aligner-list", - dest="aligners", - type=str, - default="hisat2.graph", - help="A comma-separated list of aligners such as hisat2.graph,hisat2.linear,bowtie2.linear (default: hisat2.graph)") - parser.add_argument("--simulate-interval", - dest="simulate_interval", - type=int, - default=10, - help="Reads simulated at every these base pairs (default: 10)") - parser.add_argument("--read-len", - dest="read_len", - type=int, - default=100, - help="Length of simulated reads (default: 100)") - parser.add_argument("--fragment-len", - dest="fragment_len", - type=int, - default=350, - help="Length of fragments (default: 350)") - parser.add_argument("--best-alleles", - dest="best_alleles", - action='store_true', - help="") - parser.add_argument("--random-seed", - dest="random_seed", - type=int, - default=1, - help="A seeding number for randomness (default: 1)") - parser.add_argument("--num-editdist", - dest="num_editdist", - type=int, - default=2, - help="Maximum number of mismatches per read alignment to be considered (default: 2)") - parser.add_argument("--perbase-errorrate", - dest="perbase_errorrate", - type=float, - default=0.0, - help="Per basepair error rate in percentage when simulating reads (default: 0.0)") - parser.add_argument("--perbase-snprate", - dest="perbase_snprate", - type=float, - default=0.0, - help="Per basepair SNP rate in percentage when simulating reads (default: 0.0)") - parser.add_argument("--skip-fragment-regions", - dest="skip_fragment_regions", - type=str, - default="", - help="A comma-separated list of regions from which no reads originate, e.g., 500-600,1200-1400 (default: None).") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument('--verbose-level', - dest='verbose_level', - type=int, - default=0, - help='also print some statistics to stderr (default: 0)') - parser.add_argument("--debug", - dest="debug", - type=str, - default="", - help="e.g., test_id:10,read_id:10000,basic_test") - parser.add_argument("--output-base", "--assembly-base", - dest="output_base", - type=str, - default="assembly_graph", - help="base file name (default: assembly_graph)") - parser.add_argument("--assembly", - dest="assembly", - action="store_true", - help="Perform assembly") - parser.add_argument("--no-error-correction", - dest="error_correction", - action="store_false", - help="Correct sequencing errors") - parser.add_argument("--keep-alignment", - dest="keep_alignment", - action="store_true", - help="Keep alignment file") - parser.add_argument("--only-locus-list", - dest="only_locus_list", - type=str, - default="", - help="A comma-separated list of genes (default: empty, all genes)") - parser.add_argument("--discordant", - dest="discordant", - action="store_true", - help="Allow discordantly mapped pairs or singletons") - parser.add_argument("--type-primary-exons", - dest="type_primary_exons", - action="store_true", - help="Look at primary exons first") - parser.add_argument("--keep-low-abundance-alleles", - dest="remove_low_abundance_alleles", - action="store_false", - help="Do not remove alleles with low abundance while performing typing") - parser.add_argument("--assembly-verbose", - dest="assembly_verbose", - action="store_true", - help="Output intermediate assembly information") - parser.add_argument("--display-alleles", - dest="display_alleles", - type=str, - default="", - help="A comma-separated list of alleles to display in HTML (default: empty)") - - args = parser.parse_args() - if args.locus_list == "": - locus_list = [] - else: - locus_list = args.locus_list.split(',') - if args.base_fname == "genome": - assert ':' in args.locus_list - for i in range(len(locus_list)): - assert ':' in locus_list[i] and '-' in locus_list[i] - chr, coord = locus_list[i].split(':') - left, right = coord.split('-') - locus_list[i] = [chr, int(left), int(right)] - - if args.only_locus_list == "": - only_locus_list = [] - else: - locus_list = only_locus_list = args.only_locus_list.split(',') - if args.aligners == "": - print >> sys.stderr, "Error: --aligners must be non-empty." - sys.exit(1) - args.aligners = args.aligners.split(',') - for i in range(len(args.aligners)): - args.aligners[i] = args.aligners[i].split('.') - if args.read_fname_U != "": - args.read_fname = [args.read_fname_U] - elif args.read_fname_1 != "" or args.read_fname_2 != "": - if args.read_fname_1 == "" or args.read_fname_2 == "": - print >> sys.stderr, "Error: please specify both -1 and -2." - sys.exit(1) - args.read_fname = [args.read_fname_1, args.read_fname_2] - else: - args.read_fname = [] - if args.alignment_fname != "" and \ - not os.path.exists(args.alignment_fname): - print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname - sys.exit(1) - - if args.verbose and args.verbose_level == 0: - args.verbose_level = 1 - - debug = {} - if args.debug != "": - for item in args.debug.split(','): - if ':' in item: - fields = item.split(':') - assert len(fields) >= 2 - key, value = fields[0], ':'.join(fields[1:]) - debug[key] = value - else: - debug[item] = 1 - - if not args.partial: - print >> sys.stderr, "Warning: --no-partial should be used for debugging purpose only." - - if args.read_len * 2 > args.fragment_len: - print >> sys.stderr, "Warning: fragment might be too short (%d)" % (args.fragment_len) - - skip_fragment_regions = [] - if args.skip_fragment_regions != "": - prev_left, prev_right = -1, -1 - for region in args.skip_fragment_regions.split(','): - left, right = region.split('-') - left, right = int(left), int(right) - assert left < right - assert prev_right < left - prev_left, prev_right = left, right - skip_fragment_regions.append([left, right]) - - if args.display_alleles == "": - display_alleles = [] - else: - display_alleles = args.display_alleles.split(',') - - random.seed(args.random_seed) - genotyping_locus(args.base_fname, - locus_list, - args.genotype_genome, - only_locus_list, - args.partial, - args.aligners, - args.read_fname, - args.fastq, - args.alignment_fname, - args.threads, - args.simulate_interval, - args.read_len, - args.fragment_len, - args.best_alleles, - args.num_editdist, - args.perbase_errorrate, - args.perbase_snprate, - skip_fragment_regions, - args.assembly, - args.output_base, - args.error_correction, - args.keep_alignment, - args.discordant, - args.type_primary_exons, - args.remove_low_abundance_alleles, - display_alleles, - args.verbose_level, - args.assembly_verbose, - debug) - diff --git a/hisatgenotype_modules/__init__.py b/hisatgenotype_modules/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/hisatgenotype_modules/hisatgenotype_assembly_graph.py b/hisatgenotype_modules/hisatgenotype_assembly_graph.py deleted file mode 100755 index 16794f40..00000000 --- a/hisatgenotype_modules/hisatgenotype_assembly_graph.py +++ /dev/null @@ -1,1902 +0,0 @@ -#!/usr/bin/env python - -import sys -import math, random -from datetime import datetime, date, time -from collections import deque -from copy import deepcopy - - -# -def get_major_nt(nt_dic): - nt = '' - max_count = 0 - for tmp_nt, tmp_value in nt_dic.items(): - tmp_count, tmp_var_id = tmp_value - if len(tmp_nt) == 1: - assert tmp_nt in "ACGTDN" - else: - assert len(tmp_nt) == 2 and tmp_nt[0] == 'I' and tmp_nt[1] in "ACGT" - if tmp_count > max_count: - max_count = tmp_count - nt = tmp_nt - if len(nt) == 1: - assert nt in "ACGTDN" - else: - assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT" - return nt - - -# -def match_score(nt_dic1, nt_dic2): - sum_1 = sum([count for count, _ in nt_dic1.values()]) - sum_2 = sum([count for count, _ in nt_dic2.values()]) - total1, total2 = sum_1 * 2.0, sum_2 * 2.0 - best = 0.0 - for nt in "ACGT": - if nt not in nt_dic1 or nt not in nt_dic2: - continue - tmp_best = nt_dic1[nt][0] / total1 + nt_dic2[nt][0] / total2 - if tmp_best > best: - best = tmp_best - return best - - -# -def get_ungapped_seq(seq): - ungapped_seq = [] - for i in range(len(seq)): - nt_dic = seq[i] - nt = get_major_nt(nt_dic) - if nt == 'D': - continue - ungapped_seq.append(nt_dic) - return ungapped_seq - - -# -def get_ungapped_seq_pos(seq, pos): - tot_del_len, tot_ins_len = 0, 0 - for i in range(len(seq)): - nt_dic = seq[i] - nt = get_major_nt(nt_dic) - if nt == 'D': - tot_del_len += 1 - elif nt[0] == 'I': - tot_ins_len += 1 - if i - tot_ins_len == pos: - return pos - tot_del_len - return -1 - - -# Get mate node id -# HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|L to HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|R or vice versa -def get_mate_node_id(node_id): - node_id2, end = node_id.split('|') - if end == 'L': - end = 'R' - else: - end = 'L' - node_id2 = '|'.join([node_id2, end]) - return node_id2 - - - -class Node: - # Initialize - def __init__(self, - id, - left, - seq, - qual, - var, - ref_seq, - ref_vars, - mpileup, - simulation): - self.next = [] # list of next nodes - - if simulation: - id = id.split('_')[0] - self.id = id # Node ID - self.left = left # starting position - - # sequence that node represents - # with information about how the sequence is related to backbone - assert len(seq) == len(var) - assert len(seq) == len(qual) - self.seq = [] - self.ins_len = 0 - for s in range(len(seq)): - nt = seq[s] - if len(nt) == 1: - assert nt in "ACGTDN" - else: - assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT" - self.ins_len += 1 - var_id = var[s] - self.seq.append({nt : [1, var_id]}) - self.qual = [] - for q in qual: - if q != '': - self.qual.append(max(0, ord(q) / 10 - 3)) - else: - self.qual.append(0) - - self.right = self.left + len(seq) - 1 - self.ins_len - - self.read_ids = set([id]) - self.mate_ids = set([id.split('|')[0]]) - - self.calculate_avg_cov() - - self.ref_seq = ref_seq - self.ref_vars = ref_vars - - self.mpileup = mpileup - - - # Check how compatible allele is in regard to read or pair - def compatible_with_rnode(self, rnode): - assert False - assert rnode.left + len(rnode.seq) <= len(self.seq) - score = 0 - for i in range(len(rnode.seq)): - allele_bp = self.seq[rnode.left + i] - read_bp = rnode.seq[i] - if allele_bp == read_bp: - score += 1 - - return float(score) / len(rnode.seq) - - - # Check how nodes overlap with each other without considering deletions - def overlap_with(self, other, vars, skipN = False, debug = False): - assert self.left <= other.left - if self.right < other.left: - return -1, -1 - seq = get_ungapped_seq(self.seq) - other_seq = get_ungapped_seq(other.seq) - add_mm = len(self.mate_ids & other.mate_ids) - i_left = get_ungapped_seq_pos(self.seq, other.left - self.left) - for i in range(i_left - 5, i_left + 6): - max_mm = 0.012 * (len(seq) - i) # 1 mismatch per 83 bases - tmp_mm = 0.0 - for j in range(len(other_seq)): - if i + j >= len(seq): - break - nt_dic, other_nt_dic = seq[i+j], other_seq[j] - nt, other_nt = get_major_nt(nt_dic), get_major_nt(other_nt_dic) - mismatch = 0.0 - if skipN and (nt == 'N' or other_nt == 'N'): - mismatch = 0.0 - elif nt != other_nt: - mismatch = 1.0 - match_score(seq[i+j], other_seq[j]) - - # Higher penalty for mismatches in variants - nt_var, other_nt_var = nt_dic[nt][1], other_nt_dic[other_nt][1] - if nt_var != other_nt_var: - mismatch = 5.0 - adjust = min(1.0, nt_dic[nt][0] / self.get_avg_cov()) * \ - min(1.0, other_nt_dic[other_nt][0] / other.get_avg_cov()) - mismatch *= adjust - if mismatch < 1.0: - mismatch = 1.0 - - assert mismatch >= 0.0 - tmp_mm += mismatch - if tmp_mm > max_mm: - break - - if debug: - print "at %d (%d) with overlap of %d and mismatch of %.2f" % (i, self.left + i, j, tmp_mm) - - if tmp_mm <= max_mm: - return i, min(len(seq) - i, len(other_seq)), tmp_mm - - return -1, -1, sys.maxint - - - # Combine two nodes with considering deletions - def combine_with(self, other): - # DK - debugging purposes - if self.left > other.left: - self.print_info() - other.print_info() - return - - assert self.left <= other.left - - # Merge two sequences - assert len(other.seq) > 0 and 'D' not in other.seq[0].keys() - j = 0 - # Merge the overlapped parts - if self.right >= other.left: - overlap, ins_len = False, 0 - for i in range(len(self.seq)): - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt.startswith('I'): - ins_len += 1 - if i == other.left - self.left + ins_len: - overlap = True - break - assert overlap - new_seq = self.seq[:i] - while i < len(self.seq) and j < len(other.seq): - nt_dic, nt_dic2 = self.seq[i], other.seq[j] - for nt, value in nt_dic2.items(): - count, var_id = value - if nt in nt_dic: - nt_dic[nt][0] += count - # if nt != 'D': - # assert nt_dic[nt][1] == var_id - else: - nt_dic[nt] = [count, var_id] - new_seq.append(nt_dic) - i += 1 - j += 1 - # this node contains the other node - if i < len(self.seq): - new_seq += self.seq[i:] - # Fill in the gap between the two nodes if exists - else: - new_seq = self.seq[:] - sum_1 = sum([count for count, _ in self.seq[-1].values()]) - sum_2 = sum([count for count, _ in other.seq[0].values()]) - flank_cov = (sum_1 + sum_2) / 2.0 - for k in range(other.left - self.right - 1): - ref_nt_dic = self.mpileup[k + 1 + self.right][1] - nt_dic = {} - # Fill in the gap with Ns for now - if len(ref_nt_dic) == 0 or True: - nt_dic = {'N' : [1, ""]} - else: - weight = flank_cov / max(1.0, sum([count for count, _ in ref_nt_dic.values()])) - for nt, value in ref_nt_dic.items(): - count, var_id = value - nt_dic[nt] = [count * weight, var_id] - new_seq.append(nt_dic) - - # Append the rest of the other sequence to it - if j < len(other.seq): - new_seq += deepcopy(other.seq[j:]) - self.read_ids |= other.read_ids - self.mate_ids |= other.mate_ids - - self.seq = new_seq - self.ins_len = 0 - for i in range(len(self.seq)): - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt[0] == 'I': - self.ins_len += 1 - self.right = self.left + len(self.seq) - 1 - self.ins_len - - # Update coverage - self.calculate_avg_cov() - - - # Return the length of the ungapped sequence - def ungapped_length(self): - return len(get_ungapped_seq(self.seq)) - - - # Contains Ns? - def contain_Ns(self): - for i in range(len(self.seq)): - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt == 'N': - return True - return False - - - # Get variant ids - def get_var_ids(self, left = 0, right = sys.maxint): - vars = [] - left = max(left, self.left) - right = min(right, self.right) - ins_len = 0 - for pos in range(left, right + 1): - var_i = pos - self.left + ins_len - while var_i < len(self.seq): - nt_dic = self.seq[var_i] - nt = get_major_nt(nt_dic) - if nt.startswith('I'): - var_i += 1 - ins_len += 1 - else: - break - for _, var in nt_dic.values(): - if var == "" or \ - var == "unknown": - continue - assert var in self.ref_vars - if len(vars) > 0 and var == vars[-1]: - continue - type, pos, data = self.ref_vars[var] - if (type == "single" and data == nt) or \ - (type == "deletion" and nt == 'D') or \ - (type == "insertion" and len(nt) == 2 and nt[1] == data): - vars.append(var) - - return vars - - - # Get variant ids - # left and right are gene-level coordinates - def get_vars(self, left = 0, right = sys.maxint): - vars = [] - left = max(left, self.left) - right = min(right, self.right) - skip_pos = -1 - ins_len = 0 - for pos in range(left, right + 1): - if pos <= skip_pos: - continue - var_i = pos - self.left + ins_len - while var_i < len(self.seq): - nt_dic = self.seq[var_i] - nt = get_major_nt(nt_dic) - if nt.startswith('I'): - var_i += 1 - ins_len += 1 - var = nt_dic[nt][1] - if len(vars) > 0 and var != vars[-1][0]: - vars.append([var, pos]) - else: - break - if nt == self.ref_seq[pos]: - continue - if nt == 'N': - vars.append(["gap", pos]) - continue - added = False - for _, var in nt_dic.values(): - if var == "" or \ - var == "unknown": - continue - if len(vars) > 0 and var == vars[-1][0]: - continue - assert var in self.ref_vars - type, var_pos, data = self.ref_vars[var] - if data == nt or (type == "deletion" and nt == 'D'): - assert pos + ins_len >= var_pos - if type == "deletion" and pos > var_pos: - continue - if type == "deletion": - skip_pos = pos + int(data) - 1 - added = True - vars.append([var, pos]) - if not added and "unknown" in [var_id for _, var_id in nt_dic.values()]: - vars.append(["unknown", pos]) - - return vars - - - # Get average coverage - def get_avg_cov(self): - return self.avg - - - # Calculate average coverage - def calculate_avg_cov(self): - self.avg = 0.0 - for nt_dic in self.seq: - for count, _ in nt_dic.values(): - self.avg += count - self.avg /= len(self.seq) - return self.avg - - - # Display node information - def print_info(self, output=sys.stderr): - seq, var_str = "", "" - prev_var = "" - ins_len = 0 - for i in range(len(self.seq)): - if (self.left + i - ins_len) % 100 == 0: - seq += ("|%d|" % (self.left + i - ins_len)) - elif (self.left + i - ins_len) % 20 == 0: - seq += '|' - nt_dic = self.seq[i] - nt = get_major_nt(nt_dic) - if nt[0] == 'I': - seq += "\033[93m" - elif nt != self.ref_seq[self.left + i - ins_len]: - var_id = nt_dic[nt][1] - if var_id == "unknown" or var_id.startswith("nv"): - seq += "\033[91m" # red - else: - seq += "\033[94m" # blue - if nt[0] == 'I': - seq += nt[1] - else: - seq += nt - if nt[0] == 'I' or nt != self.ref_seq[self.left + i - ins_len]: - seq += "\033[00m" - - var = [] - for _, var_id in nt_dic.values(): - if var_id == "": - continue - var.append(var_id) - var = '-'.join(var) - if var != "" and var != prev_var: - var_str += "\t%d: %s %s" % (self.left + i - ins_len, var, str(nt_dic)) - prev_var = var - if nt[0] == 'I': - ins_len += 1 - - print >> output, "Node ID:", self.id - print >> output, "Pos: [%d, %d], Avg. coverage: %.1f" % (self.left, self.right, self.get_avg_cov()) - print >> output, "\t", seq - print >> output, "\t", var_str - print >> output, "mates:", len(self.mate_ids) # sorted(self.mate_ids) - print >> output, "reads:", len(self.read_ids) # sorted(self.read_ids) - print >> output - - -class Graph: - def __init__(self, - backbone, - gene_vars, - exons, - primary_exons, - partial_allele_ids, - true_allele_nodes = {}, - predicted_allele_nodes = {}, - display_allele_nodes = {}, - simulation = False): - self.backbone = backbone # backbone sequence - self.gene_vars = gene_vars - self.exons = exons - self.primary_exons = primary_exons - self.partial_allele_ids = partial_allele_ids - self.true_allele_nodes = true_allele_nodes - self.predicted_allele_nodes = predicted_allele_nodes - self.allele_node_order = [] - self.display_allele_nodes = display_allele_nodes - self.simulation = simulation - - self.read_nodes = self.nodes = {} - self.other_nodes = {} - self.edges = {} - self.to_node, self.from_node = {}, {} - - self.left_margin = 350 - self.right_margin = 20 - self.top_margin = 20 - self.bottom_margin = 20 - - self.scalex, self.scaley = 5, 2 - self.width = len(self.backbone) * self.scalex + self.left_margin + self.right_margin - self.unscaled_height = 6000 - self.height = self.unscaled_height * self.scaley - self.coverage = {} - - - # Add node, which is an alignment w.r.t. the reference - def add_node(self, id, id_i, node, simulation = False): - if simulation: - id = id.split('_')[0] - - if id_i == 0: - if id in self.nodes: - print >> sys.stderr, "Warning) multi-mapped read:", id - # assert False - return - assert id not in self.nodes - self.nodes[id] = node - else: - if id not in self.other_nodes: - self.other_nodes[id] = [] - self.other_nodes[id].append(node) - - - # Remove nodes that are inside other nodes or with low coverage - def remove_nodes(self, nodes): - delete_ids = set() - node_list = [[id, node.left, node.right] for id, node in nodes.items()] - def node_cmp(a, b): - if a[2] != b[2]: - return a[2] - b[2] - else: - return a[1] - b[1] - node_list = sorted(node_list, cmp=node_cmp) - for n in range(len(node_list)): - id, left, right = node_list[n] - node = nodes[id] - i = n - 1 - while i >= 0: - id2, left2, right2 = node_list[i] - if right2 < left: - break - node2 = nodes[id2] - if left <= left2 and right2 <= right: - at, overlap, mm = node.overlap_with(node2, self.gene_vars) - - # DK - debugging purposes - """ - print node.id, "vs.", node2.id - print "at %d: overlap of %d with %d mismatches (mult: %.2f)" % \ - (at, overlap, mm, mult) - """ - if mm < 1.0: - mult = overlap / float(max(right - left, right2 - left2)) - if node2.get_avg_cov() * mult * 10 < node.get_avg_cov(): - delete_ids.add(id2) - elif left == left2 and right == right2: - delete_ids.add(id) - elif overlap > 0: - if node2.get_avg_cov() * 10 < node.get_avg_cov(): - delete_ids.add(id2) - elif node.get_avg_cov() * 10 < node2.get_avg_cov(): - delete_ids.add(id) - i -= 1 - - for delete_id in delete_ids: - del nodes[delete_id] - - - # - # - def guided_DeBruijn(self, - print_msg = False): - assert len(self.nodes) > 0 - k = 60 # k-mer - - DRB1_debug = False - - node_seq = {} - def add_node_seq(node_seq, id): - nodes = [self.nodes[id]] - if id in self.other_nodes: - nodes += self.other_nodes[id] - for node_i in range(len(nodes)): - node = nodes[node_i] - s, seq = 0, [] - while s < len(node.seq): - nt_dic = node.seq[s] # {'C': [1, '']} - nt = get_major_nt(nt_dic) - if nt in "ACGTND": - seq.append(nt) - else: - assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT" - s += 1 - - if len(seq) < k: - continue - - def leftshift(seq, ref_seq): - seq_len = len(seq) - assert seq_len > 0 and seq[0] != 'D' - - bp_i = 0 - while bp_i < seq_len: - bp = seq[bp_i] - if bp != 'D': - bp_i += 1 - continue - bp_j = bp_i + 1 - while bp_j < seq_len: - bp2 = seq[bp_j] - if bp2 != 'D': - break - else: - bp_j += 1 - - if bp_j >= seq_len: - bp_i = bp_j - break - - prev_i, prev_j = bp_i, bp_j - while bp_i > 0 and seq[bp_i-1] in "ACGT" and ref_seq[bp_j-1] in "ACGT": - if seq[bp_i-1] != ref_seq[bp_j-1]: - break - seq[bp_j-1] = seq[bp_i-1] - seq[bp_i-1] = 'D' - bp_i -= 1 - bp_j -= 1 - bp_i = bp_j - while bp_i < seq_len: - if seq[bp_i] in "ACGT": - break - bp_i += 1 - - if DRB1_debug: - leftshift(seq, self.backbone[node.left:node.left + len(seq)]) - node_seq["%s.%d" % (id, node_i)] = seq - - for id in self.nodes.keys(): - add_node_seq(node_seq, id) - - # AAA.1 => AAA, 1 - def get_id_and_sub(id): - id_split = id.split('.') - return '.'.join(id_split[:-1]), int(id_split[-1]) - - try_hard = False - while True: - delete_ids = set() - nodes = [] - for id, node in self.nodes.items(): - nodes_ = [node] - if id in self.other_nodes: - nodes_ += self.other_nodes[id] - for node_i in range(len(nodes_)): - node = nodes_[node_i] - id_ = "%s.%d" % (id, node_i) - if id_ not in node_seq: - continue - seq = node_seq[id_] - - if len(seq) < k or \ - 'N' in seq: - continue - kmer, seq = seq[:k], seq[k:] - nodes.append([id_, node.left, node.right, kmer, seq]) - - def node_cmp(a, b): - if a[1] != b[1]: - return a[1] - b[1] - else: - return a[2] - b[2] - nodes = sorted(nodes, cmp=node_cmp) - - # Generate numerical read IDs - id_to_num = {} - num_to_id = [] - for id in [node[0] for node in nodes]: - id_to_num[id] = len(id_to_num) - num_to_id.append(id) - - # Construct De Bruijn graph with 60-mer - self.debruijn = debruijn = [[] for i in range(len(self.backbone) - k + 1)] - min_n = 0 - for pos in range(len(debruijn)): - for n in range(min_n, len(nodes)): - id, node_pos, node_right, kmer, seq = nodes[n] - if node_pos < pos: - min_n = n + 1 - continue - elif node_pos > pos: - break - - assert len(kmer) == k - - # Add a new node or update the De Bruijn graph - curr_vertices = debruijn[pos] - found = False - kmer_seq = ''.join(kmer) - for v in range(len(curr_vertices)): - cmp_nt, cmp_k_m1_mer = curr_vertices[v][:2] - if kmer_seq == cmp_k_m1_mer + cmp_nt: - curr_vertices[v][3].append(n) - found = True - break - - if not found: - predecessors = [] - if pos > 0: - prev_vertices = debruijn[pos - 1] - for v in range(len(prev_vertices)): - cmp_nt, cmp_k_m1_mer = prev_vertices[v][:2] - if kmer_seq[:-1] == cmp_k_m1_mer[1:] + cmp_nt: - predecessors.append(v) - debruijn[pos].append([kmer_seq[-1], # base - ''.join(kmer_seq[:-1]), # (k-1)-mer - predecessors, # predecessors - [n]]) # numeric read IDs - - # Update k-mer - if len(seq) > 0: - kmer, seq = kmer[1:] + seq[:1], seq[1:] - nodes[n] = [id, node_pos + 1, node_right, kmer, seq] - - # Average number of kmers - total_kmers = 0 - for pos in range(len(debruijn)): - vertices = debruijn[pos] - for _, _, _, num_ids in vertices: - total_kmers += len(num_ids) - avg_kmers = float(total_kmers) / len(debruijn) - - # Filter out reads - for pos in range(len(debruijn)): - vertices = debruijn[pos] - num_vertices = 0 - num_kmers = 0 - for v in range(len(vertices)): - _, _, predecessors, num_ids = vertices[v] - if not (set(num_ids) <= delete_ids): - num_vertices += 1 - if DRB1_debug: - num_kmers = len(set(num_ids) - delete_ids) - if num_vertices <= 1: - if DRB1_debug: - if pos > 300 and pos + 300 < len(debruijn): - if num_vertices == 1 and num_kmers * 8 < avg_kmers: - for _, _, _, num_ids in vertices: - delete_ids |= set(num_ids) - continue - - vertice_count = [0] * len(vertices) - for v in range(len(vertices)): - _, _, predecessors, num_ids = vertices[v] - for num_id in num_ids: - if num_id in delete_ids: - continue - read_id = get_id_and_sub(num_to_id[num_id])[0] - if read_id in self.other_nodes: - continue - mate_read_id = get_mate_node_id(read_id) - if mate_read_id in self.nodes: - vertice_count[v] += 1 - - # First look at and remove reads that are multi-aligned locally - first_pair = None - for v in range(len(vertices)): - read_ids = set([get_id_and_sub(num_to_id[num_id])[0] for num_id in vertices[v][3]]) - for v2 in range(v + 1, len(vertices)): - read_ids2 = set([get_id_and_sub(num_to_id[num_id])[0] for num_id in vertices[v2][3]]) - if read_ids & read_ids2: - first_pair = [v, v2, read_ids & read_ids2] - break - - debug_msg = False - if debug_msg: - print >> sys.stderr, "at", pos, vertices - print >> sys.stderr, "count:", vertice_count - - if try_hard: - vertice_with_id = [[vertice_count[v], v] for v in range(len(vertice_count))] - vertice_with_id = sorted(vertice_with_id, key=lambda a: a[0]) - for v in range(len(vertice_count) - 2): - v = vertice_with_id[v][1] - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - if debug_msg: - print >> sys.stderr, v, "is removed with", num_ids - else: - if first_pair: - v, v2, multi_read_ids = first_pair - v_ = v if vertice_count[v] < vertice_count[v2] else v2 - for num_id in vertices[v_][3]: - id = get_id_and_sub(num_to_id[num_id])[0] - if id in multi_read_ids: - delete_ids.add(num_id) - else: - assert len(vertices) >= 2 - relative_avg = (sum(vertice_count) - vertice_count[v]) / float(len(vertice_count) - 1) - if len(vertices) == 2: - for v in range(len(vertices)): - # Eliminate reads that have conflicts with other reads due to a deletion - if vertice_count[v] * 2 < relative_avg: - nt, kmer, _, num_ids = vertices[1-v] - if nt == 'D': - num_id = num_ids[0] - id_sub = num_to_id[num_id] - id, sub = get_id_and_sub(id_sub) - if sub == 0: - left = pos - self.nodes[id].left - else: - left = pos - self.other_nodes[id][sub - 1].left - seq = node_seq[id_sub] - seq_right = ''.join(seq[left+k:]) - seq_right = seq_right.replace('D', '') - success = True - for num_id2 in vertices[v][3]: - id_sub2 = num_to_id[num_id2] - id2, sub2 = get_id_and_sub(id_sub2) - if sub2 == 0: - left2 = pos - self.nodes[id2].left - else: - left2 = pos - self.other_nodes[id2][sub2 - 1].left - seq2 = node_seq[id_sub2] - seq2_right = ''.join(seq2[left2+k:]) - if seq_right.find(seq2_right) != 0: - success = False - break - if success: - delete_ids |= set(vertices[v][3]) - - # DK - working on ... - if DRB1_debug: - if vertice_count[v] * 8 < relative_avg: - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - if debug_msg: - print >> sys.stderr, v, "is removed with", num_ids - elif vertice_count[v] * 8 < avg_kmers: - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - else: - second2last = sorted(vertice_count)[1] - for v in range(len(vertices)): - # if vertice_count[v] * 3 < relative_avg: - if vertice_count[v] < second2last: - num_ids = vertices[v][3] - delete_ids |= set(num_ids) - if debug_msg: - print >> sys.stderr, v, "is removed with", num_ids - - if debug_msg: - print >> sys.stderr - print >> sys.stderr - - # delete nodes - ids_to_be_updated = set() - for num_id in delete_ids: - id_sub = num_to_id[num_id] - id, sub = get_id_and_sub(id_sub) - ids_to_be_updated.add(id) - if sub == 0: - self.nodes[id] = None - else: - self.other_nodes[id][sub-1] = None - - for id in self.nodes.keys(): - other_nodes = [] - if id in self.other_nodes: - for other_node in self.other_nodes[id]: - if other_node != None: - other_nodes.append(other_node) - if self.nodes[id] == None: - if len(other_nodes) == 0: - del self.nodes[id] - else: - self.nodes[id] = other_nodes[0] - del other_nodes[0] - if id in self.other_nodes: - if len(other_nodes) == 0: - del self.other_nodes[id] - else: - self.other_nodes[id] = other_nodes - - for id in ids_to_be_updated: - if id in self.nodes: - add_node_seq(node_seq, id) - - if len(delete_ids) == 0: - if try_hard: - break - else: - try_hard = True - - # Print De Bruijn graph - for i in range(len(debruijn)): - curr_vertices = debruijn[i] - if len(curr_vertices) == 0: - continue - consensus_seq = [{} for j in range(k)] - for v in range(len(curr_vertices)): - nt, k_m1_mer = curr_vertices[v][:2] - kmer = k_m1_mer + nt - assert len(kmer) == k - for j in range(k): - nt = kmer[j] - if nt not in consensus_seq[j]: - consensus_seq[j][nt] = 1 - else: - consensus_seq[j][nt] += 1 - - if print_msg: print >> sys.stderr, i - for v in range(len(curr_vertices)): - nt, k_m1_mer, predecessors, num_ids = curr_vertices[v] - kmer = k_m1_mer + nt - kmer_seq = "" - for j in range(k): - nt = kmer[j] - if len(consensus_seq[j]) >= 2: - kmer_seq += "\033[94m" - kmer_seq += nt - if len(consensus_seq[j]) >= 2: - kmer_seq += "\033[00m" - - if print_msg: print >> sys.stderr, "\t%d:" % v, kmer_seq, len(num_ids), predecessors, num_ids - - id_to_num = {} - for num in range(len(num_to_id)): - id_sub = num_to_id[num] - id = get_id_and_sub(id_sub)[0] - num_to_id[num] = id - if id not in id_to_num: - id_to_num[id] = set() - id_to_num[id].add(num) - - # Generate compressed nodes - paths = [] - path_queue, done = deque(), set() - for i in range(len(debruijn)): - if len(debruijn[i]) == 0: - continue - for i2 in range(len(debruijn[i])): - path_queue.append("%d-%d" % (i, i2)) - break - - while len(path_queue) > 0: - i_str = path_queue.popleft() - if i_str in done: - continue - - i, i2 = i_str.split('-') - i, i2 = int(i), int(i2) - num_ids = debruijn[i][i2][3] - j = i + 1 - while j < len(debruijn): - merge, branch = len(debruijn[j-1]) > len(debruijn[j]), len(debruijn[j-1]) < len(debruijn[j]) - new_i2 = -1 - tmp_num_ids = [] - found = False - for j2 in range(len(debruijn[j])): - _, _, predecessors, add_read_ids = debruijn[j][j2] - if len(predecessors) == 0: - branch = True - path_queue.append("%d-%d" % (j, j2)) - elif i2 in predecessors: - found = True - # merge into one node - if len(predecessors) > 1: - merge = True - if new_i2 >= 0: - branch = True - new_i2 = j2 - tmp_num_ids += add_read_ids - - if merge or branch: - for j2 in range(len(debruijn[j])): - _, _, predecessors, add_num_ids = debruijn[j][j2] - if i2 in predecessors: - path_queue.append("%d-%d" % (j, j2)) - break - if not found: - break - - num_ids += tmp_num_ids - i2 = new_i2 - j += 1 - - done.add(i_str) - - num_ids = set(num_ids) - paths.append([i, j, num_ids]) - - if j < len(debruijn) and len(debruijn[j]) == 0: - j += 1 - while j < len(debruijn) and len(debruijn[j]) == 0: - j += 1 - if j < len(debruijn): - for j2 in range(len(debruijn[j])): - path_queue.append("%d-%d" % (j, j2)) - - - def get_mate_num_ids(num_ids): - mate_num_ids = set() - for num_id in num_ids: - read_id = num_to_id[num_id] - mate_read_id = get_mate_node_id(read_id) - if mate_read_id in id_to_num: - mate_num_id = id_to_num[mate_read_id] - mate_num_ids |= mate_num_id - - return mate_num_ids - - - # Generate a compressed assembly graph - def path_cmp(a, b): - if a[0] != b[0]: - return a[0] - b[0] - else: - return a[1] - b[1] - paths = sorted(paths, cmp=path_cmp) - - for p in range(len(paths)): - if print_msg: print >> sys.stderr, "path:", p, paths[p] - - excl_num_ids = set() # exclusive num ids - equiv_list = [] - p = 0 - while p < len(paths): - left, right, num_ids = paths[p] - p2 = p + 1 - while p2 < len(paths): - next_left, next_right, next_num_ids = paths[p2] - if next_left >= right: - break - p2 += 1 - - equiv_list.append([]) - for i in range(p, p2): - left, right, num_ids = paths[i] - equiv_list[-1].append([[i], num_ids, num_ids | get_mate_num_ids(num_ids), []]) - if p + 1 < p2: - assert p + 2 == p2 - excl_num_ids |= num_ids - - p = p2 - - new_equiv_list = [] - for classes in equiv_list: - if len(classes) > 1: - new_equiv_list.append(classes) - continue - assert len(classes) == 1 - num_ids = classes[0][1] - excl_num_ids - if len(num_ids) <= 0: - continue - classes[0][1] = num_ids - classes[0][2] = num_ids | get_mate_num_ids(num_ids) - new_equiv_list.append(classes) - equiv_list = new_equiv_list - - known_alleles = False - while True: - for i in range(len(equiv_list)): - classes = equiv_list[i] - for j in range(len(classes)): - ids, num_ids, all_ids, alleles = classes[j] - if print_msg: print >> sys.stderr, i, j, ids, len(num_ids), sorted(list(num_ids))[:20], alleles - - if print_msg: print >> sys.stderr - - if known_alleles: - for i in range(len(equiv_list)): - classes = equiv_list[i] - for j in range(len(classes)): - num_ids = sorted(list(classes[j][1])) - node_id = "(%d-%d)%s" % (i, j, num_to_id[num_ids[0]]) - node = self.nodes2[node_id] - node_vars = node.get_var_ids() - max_alleles, max_common = set(), -sys.maxint - for anode in self.predicted_allele_nodes.values(): - allele_vars = anode.get_var_ids(node.left, node.right) - tmp_common = len(set(node_vars) & set(allele_vars)) - len(set(node_vars) | set(allele_vars)) - if tmp_common > max_common: - max_common = tmp_common - max_alleles = set([anode.id]) - elif tmp_common == max_common: - max_alleles.add(anode.id) - classes[j][3] = max_alleles - - - best_common_mat, best_stat, best_i, best_i2 = [], -sys.maxint, -1, -1 - for i in range(len(equiv_list) - 1): - classes = equiv_list[i] - for i2 in range(i + 1, len(equiv_list)): - classes2 = equiv_list[i2] - common_mat = [] - for j in range(len(classes)): - common_mat.append([]) - if known_alleles: - ids = classes[j][3] - else: - ids = classes[j][2] - for j2 in range(len(classes2)): - if known_alleles: - ids2 = classes2[j2][3] - else: - ids2 = classes2[j2][2] - common_mat[-1].append(len(ids & ids2)) - - # Calculate stat - common_stat = 0 - if len(classes) == 1 or len(classes2) == 1: - for row in common_mat: - common_stat += sum(row) - else: - for row in common_mat: - sorted_row = sorted(row, reverse=True) - common_stat += (sorted_row[0] - sorted_row[1]) - if common_mat[0][0] + common_mat[1][1] == \ - common_mat[1][0] + common_mat[0][1]: - common_stat = -1 - - if common_stat > best_stat: - best_common_mat, best_stat, best_i, best_i2 = common_mat, common_stat, i, i2 - - if print_msg: - print >> sys.stderr, "best:", best_i, best_i2, best_stat, best_common_mat - print >> sys.stderr - print >> sys.stderr - - if known_alleles and best_stat < 0: - self.remove_nodes(self.nodes2) - break - if best_stat < 0: - known_alleles = True - new_nodes = {} - for i in range(len(equiv_list)): - classes = equiv_list[i] - for j in range(len(classes)): - ids, num_ids, all_ids, alleles = classes[j] - num_ids = sorted(list(num_ids)) - - if print_msg: print >> sys.stderr, i, j, num_ids - - assert (num_ids) > 0 - read_id = num_to_id[num_ids[0]] - node = deepcopy(self.nodes[read_id]) - for num_id2 in num_ids[1:]: - read_id2 = num_to_id[num_id2] - node2 = self.nodes[read_id2] - node.combine_with(node2) - - new_read_id = "(%d-%d)%s" % (i, j, read_id) - node.id = new_read_id - new_read_id not in new_nodes - new_nodes[new_read_id] = node - - self.nodes = new_nodes - self.nodes2 = deepcopy(self.nodes) - self.remove_nodes(self.nodes) - continue - - mat = best_common_mat - classes, classes2 = equiv_list[best_i], equiv_list[best_i2] - - # Filter vertices further if necessary - def del_row(classes, mat, r): - return classes[:r] + classes[r+1:], mat[:r] + mat[r+1:] - - def del_col(classes, mat, c): - new_mat = [] - for row in mat: - row = row[:c] + row[c+1:] - new_mat.append(row) - return classes[:c] + classes[c+1:], new_mat - - assert len(classes) <= 2 and len(classes2) <= 2 - if len(classes) == 2 and len(classes2) == 2: - # Check row - num_ids1, num_ids2 = len(classes[0][1]), len(classes[1][1]) - if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1: - row_sum1, row_sum2 = sum(mat[0]), sum(mat[1]) - if row_sum1 > max(2, row_sum2 * 6): - classes, mat = del_row(classes, mat, 1) - classes[0][1] -= excl_num_ids - elif row_sum2 > max(2, row_sum1 * 6): - classes, mat = del_row(classes, mat, 0) - classes[0][1] -= excl_num_ids - # Check column - if len(classes) == 2: - num_ids1, num_ids2 = len(classes2[0][1]), len(classes2[1][1]) - if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1: - col_sum1, col_sum2 = mat[0][0] + mat[1][0], mat[0][1] + mat[1][1] - if col_sum1 > max(2, col_sum2 * 6): - classes2, mat = del_col(classes2, mat, 1) - classes2[0][1] -= excl_num_ids - elif col_sum2 > max(2, col_sum1 * 6): - classes2, mat = del_col(classes2, mat, 0) - classes2[0][1] -= excl_num_ids - - merge_list = [] - def add_merge(classes, classes2, i, j, k): - if known_alleles: - num_ids1, num_ids2 = classes[i][1], classes2[j][1] - num_ids1, num_ids2 = sorted(list(num_ids1)), sorted(list(num_ids2)) - num_id1, num_id2 = num_ids1[0], num_ids2[0] - node_id1 = "(%d-%d)%s" % (best_i, i, num_to_id[num_id1]) - node_id2 = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id2]) - node_id3 = "(%d-%d)%s" % (best_i, k, num_to_id[min(num_id1, num_id2)]) - merge_list.append([node_id1, node_id2, node_id3]) - - classes[i][0] = sorted(classes[i][0] + classes2[j][0]) - classes[i][1] |= classes2[j][1] - - copy_list = [] - def add_copy(classes, classes2, i, j, k): - if known_alleles: - num_ids = classes2[j][1] - num_ids = sorted(list(num_ids)) - num_id = num_ids[0] - node_id = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id]) - node_id2 = "(%d-%d)%s" % (best_i, k, num_to_id[num_id]) - copy_list.append([node_id, node_id2]) - - classes[i] = classes2[j] - - remove_list = [] - def add_remove(classes, i): - if known_alleles: - num_ids = classes[i][1] - num_ids = sorted(list(num_ids)) - num_id = num_ids[0] - node_id = "(%d-%d)%s" % (best_i, i, num_to_id[num_id]) - remove_list.append([node_id]) - - classes = [classes[1-i]] - - if len(classes) == 1 and len(classes2) == 1: - add_merge(classes, classes2, 0, 0, 0) - - elif len(classes) == 1: - if 0 not in classes[0][0] and \ - mat[0][0] > max(2, mat[0][1] * 6) and \ - len(classes2[0][1]) > len(classes2[1][1]) * 2: - add_merge(classes, classes2, 0, 0, 0) - elif 0 not in classes[0][0] and \ - mat[0][1] > max(2, mat[0][0] * 6) and \ - len(classes2[1][1]) > len(classes2[0][1]) * 2: - add_merge(classes, classes2, 0, 1, 0) - else: - classes.append(deepcopy(classes[0])) - - # Handle a special case at 5' end - if 0 in classes[0][0] and \ - len(classes[0][0]) == 1 and \ - (mat[0][0] > mat[0][1] * 2 or mat[0][1] > mat[0][0] * 2): - if mat[0][0] > mat[0][1]: - add_merge(classes, classes2, 0, 0, 0) - add_copy(classes, classes2, 1, 1, 1) - else: - assert mat[0][1] > mat[0][0] - add_copy(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 1, 1) - else: - add_merge(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 1, 1) - - elif len(classes2) == 1: - if mat[0][0] > max(2, mat[1][0] * 6): - add_merge(classes, classes2, 0, 0, 0) - if len(classes[0][1]) > len(classes[1][1]) * 6: - add_remove(classes, 1) - elif mat[1][0] > max(2, mat[0][0] * 6): - add_merge(classes, classes2, 1, 0, 0) - if len(classes[1][1]) > len(classes[0][1]) * 6: - add_remove(classes, 0) - else: - add_merge(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 0, 1) - - else: - score00 = mat[0][0] + mat[1][1] - score01 = mat[0][1] + mat[1][0] - if score00 > score01: - add_merge(classes, classes2, 0, 0, 0) - add_merge(classes, classes2, 1, 1, 1) - elif score00 < score01: - add_merge(classes, classes2, 0, 1, 0) - add_merge(classes, classes2, 1, 0, 1) - else: - break - - for c in range(len(classes)): - classes[c][2] = classes[c][1] | get_mate_num_ids(classes[c][1]) - - equiv_list[best_i] = classes - equiv_list = equiv_list[:best_i2] + equiv_list[best_i2+1:] - - if known_alleles: - exclude_ids = set() - new_nodes = {} - for node_id1, node_id2, node_id3 in merge_list: - if self.nodes2[node_id1].left <= self.nodes2[node_id2].left: - node = deepcopy(self.nodes2[node_id1]) - node2 = self.nodes2[node_id2] - else: - node = deepcopy(self.nodes2[node_id2]) - node2 = self.nodes2[node_id1] - node.combine_with(node2) - node.id = node_id3 - new_nodes[node_id3] = node - exclude_ids.add(node_id1) - exclude_ids.add(node_id2) - - for node_id1, node_id2 in copy_list: - node = self.nodes2[node_id1] - node.id = node_id2 - new_nodes[node_id2] = node - exclude_ids.add(node_id1) - - exclude_ids |= set(remove_list) - - for node_id, node in self.nodes2.items(): - if node_id in exclude_ids: - continue - num, id = node_id.split(')') - i, i2 = num[1:].split('-') - i, i2 = int(i), int(i2) - if i > best_i2: - i -= 1 - node_id = "(%d-%d)%s" % (i, i2, id) - node.id = node_id - new_nodes[node_id] = node - - self.nodes2 = new_nodes - - - # Display graph information - def print_info(self): - print >> sys.stderr, "Backbone len: %d" % len(self.backbone) - print >> sys.stderr, "\t%s" % self.backbone - - - # Compare nodes and get information - def get_node_comparison_info(self, node_dic): - assert len(node_dic) > 0 - nodes = [[id, node.left, node.right] for id, node in node_dic.items()] - def node_cmp(a, b): - if a[1] != b[1]: - return a[1] - b[1] - else: - return a[2] - b[2] - nodes = sorted(nodes, cmp=node_cmp) - seqs, colors = [], [] - for p in range(len(self.backbone)): - nts = set() - for n in range(len(nodes)): - id, left, right = nodes[n] - node = node_dic[id] - if p >= left and p <= right: - nt_dic = node.seq[p - left] - nt = get_major_nt(nt_dic) - nts.add(nt) - - for n in range(len(nodes)): - if p == 0: - seqs.append([]) - colors.append([]) - id, left, right = nodes[n] - node = node_dic[id] - if p >= left and p <= right: - nt_dic = node.seq[p - left] - nt = get_major_nt(nt_dic) - seqs[n].append(nt) - if nt != self.backbone[p]: - if len(nts) > 1: - colors[n].append('R') - else: - colors[n].append('B') - else: - colors[n].append('N') - else: - seqs[n].append(' ') - - assert len(nodes) == len(seqs) - for n in range(len(nodes)): - node, seq, color = nodes[n], seqs[n], colors[n] - new_left, new_right = 0, len(seq) - 1 - while seq[new_left] == 'D': - new_left += 1 - while seq[new_right] == 'D': - new_right -= 1 - - node[1] = new_left - node[2] = new_right - seqs[n] = seq[new_left:new_right+1] - colors[n] = color[new_left:new_right+1] - - return nodes, seqs, colors - - - # Compare nodes - def print_node_comparison(self, node_dic): - nodes, seqs, colors = self.get_node_comparison_info(node_dic) - interval = 100 - for p in range(0, (len(self.backbone) + interval - 1) / interval * interval, interval): - cur_seqs = [] - for n in range(len(nodes)): - id, left, right = nodes[n] # inclusive coordinate - right += 1 - seq = [] - seq_left, seq_right = max(p, left), min(p+interval, right) - if seq_left >= seq_right: - continue - if p < left: - seq += ([' '] * (left - p)) - for s in range(seq_left, seq_right): - nt, color = seqs[n][s-left], colors[n][s-left] - if color in "RB": - if color == 'R': - nt = "\033[91m" + nt - else: - nt = "\033[94m" + nt - nt += "\033[00m" - seq.append(nt) - if right < p + interval: - seq += ([' '] * (p + interval - right)) - seq = ''.join(seq) - cur_seqs.append([seq, id]) - - if len(cur_seqs) <= 0: - continue - - print >> sys.stderr, p - for seq, id in cur_seqs: - print >> sys.stderr, "\t", seq, id - - - # Calculate coverage - def calculate_coverage(self): - allele_nodes = self.true_allele_nodes if self.simulation else self.predicted_allele_nodes - allele_nodes = [[id, node.left, node.right] for id, node in allele_nodes.items()] - coverage = {} - for allele_id, _, _ in allele_nodes: - coverage[allele_id] = [0.0 for _ in range(len(self.backbone))] - - nodes = [[id, node.left, node.right] for id, node in self.nodes.items()] - for id, left, right in nodes: - node = self.nodes[id] - nodes2 = [[node, left, right]] - if id in self.other_nodes: - for node in self.other_nodes[id]: - nodes2.append([node, node.left, node.right]) - - for node, left, right in nodes2: - node_vars = node.get_vars() - node_var_ids = node.get_var_ids() - max_common = -sys.maxint - max_allele_node_ids = [] - for allele_node_id, allele_left, allele_right in allele_nodes: - if right - left <= 500 and (left < allele_left or right > allele_right): - continue - if self.simulation: - allele_node = self.true_allele_nodes[allele_node_id] - else: - allele_node = self.predicted_allele_nodes[allele_node_id] - allele_vars = allele_node.get_var_ids(left, right) - common_vars = set(node_var_ids) & set(allele_vars) - tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars)) - if max_common < tmp_common: - max_common = tmp_common - max_allele_node_ids = [allele_node_id] - elif max_common == tmp_common: - max_allele_node_ids.append(allele_node_id) - if len(max_allele_node_ids) <= 0: - continue - add_cov = 1.0 / len(nodes2) / len(max_allele_node_ids) - assert add_cov > 0.0 - for allele_node_id in max_allele_node_ids: - for p in range(left, right + 1): - coverage[allele_node_id][p] += add_cov - - max_cov = 0.0 - for allele_id, cov in coverage.items(): - max_cov = max(max_cov, max(cov)) - for allele_id, cov in coverage.items(): - cov2 = [c / max_cov for c in cov] - coverage[allele_id] = cov2 - self.coverage = coverage - - - # Begin drawing graph - def begin_draw(self, fname_base): - pdfDraw = self.pdfDraw = open(fname_base + '.pdf', 'w') - print >> pdfDraw, r'%PDF-1.7' - self.objects, self.stream = [], [] - self.draw_items = [] - - # End drawing graph - def end_draw(self): - self.unscaled_height += 50 - self.height = self.unscaled_height * self.scaley - - def get_x(x): - return self.left_margin + x * self.scalex - - def get_y(y): - return self.height - self.top_margin - y * self.scaley - - # Get scalar - def get_sx(x): - return x * self.scalex - - def get_sy(y): - return y * self.scaley - - pdfDraw = self.pdfDraw - self.add_pdf_object('<>') - self.add_pdf_object('<>') - self.add_pdf_object('<>' % \ - (self.width, self.height)) - self.add_pdf_object('<>>>') - self.add_pdf_object('<>') - - # Draw vertical dotted lines at every 100nt and thick lines at every 500nt - pre_items = [] - for pos in range(0, len(self.backbone), 100): - main_line = (pos != 0 and pos % 500 == 0) - dic = {"coord": [pos, 2, pos, self.unscaled_height - 2], - "stroke" : "0.5 0.5 0.5", - "line_width" : 1 if main_line else 0.2} - if not main_line: - dic["line_dash"] = "[3] 0" - pre_items.append(["line", dic]) - self.draw_items = pre_items + self.draw_items - - fill, stroke, line_width, line_dash = "0 0 0", "0 0 0", 2.0, "" - for type, dic in self.draw_items: - commands = [] - if type != "state": - assert "coord" in dic - - if "fill" in dic and dic["fill"] != fill: - fill = dic["fill"] - commands.append("%s rg" % fill) - if "stroke" in dic and dic["stroke"] != stroke: - stroke = dic["stroke"] - commands.append("%s RG" % stroke) - if "line_width" in dic and dic["line_width"] != line_width: - line_width = dic["line_width"] - commands.append("%.1f w" % line_width) - if "line_dash" in dic: - if dic["line_dash"] != line_dash: - line_dash = dic["line_dash"] - commands.append("%s d" % line_dash) - elif line_dash != "": - line_dash = "" - commands.append("[] 0 d") - - if type == "rect": - x, y, sx, sy = dic["coord"] - re_str = "%d %d %d %d" % (get_x(x), get_y(y), get_sx(sx), get_sy(sy)) - if "fill" in dic: - commands.append("%s re f" % re_str) - if "stroke" in dic: - commands.append("%s re S" % re_str) - - elif type == "line": - x, y, x2, y2 = dic["coord"] - commands.append("%d %d m %d %d l h S" % \ - (get_x(x), get_y(y), get_x(x2), get_y(y2))) - elif type == "text": - assert "text" in dic and "font_size" in dic - x, y = dic["coord"] - commands.append("BT /F1 %d Tf %d %d Td (%s) Tj ET" % \ - (dic["font_size"], get_x(x), get_y(y), dic["text"])) - else: - assert type == "state" - - self.stream.append(' '.join(commands)) - - # Write stream - self.add_pdf_stream('\n'.join(self.stream)) - - # Write xref and trailer - to_xref = pdfDraw.tell() - print >> pdfDraw, 'xref' - print >> pdfDraw, "0 %d" % (len(self.objects) + 1) - print >> pdfDraw, r'0000000000 65535 f' - for object in self.objects: - print >> pdfDraw, "%s 00000 n" % "{:010}".format(object) - print >> pdfDraw, 'trailer <>' % (len(self.objects) + 1) - print >> pdfDraw, 'startxref' - print >> pdfDraw, str(to_xref) - print >> pdfDraw, r'%%EOF' - - self.pdfDraw.close() - - - def add_pdf_object(self, obj): - self.objects.append(self.pdfDraw.tell()) - print >> self.pdfDraw, "%d 0 obj %s" % (len(self.objects), obj) - print >> self.pdfDraw, 'endobj' - - - def add_pdf_stream(self, stream): - self.add_pdf_object("<>\nstream\n%s\nendstream" % (len(stream), stream)) - - - # Draw graph - # Top left as (0, 0) and Bottom right as (width, height) - def draw(self, - begin_y, - title = ""): - assert len(self.nodes) > 0 - nodes = [[id, node.left, node.right] for id, node in self.nodes.items()] - def node_cmp(a, b): - return a[1] - b[1] - nodes = sorted(nodes, cmp=node_cmp) - max_right = len(self.backbone) - - # display space - end_y = begin_y + 10000 - dspace = [[[begin_y, end_y]]] * (max_right + 1) - def get_dspace(left, right, height): - assert left < len(dspace) and right < len(dspace) - range1 = dspace[left] - for range2 in dspace[left + 1:right + 1]: - new_range = [] - # sub range - for t1, b1 in range1: - for t2, b2 in range2: - if b1 < t2: - break - if b2 < t1: - continue - t, b = max(t1, t2), min(b1, b2) - if b - t >= height: - new_range.append([t, b]) - - range1 = new_range - if len(range1) <= 0: - return -1 - - t, b = range1[0] - assert b - t >= height - b = t + height - for i in range(left, right+1): - range1 = dspace[i] - range2 = [] - found = False - for j in range(len(range1)): - t2, b2 = range1[j] - if t2 <= t and b <= b2: - found = True - if t2 < t: - range2.append([t2, t]) - if b < b2: - range2.append([b, b2]) - else: - range2.append([t2, b2]) - dspace[i] = range2 - assert found - return t - - def get_x(x): - return self.left_margin + x * self.scalex - - def get_y(y): - return self.height - self.top_margin - y * self.scaley - - # Get scalar - def get_sx(x): - return x * self.scalex - - def get_sy(y): - return y * self.scaley - - # Draw exons - y = get_dspace(0, max_right, 14) - for e in range(len(self.exons)): - left, right = self.exons[e] - right += 1 - - # Draw exon - self.draw_items.append(["rect", - {"coord" : [left, y + 10, right - left, 10], - "fill" : "1 1 1", - "stroke" : "0 0 0", - "line_width" : 2}]) - - primary = False - for left_, _ in self.primary_exons: - if left == left_: - primary = True - break - - # Draw label - self.draw_items.append(["text", - {"coord" : [left + 2, y + 7], - "text" : "Exon %d%s" % (e+1, " (primary)" if primary else ""), - "fill" : "0 0 0", - "font_size" : 12}]) - if e > 0: - prev_right = self.exons[e-1][1] + 1 - self.draw_items.append(["line", - {"coord": [prev_right, y + 5, left, y + 5], - "line_width" : 2}]) - - # Draw backbone sequence - y = get_dspace(0, max_right, 4) - for pos in range(len(self.backbone)): - base = self.backbone[pos] - self.draw_items.append(["text", - {"coord" : [pos, y + 2], - "text" : base, - "fill" : "0.5 0 0.5", - "font_size" : 8}]) - - # Draw true or predicted alleles - node_colors = ["1 1 0", "0 1 0", "1 0.8 0.64", "0.76 0.27 0.5"] - allele_node_colors = ["0.87 0.87 0", "0 0.53 0", "0.87 0.66 0.5", "0.63 0.14 0.38"] - def draw_alleles(allele_node_dic, allele_node_colors, display = False): - if len(allele_node_dic) <= 0: - return - allele_nodes, seqs, colors = self.get_node_comparison_info(allele_node_dic) - - def draw_coverage(allele_node, allele_id, left, right, allele_node_color): - if allele_id not in self.coverage: - return - y = get_dspace(0, max_right, 14) - for p in range(left, right): - cov = math.ceil(self.coverage[allele_id][p] * 12) - self.draw_items.append(["rect", - {"coord" : [p, y + 13, 1, cov], - "fill" : allele_node_color}]) - - - for n_ in range(len(allele_nodes)): - n = -1 - prob = "" - if not display and \ - not self.simulation and \ - len(self.allele_node_order) == len(allele_node_dic): - allele_id, prob = self.allele_node_order[n_] - for n2_ in range(len(allele_nodes)): - if allele_id == allele_nodes[n2_][0]: - n = n2_ - break - prob = ": %.2f" % prob - else: - n = n_ - assert n >= 0 and n < len(allele_nodes) - allele_id, left, right = allele_nodes[n] - right += 1 - allele_node = allele_node_dic[allele_id] - allele_node_color = allele_node_colors[n % len(allele_node_colors)] - - draw_coverage(allele_node, allele_id, left, right, allele_node_color) - - y = get_dspace(0, max_right, 14) - - # Draw allele name - if display: - allele_type = "display" - else: - if self.simulation: - allele_type = "true" - else: - allele_type = "predicted" - self.draw_items.append(["text", - {"coord" : [-55, y + 7], - "text" : "%s (%s, %s)" % (allele_id, "partial" if allele_id in self.partial_allele_ids else "full", allele_type), - "fill" : "0 0 1", - "font_size" : 18}]) - # Draw node - self.draw_items.append(["rect", - {"coord" : [left, y + 10, right - left, 10], - "fill" : allele_node_color, - "stroke" : "0 0 0", - "line_width" : 2}]) - - - color_boxes = [] - c = 0 - while c < len(colors[n]): - color = colors[n][c] - c2 = c + 1 - if color != 'N': - while c2 < len(colors[n]): - color2 = colors[n][c2] - if color != color2: - break - c2 += 1 - color_boxes.append([c, c2, color]) - c = c2 - - # Draw variants - for color_box in color_boxes: - cleft, cright, color = color_box - cleft += left; cright += left - if color == 'B': - color = "0 0 1" # blue - else: - color = "0.12 0.56 1" - # DK - debugging purposes - color = "0 0 1" - self.draw_items.append(["rect", - {"coord" : [cleft, y + 9, cright - cleft, 8], - "fill" : color}]) - - return allele_nodes, seqs, colors - - allele_nodes, seqs, colors = draw_alleles(self.true_allele_nodes if self.simulation else self.predicted_allele_nodes, - allele_node_colors) - draw_alleles(self.display_allele_nodes, - ["1 0.96 0.95"], - True) # display alleles? - - # Draw location at every 100bp - y = get_dspace(0, nodes[-1][2], 14) - for pos in range(0, nodes[-1][2], 100): - # Draw label - self.draw_items.append(["text", - {"coord" : [pos + 1, y + 2], - "text" : "%d" % (pos + 1), - "fill" : "0 0 0", - "font_size" : 10}]) - - # Draw nodes - node_to_y = {} - draw_title = False - for id, left, right in nodes: - node = self.nodes[id] - nodes2 = [[node, left, right]] - if id in self.other_nodes: - for node in self.other_nodes[id]: - nodes2.append([node, node.left, node.right]) - if left > node.left: - left = node.left - if right < node.right: - right = node.right - - # Get y position - y = get_dspace(left, right, 14 * len(nodes2)) - for node, left, right in nodes2: - if y < 0: - continue - node_to_y[id] = y - - node_vars = node.get_vars() - node_var_ids = node.get_var_ids() - if len(nodes2) > 1: - color = "0.85 0.85 0.85" - elif len(allele_nodes) > 0: - color = "1 1 1" - max_common = -sys.maxint - for a in range(len(allele_nodes)): - allele_node_id, allele_left, allele_right = allele_nodes[a] - if right - left <= 500 and (left < allele_left or right > allele_right): - continue - if self.simulation: - allele_node = self.true_allele_nodes[allele_node_id] - else: - allele_node = self.predicted_allele_nodes[allele_node_id] - allele_vars = allele_node.get_var_ids(left, right) - common_vars = set(node_var_ids) & set(allele_vars) - tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars)) - if max_common < tmp_common: - max_common = tmp_common - color = node_colors[a % len(node_colors)] - elif max_common == tmp_common: - color = "1 1 1" - else: - color = "1 1 0" # yellow - - # Draw node - right += 1 - self.draw_items.append(["rect", - {"coord" : [left, y + 10, right - left, 10], - "fill" : color, - "stroke" : "0 0 0", - "line_width" : 2}]) - - # Draw variants - for var_id, pos in node_vars: - if var_id == "gap": - var_type, var_left = "single", pos - color = "0 0 0" - elif var_id == "unknown" or var_id.startswith("nv"): - var_type, var_left = "single", pos - color = "1 0 0" - else: - var_type, var_left, var_data = self.gene_vars[var_id] - color = "0 0 1" - if var_type == "single": - var_right = var_left + 1 - elif var_type == "insertion": - var_right = var_left + len(var_data) - else: - assert var_type == "deletion" - var_right = var_left + int(var_data) - self.draw_items.append(["rect", - {"coord" : [var_left, y + 9, var_right - var_left, 8], - "fill" : color}]) - - # Draw label - if get_sx(right - left) >= 300: - self.draw_items.append(["text", - {"coord" : [left + 2, y + 7], - "text" : node.id, - "fill" : "0 0 1", - "font_size" : 12}]) - - - if not draw_title: - draw_title = True - self.draw_items.append(["text", - {"coord" : [-68, y + 7], - "text" : title, - "fill" : "0 0 0", - "font_size" : 24}]) - - y += 14 - - curr_y = get_dspace(0, nodes[-1][2], 1) - self.unscaled_height = curr_y if curr_y > 0 else end_y - return self.unscaled_height - diff --git a/hisatgenotype_modules/hisatgenotype_typing_common.py b/hisatgenotype_modules/hisatgenotype_typing_common.py deleted file mode 100755 index 04cb95f3..00000000 --- a/hisatgenotype_modules/hisatgenotype_typing_common.py +++ /dev/null @@ -1,1552 +0,0 @@ -#!/usr/bin/env python -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re -import math -import random -from copy import deepcopy -from datetime import datetime - - -################################################## -# Sequence processing routines -################################################## - - -""" -""" -def reverse_complement(seq): - comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} - rc_seq = "" - for s in reversed(seq): - if s in comp_table: - rc_seq += comp_table[s] - else: - rc_seq += s - return rc_seq - - -""" -""" -def read_genome(genome_file): - chr_dic, chr_names, chr_full_names = {}, [], [] - chr_name, chr_full_name, sequence = "", "", "" - for line in genome_file: - if line.startswith(">"): - if chr_name and sequence: - chr_dic[chr_name] = sequence - chr_names.append(chr_name) - chr_full_name = line.strip()[1:] - chr_name = line.strip().split()[0][1:] - chr_full_names.append(chr_full_name) - sequence = "" - else: - sequence += line.strip() - if chr_name and sequence: - chr_dic[chr_name] = sequence - chr_names.append(chr_name) - chr_full_names.append(chr_full_name) - return chr_dic, chr_names, chr_full_names - - -################################################## -# Alleles, variants, haplotypes, etc. -################################################## - - -""" -""" -def read_allele_sequences(fname): - allele_seqs = {} - allele_name, sequence = "", "" - for line in open(fname): - if line.startswith(">"): - if allele_name != "" and allele_name not in allele_seqs: - allele_seqs[allele_name] = sequence - allele_name = line.strip()[1:] - sequence = "" - else: - sequence += line.strip() - if allele_name != "" and allele_name not in allele_seqs: - allele_seqs[allele_name] = sequence - return allele_seqs - - -""" -""" -def read_variants(fname): - allele_vars = {} - for line in open(fname): - var_id, type, allele_name, left, data = line.strip().split() - left = int(left) - if type == "deletion": - data = int(data) - if allele_name not in allele_vars: - allele_vars[allele_name] = [] - allele_vars[allele_name].append([left, type, data, var_id]) - return allele_vars - - -""" -""" -def read_haplotypes(fname): - allele_haplotypes = {} - for line in open(fname): - haplotype_id, allele_name, left, right, vars = line.strip().split() - vars = vars.split(',') - left, right = int(left), int(right) - if allele_name not in allele_haplotypes: - allele_haplotypes[allele_name] = [] - allele_haplotypes[allele_name].append([left, right, vars]) - return allele_haplotypes - - -""" -""" -def read_links(fname): - links = [] - for line in open(fname): - var_id, allele_names = line.strip().split('\t') - links.append([var_id, allele_names]) - return links - - -""" -Compare two variants -""" -def compare_vars(a, b): - a_pos, a_type, a_data = a[:3] - b_pos, b_type, b_data = b[:3] - - if a_pos != b_pos: - return a_pos - b_pos - if a_type != b_type: - if a_type == 'I': - return -1 - elif b_type == 'I': - return 1 - if a_type == 'S': - return -1 - else: - return 1 - if a_data < b_data: - return -1 - elif a_data > b_data: - return 1 - else: - return 0 - - -""" -""" -def lower_bound(Var_list, pos): - low, high = 0, len(Var_list) - while low < high: - m = (low + high) / 2 - m_pos = Var_list[m][0] - if m_pos < pos: - low = m + 1 - elif m_pos > pos: - high = m - else: - assert m_pos == pos - while m > 0: - if Var_list[m-1][0] < pos: - break - m -= 1 - return m - return low - - - -""" -""" -def check_files(fnames): - for fname in fnames: - if not os.path.exists(fname): - return False - return True - - -################################################## -# Database releated routines -################################################## - - -""" -Download GRCh38 human reference and HISAT2 indexes -""" -def download_genome_and_index(): - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not check_files(HISAT2_fnames): - os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz") - os.system("hisat2-inspect grch38/genome > genome.fa") - os.system("samtools faidx genome.fa") - - -""" -""" -def clone_hisatgenotype_database(): - os.system("git clone https://github.com/DaehwanKimLab/hisatgenotype_db.git") - os.system("cd hisatgenotype_db; git checkout hisatgenotype_v1.0.2_beta; cd ..") - - -""" -""" -def extract_database_if_not_exists(base, - locus_list, - inter_gap = 30, - intra_gap = 50, - partial = True, - verbose = False): - fnames = [base + "_backbone.fa", - base + "_sequences.fa", - base + ".locus", - base + ".snp", - base + ".index.snp", - base + ".haplotype", - base + ".link", - base + ".allele", - base + ".partial"] - if check_files(fnames): - return - - extract_cmd = ["hisatgenotype_extract_vars.py"] - extract_cmd += ["--base", base] - if len(locus_list) > 0: - extract_cmd += ["--locus-list", ','.join(locus_list)] - if not partial: - extract_cmd += ["--no-partial"] - extract_cmd += ["--inter-gap", str(inter_gap), - "--intra-gap", str(intra_gap)] - if base == "hla": - extract_cmd += ["--min-var-freq", "0.1"] - - if base == "codis": - extract_cmd += ["--leftshift"] - - # DK - debugging purposes - # extract_cmd += ["--ext-seq", "300"] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd) - proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - - if not check_files(fnames): - print >> sys.stderr, "Error: hisatgenotype_extract_vars failed!" - sys.exit(1) - - -""" -""" -def build_index_if_not_exists(base, - aligner, - index_type, - threads = 1, - verbose = False): - if aligner == "hisat2": - # Build HISAT2 graph indexes based on the above information - if index_type == "graph": - hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base, i+1) for i in range(8)] - if not check_files(hisat2_graph_index_fnames): - build_cmd = ["hisat2-build", - "-p", str(threads), - "--snp", "%s.index.snp" % base, - "--haplotype", "%s.haplotype" % base, - "%s_backbone.fa" % base, - "%s.graph" % base] - if verbose: - print >> sys.stderr, "\tRunning:", ' '.join(build_cmd) - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(hisat2_graph_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed! Perhaps, you may have forgotten to build hisat2 executables?" - sys.exit(1) - # Build HISAT2 linear indexes based on the above information - else: - assert index_type == "linear" - hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base, i+1) for i in range(8)] - if not check_files(hisat2_linear_index_fnames): - build_cmd = ["hisat2-build", - "%s_backbone.fa,%s_sequences.fa" % (base, base), - "%s.linear" % base] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w')) - proc.communicate() - if not check_files(hisat2_linear_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - else: - # Build Bowtie2 indexes based on the above information - assert aligner == "bowtie2" and index_type == "linear" - bowtie2_index_fnames = ["%s.%d.bt2" % (base, i+1) for i in range(4)] - bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base, i+1) for i in range(2)] - if not tcheck_files(bowtie2_index_fnames): - build_cmd = ["bowtie2-build", - "%s_backbone.fa,%s_sequences.fa" % (base, base), - base] - proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w')) - proc.communicate() - if not check_files(bowtie2_index_fnames): - print >> sys.stderr, "Error: indexing HLA failed!" - sys.exit(1) - - - -################################################## -# Read simulation and alignment -################################################## - - -""" -Simulate reads from alleles with headers (>) filled with mapping information. - For an example, see hisat2_test_HLA_genotyping.py. -""" -def simulate_reads(seq_dic, # seq_dic["A"]["A*24:36N"] = "ACGTCCG ..." - base_fname, # hla, codis, cyp, or so on - allele_list, # ["A*32:29", "B*07:02:01"] - Vars, # Vars["A"]["hv326"] = ["single", 604, "C"] - Links, - simulate_interval = 1, - read_len = 100, - frag_len = 250, - perbase_errorrate = 0.0, - perbase_snprate = 0.0, - skip_fragment_regions = []): - reads_1, reads_2 = [], [] - num_pairs = [] - for allele_names in allele_list: - gene = allele_names[0].split('*')[0] - num_pairs.append([]) - - # Introduce SNPs into allele sequences - def introduce_snps(seq): - seq = list(seq) - for i in range(len(seq)): - if random.random() * 100 < perbase_snprate: - if seq[i] == 'A': - alt_bases = ['C', 'G', 'T'] - elif seq[i] == 'C': - alt_bases = ['A', 'G', 'T'] - elif seq[i] == 'G': - alt_bases = ['A', 'C', 'T'] - else: - assert seq[i] == 'T' - alt_bases = ['A', 'C', 'G'] - random.shuffle(alt_bases) - alt_base = alt_bases[0] - seq[i] = alt_base - seq = ''.join(seq) - return seq - - # Simulate reads from two alleles - def simulate_reads_impl(seq, - seq_map, - ex_seq_map, - ex_seq, - ex_desc, - simulate_interval = 1, - read_len = 100, - frag_len = 250, - perbase_errorrate = 0.0, - skip_fragment_regions = []): - # Introduce sequencing errors - def introduce_seq_err(read_seq, pos): - read_seq = list(read_seq) - for i in range(read_len): - map_pos = seq_map[pos + i] - if ex_desc[map_pos] != "": - continue - if random.random() * 100 < perbase_errorrate: - if read_seq[i] == 'A': - alt_bases = ['C', 'G', 'T'] - elif read_seq[i] == 'C': - alt_bases = ['A', 'G', 'T'] - elif read_seq[i] == 'G': - alt_bases = ['A', 'C', 'T'] - else: - assert read_seq[i] == 'T' - alt_bases = ['A', 'C', 'G'] - random.shuffle(alt_bases) - alt_base = alt_bases[0] - read_seq[i] = alt_base - read_seq = ''.join(read_seq) - return read_seq - - # Get read alignment, e.g., 260|R_483_61M5D38M23D1M_46|S|hv154,3|S|hv162,10|D|hv185,38|D|hv266 - def get_info(read_seq, pos): - info = "%d_" % (seq_map[pos] + 1) - total_match, match, sub_match = 0, 0, 0 - var_str = "" - ins_len, ins_var = 0, "" - for i in range(pos, pos + read_len): - map_i = ex_seq_map[i] - assert ex_seq[map_i] != 'D' - total_match += 1 - match += 1 - if ex_seq[map_i] == 'I': - if ins_var != "": - assert ins_var == ex_desc[map_i] - ins_var = ex_desc[map_i] - ins_len += 1 - elif ins_var != "": - if var_str != "": - var_str += ',' - var_str += ("%s|I|%s" % (sub_match, ins_var)) - ins_len, ins_var = 0, "" - sub_match = 0 - if ex_seq[map_i] != 'I': - if ex_desc[map_i] != "" or read_seq[i-pos] != ex_seq[map_i]: - if var_str != "": - var_str += ',' - var_str += ("%d|S|%s" % (sub_match, ex_desc[map_i] if ex_desc[map_i] != "" else "unknown")) - sub_match = 0 - else: - sub_match += 1 - if i + 1 < pos + read_len and ex_seq[map_i+1] == 'D': - assert match > 0 - info += ("%dM" % match) - match = 0 - del_len = 1 - while map_i + 1 + del_len < len(ex_seq): - if ex_seq[map_i + 1 + del_len] != 'D': - break - del_len += 1 - info += ("%dD" % del_len) - if var_str != "": - var_str += ',' - var_str += ("%s|D|%s" % (sub_match, ex_desc[map_i + 1])) - sub_match = 0 - assert match > 0 - info += ("%dM" % match) - assert total_match == read_len - if var_str: - info += "_" - info += var_str - return info - - comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'} - reads_1, reads_2 = [], [] - for i in range(0, len(seq) - frag_len + 1, simulate_interval): - if len(skip_fragment_regions) > 0: - skip = False - for skip_left, skip_right in skip_fragment_regions: - if i <= skip_right and i + frag_len > skip_left: - skip = True - break - if skip: - continue - - pos1 = i - seq1 = seq[pos1:pos1+read_len] - if perbase_errorrate > 0.0: - seq1 = introduce_seq_err(seq1, pos1) - info1 = get_info(seq1, pos1) - reads_1.append([seq1, info1]) - - pos2 = i + frag_len - read_len - seq2 = seq[pos2:pos2+read_len] - if perbase_errorrate > 0.0: - seq2 = introduce_seq_err(seq2, pos2) - info2 = get_info(seq2, pos2) - tmp_read_2 = reversed(seq2) - read_2 = "" - for s in tmp_read_2: - if s in comp_table: - read_2 += comp_table[s] - else: - read_2 += s - reads_2.append([read_2, info2]) - return reads_1, reads_2 - - # for each allele in a list of alleles such as ['A*32:29', 'B*07:02:01'] - for allele_name in allele_names: - allele_seq = seq_dic[gene][allele_name] - backbone_seq = seq_dic[gene]["%s*BACKBONE" % gene] - allele_ex_seq = list(backbone_seq) - allele_ex_desc = [''] * len(allele_ex_seq) - allele_seq_map = [i for i in range(len(allele_seq))] - allele_ex_seq_map = [i for i in range(len(allele_seq))] - - if perbase_snprate > 0: - HLA_seq = introduce_snps(allele_seq) - - # Extract variants included in each allele - var_ids = [] - for var_id, allele_list in Links.items(): - if allele_name in allele_list: - var_ids.append(var_id) - - def var_cmp(a, b): - assert a.startswith("hv") and b.startswith("hv") - return int(a[2:]) - int(b[2:]) - var_ids = sorted(var_ids, cmp=var_cmp) - - # Build annotated sequence for the allele w.r.t backbone sequence - add_pos = 0 - for var_id in var_ids: - var_type, var_pos, var_data = Vars[gene][var_id] - var_pos += add_pos - if var_type == "single": - allele_ex_seq[var_pos] = var_data - allele_ex_desc[var_pos] = var_id - elif var_type == "deletion": - del_len = int(var_data) - assert var_pos + del_len <= len(allele_ex_seq) - allele_ex_seq[var_pos:var_pos+del_len] = ['D'] * del_len - allele_ex_desc[var_pos:var_pos+del_len] = [var_id] * del_len - else: - assert var_type == "insertion" - ins_len = len(var_data) - allele_ex_seq = allele_ex_seq[:var_pos] + (['I'] * ins_len) + allele_ex_seq[var_pos:] - allele_ex_desc = allele_ex_desc[:var_pos] + ([var_id] * ins_len) + allele_ex_desc[var_pos:] - add_pos += ins_len - allele_ex_seq = ''.join(allele_ex_seq) - assert len(backbone_seq) + add_pos == len(allele_ex_seq) - - # Build mapping from the allele to the annotated sequence - prev_j, minus_pos = 0, 0 - for i in range(len(allele_seq)): - for j in range(prev_j, len(allele_ex_seq)): - if allele_ex_seq[j] != 'D': - if allele_ex_seq[j] == 'I': - minus_pos += 1 - break - allele_seq_map[i] = j - minus_pos - allele_ex_seq_map[i] = j - prev_j = j + 1 - - # DK - debugging purposes - """ - for t in range(0, len(allele_ex_seq), 100): - print t, allele_ex_seq[t:t+100] - print t, '-'.join(allele_ex_desc[t:t+100]) - print t, allele_seq_map[t:t+100] - print "allele_seq length:", len(allele_seq) - print len(allele_ex_seq), "vs.", len(seq_dic[gene]["A*BACKBONE"]), "vs.", len(allele_seq_map) - print allele_ex_seq[1943:1946] - print allele_ex_desc[1943:1946] - sys.exit(1) - """ - - tmp_reads_1, tmp_reads_2 = simulate_reads_impl(allele_seq, - allele_seq_map, - allele_ex_seq_map, - allele_ex_seq, - allele_ex_desc, - simulate_interval, - read_len, - frag_len, - perbase_errorrate, - skip_fragment_regions) - reads_1 += tmp_reads_1 - reads_2 += tmp_reads_2 - num_pairs[-1].append(len(tmp_reads_1)) - - # Write reads into a FASTA file - def write_reads(reads, idx): - read_file = open('%s_input_%d.fa' % (base_fname, idx), 'w') - for read_i in range(len(reads)): - query_name = "%d|%s_%s" % (read_i + 1, "LR"[idx-1], reads[read_i][1]) - if len(query_name) > 254: - query_name = query_name[:254] - print >> read_file, ">%s" % query_name - print >> read_file, reads[read_i][0] - read_file.close() - write_reads(reads_1, 1) - write_reads(reads_2, 2) - - return num_pairs - - -""" -Align reads, and sort the alignments into a BAM file -""" -def align_reads(aligner, - simulation, - index_name, - index_type, - base_fname, - read_fname, - fastq, - threads, - out_fname, - verbose): - if aligner == "hisat2": - aligner_cmd = [aligner, "--mm"] - if not simulation: - aligner_cmd += ["--no-unal"] - DNA = True - if DNA: - aligner_cmd += ["--no-spliced-alignment"] # no spliced alignment - aligner_cmd += ["-X", "1000"] # max fragment length - if index_type == "linear": - aligner_cmd += ["-k", "10"] - else: - aligner_cmd += ["--max-altstried", "64"] - aligner_cmd += ["--haplotype"] - if base_fname == "codis": - aligner_cmd += ["--enable-codis"] - aligner_cmd += ["--no-softclip"] - - elif aligner == "bowtie2": - aligner_cmd = [aligner, - "--no-unal", - "-k", "10"] - else: - assert False - aligner_cmd += ["-x", index_name] - assert len(read_fname) in [1,2] - aligner_cmd += ["-p", str(threads)] - if not fastq: - aligner_cmd += ["-f"] - if len(read_fname) == 1: - aligner_cmd += ["-U", read_fname[0]] - else: - aligner_cmd += ["-1", "%s" % read_fname[0], - "-2", "%s" % read_fname[1]] - - if verbose >= 1: - print >> sys.stderr, ' '.join(aligner_cmd) - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - sambam_cmd = ["samtools", - "view", - "-bS", - "-"] - sambam_proc = subprocess.Popen(sambam_cmd, - stdin=align_proc.stdout, - stdout=open(out_fname + ".unsorted", 'w'), - stderr=open("/dev/null", 'w')) - sambam_proc.communicate() - if index_type == "graph": - bamsort_cmd = ["samtools", - "sort", - out_fname + ".unsorted", - "-o", out_fname] - bamsort_proc = subprocess.Popen(bamsort_cmd, - stderr=open("/dev/null", 'w')) - bamsort_proc.communicate() - - bamindex_cmd = ["samtools", - "index", - out_fname] - bamindex_proc = subprocess.Popen(bamindex_cmd, - stderr=open("/dev/null", 'w')) - bamindex_proc.communicate() - - os.system("rm %s" % (out_fname + ".unsorted")) - - -""" -HISAT-genotype's mpileup -""" -def get_mpileup(alignview_cmd, - ref_seq, - base_locus, - vars, - allow_discordant): - ref_seq_len = len(ref_seq) - mpileup = [] - for i in range(ref_seq_len): - mpileup.append([[], {}]) - - proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - prev_pos = -1 - cigar_re = re.compile('\d+\w') - for line in proc.stdout: - line = line.strip() - cols = line.split() - read_id, flag, _, pos, _, cigar_str = cols[:6] - read_seq = cols[9] - flag, pos = int(flag), int(pos) - # Unalined? - if flag & 0x4 != 0: - continue - pos -= (base_locus + 1) - if pos < 0: - continue - - # Concordantly mapped? - if flag & 0x2 != 0: - concordant = True - else: - concordant = False - - if not allow_discordant and not concordant: - continue - - read_pos, left_pos = 0, pos - right_pos = left_pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MD": - for j in range(length): - if cigar_op == 'M': - read_nt = read_seq[read_pos + j] - else: - read_nt = 'D' - if right_pos + j < len(mpileup): - if read_nt not in mpileup[right_pos + j][1]: - mpileup[right_pos + j][1][read_nt] = 1 - else: - mpileup[right_pos + j][1][read_nt] += 1 - - if cigar_op in "MND": - right_pos += length - - if cigar_op in "MIS": - read_pos += length - - # Choose representative bases or 'D' - for i in range(len(mpileup)): - nt_dic = mpileup[i][1] - num_nt = sum(nt_dic.values()) - nt_set = [] - if num_nt >= 20: - for nt, count in nt_dic.items(): - if nt not in "ACGT": - continue - if count >= num_nt * 0.2 or count >= 7: - nt_set.append(nt) - mpileup[i][0] = nt_set - - # Sort variants - var_list = [[] for i in range(len(mpileup))] - for var_id, value in vars.items(): - var_type, var_pos, var_data = value - assert var_pos < len(var_list) - var_list[var_pos].append([var_id, var_type, var_data]) - - # Assign known or unknown variants - skip_i, prev_del_var_id = -1, "" - for i in range(len(mpileup)): - nt_dic = mpileup[i][1] - ref_nt = ref_seq[i] - new_nt_dic = {} - for nt, count in nt_dic.items(): - var_id = "" - if nt == 'D': - if i <= skip_i: - assert prev_del_var_id != "" - var_id = prev_del_var_id - else: - for var_id_, var_type, var_data in var_list[i]: - if var_type != "deletion": - continue - del_len = int(var_data) - del_exist = True - for j in range(i + 1, i + del_len): - assert j < len(mpileup) - nt_dic2 = mpileup[j][1] - if 'D' not in nt_dic2: - del_exist = False - break - if del_exist: - var_id = var_id_ - prev_del_var_id = var_id - skip_i = i + del_len - 1 - break - elif nt != 'N' and nt != ref_nt: - assert nt in "ACGT" - id = "unknown" - for var_id_, var_type, var_data in var_list[i]: - if var_type != "single": - continue - if nt == var_data: - var_id = var_id_ - break - new_nt_dic[nt] = [count, var_id] - - mpileup[i][1] = new_nt_dic - - return mpileup - - -""" -""" -def get_pair_interdist(alignview_cmd, - simulation, - verbose): - bamview_proc = subprocess.Popen(alignview_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting - alignview_proc = subprocess.Popen(sort_read_cmd, - stdin=bamview_proc.stdout, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - dist_list = [] - prev_read_id = None - cigar_re = re.compile('\d+\w') - reads = [] - for line in alignview_proc.stdout: - line = line.strip() - cols = line.split() - read_id, flag, _, pos, _, cigar_str = cols[:6] - read_seq = cols[9] - flag, pos = int(flag), int(pos) - # Unalined? - if flag & 0x4 != 0: - continue - - if simulation: - read_id = read_id.split('|')[0] - - # Concordantly mapped? - if flag & 0x2 != 0: - concordant = True - else: - concordant = False - - NH, YT = sys.maxint, "" - for i in range(11, len(cols)): - col = cols[i] - if col.startswith("NH"): - NH = int(col[5:]) - elif col.startswith("YT"): - YT = col[5:] - if NH > 1 or YT != "CP": - continue - - if prev_read_id != None and read_id != prev_read_id: - if len(reads) == 2: - left1, right1 = reads[0] - left2, right2 = reads[1] - if left1 <= left2: - dist = left2 - right1 - 1 - else: - dist = left1 - right2 - 1 - dist_list.append(dist) - reads = [] - - left_pos = right_pos = pos - cigars = cigar_re.findall(cigar_str) - cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars] - for i in range(len(cigars)): - cigar_op, length = cigars[i] - if cigar_op in "MND": - right_pos += length - - reads.append([left_pos, right_pos - 1]) - - prev_read_id = read_id - - dist_list = sorted(dist_list) - dist_avg = sum(dist_list) / max(1, len(dist_list)) - if len(dist_list) > 0: - dist_median = dist_list[len(dist_list)/2] - else: - dist_median = -1 - - return dist_median - - -################################################## -# Statistical routines -################################################## - - -""" -""" -def prob_diff(prob1, prob2): - diff = 0.0 - for allele in prob1.keys(): - if allele in prob2: - diff += abs(prob1[allele] - prob2[allele]) - else: - diff += prob1[allele] - return diff - - -""" -""" -def Gene_prob_cmp(a, b): - if a[1] != b[1]: - if a[1] < b[1]: - return 1 - else: - return -1 - assert a[0] != b[0] - if a[0] < b[0]: - return -1 - else: - return 1 - - -""" -""" -def single_abundance(Gene_cmpt, - remove_low_abundance_allele = False, - Gene_length = {}): - def normalize(prob): - total = sum(prob.values()) - for allele, mass in prob.items(): - prob[allele] = mass / total - - def normalize_len(prob, length): - total = 0 - for allele, mass in prob.items(): - assert allele in length - total += (mass / length[allele]) - for allele, mass in prob.items(): - assert allele in length - prob[allele] = mass / length[allele] / total - - Gene_prob, Gene_prob_next = {}, {} - for cmpt, count in Gene_cmpt.items(): - alleles = cmpt.split('-') - for allele in alleles: - if allele not in Gene_prob: - Gene_prob[allele] = 0.0 - Gene_prob[allele] += (float(count) / len(alleles)) - if len(Gene_length) > 0: - normalize_len(Gene_prob, Gene_length) - else: - normalize(Gene_prob) - - def next_prob(Gene_cmpt, Gene_prob, Gene_length): - Gene_prob_next = {} - for cmpt, count in Gene_cmpt.items(): - alleles = cmpt.split('-') - alleles_prob = 0.0 - for allele in alleles: - if allele not in Gene_prob: - continue - alleles_prob += Gene_prob[allele] - if alleles_prob <= 0.0: - continue - for allele in alleles: - if allele not in Gene_prob: - continue - if allele not in Gene_prob_next: - Gene_prob_next[allele] = 0.0 - Gene_prob_next[allele] += (float(count) * Gene_prob[allele] / alleles_prob) - if len(Gene_length) > 0: - normalize_len(Gene_prob_next, Gene_length) - else: - normalize(Gene_prob_next) - return Gene_prob_next - - def select_alleles(Gene_prob): - if len(Gene_prob) == 0: - return Gene_prob - Gene_prob2 = {} - max_prob = max(Gene_prob.values()) - for allele, prob in Gene_prob.items(): - if prob >= max_prob / 10.0: - Gene_prob2[allele] = prob - return Gene_prob2 - - fast_EM = True - diff, iter = 1.0, 0 - while diff > 0.0001 and iter < 1000: - Gene_prob_next = next_prob(Gene_cmpt, Gene_prob, Gene_length) - if fast_EM: - # Accelerated version of EM - SQUAREM iteration - # Varadhan, R. & Roland, C. Scand. J. Stat. 35, 335-353 (2008) - # Also, this algorithm is used in Sailfish - http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html - Gene_prob_next2 = next_prob(Gene_cmpt, Gene_prob_next, Gene_length) - sum_squared_r, sum_squared_v = 0.0, 0.0 - p_r, p_v = {}, {} - for a in Gene_prob.keys(): - p_r[a] = Gene_prob_next[a] - Gene_prob[a] - sum_squared_r += (p_r[a] * p_r[a]) - p_v[a] = Gene_prob_next2[a] - Gene_prob_next[a] - p_r[a] - sum_squared_v += (p_v[a] * p_v[a]) - if sum_squared_v > 0.0: - gamma = -math.sqrt(sum_squared_r / sum_squared_v) - for a in Gene_prob.keys(): - Gene_prob_next2[a] = max(0.0, Gene_prob[a] - 2 * gamma * p_r[a] + gamma * gamma * p_v[a]); - Gene_prob_next = next_prob(Gene_cmpt, Gene_prob_next2, Gene_length) - - diff = prob_diff(Gene_prob, Gene_prob_next) - Gene_prob = Gene_prob_next - - # Accelerate convergence - if iter >= 10 and remove_low_abundance_allele: - Gene_prob = select_alleles(Gene_prob) - - # DK - debugging purposes - if iter % 10 == 0 and False: - print >> sys.stderr, "iter", iter - for allele, prob in Gene_prob.items(): - if prob >= 0.001: - print >> sys.stderr, "\t", iter, allele, prob - - iter += 1 - - if remove_low_abundance_allele: - Gene_prob = select_alleles(Gene_prob) - if len(Gene_length) > 0: - normalize_len(Gene_prob, Gene_length) - else: - normalize(Gene_prob) - Gene_prob = [[allele, prob] for allele, prob in Gene_prob.items()] - Gene_prob = sorted(Gene_prob, cmp=Gene_prob_cmp) - return Gene_prob - - -################################################## -# Realignment, alternative alignments -################################################## - - -""" -Identify alternative haplotypes - insertions are not considered... - - INPUT: see the function's parameters below - OUPUT: 529-hv8-hv22-606: set(['529-hv13-570', '529-hv4-hv18-590', '529-hv2-hv16-582']) - 529-hv3-hv17-598: set(['529-hv6-hv21-hv26-610']) -""" -def get_alternatives(ref_seq, # GATAACTAGATACATGAGATAGATTTGATAGATAGATAGATACATACATACATACATACATACAGGATAGATAACTAGG... - allele_vars, # {'VWA*20(22)': ['hv231', 'hv245'], "VWA*16(18')": ['hv235', 'hv250', 'hv256'], ...} - Vars, # {'hv241': ['deletion', 529, '52'], 'hv240': ['deletion', 529, '48'], ... } - Var_list, # [[529, 'hv230'], [529, 'hv231'], [529, 'hv232'], [529, 'hv233'], ...] - verbose): - haplotype_alts_left, haplotype_alts_right = {}, {} - second_order_haplotypes = set() - for allele_name, vars in allele_vars.items(): - for v in range(len(vars) - 1): - ht = vars[v] + "-" + vars[v+1] - second_order_haplotypes.add(ht) - - rev_Var_list = [] - for _, var_id in Var_list: - var_type, var_pos, var_data = Vars[var_id] - if var_type == "deletion": - var_pos = var_pos + int(var_data) - 1 - elif var_type == "insertion": - var_pos += 1 - rev_Var_list.append([var_pos, var_id]) - rev_Var_list = sorted(rev_Var_list, cmp=lambda a, b: a[0] - b[0]) - - def nextbases(haplotype, - left = True, - exclude_list = []): - if left: - pos = int(haplotype[0]) - 1 - else: - pos = haplotype[-1] + 1 - if pos < 0 or pos >= len(ref_seq): - return [] - - if left: - bases = [[[pos] + haplotype[1:], ref_seq[pos]]] - prev_id = None - if len(haplotype) > 2: - prev_id = haplotype[1] - - var_i = lower_bound(rev_Var_list, pos + 1) - for var_j in reversed(range(0, var_i)): - _, var_id = rev_Var_list[var_j] - var_type, var_pos, var_data = Vars[var_id] - if var_type == "deletion": - if var_pos == 0: - continue - var_pos = var_pos + int(var_data) - 1 - if var_pos > pos: - continue - if var_pos < pos: - break - if var_id in exclude_list: - continue - if prev_id: - second_ht = var_id + "-" + prev_id - if second_ht not in second_order_haplotypes: - continue - - if var_type == "single": - bases.append([[var_pos, var_id] + haplotype[1:], var_data]) - elif var_type == "deletion": - bases2 = nextbases([var_pos - int(var_data) + 1, var_id] + haplotype[1:], - left, - exclude_list) - bases += bases2 - else: - assert var_type == "insertion" - else: - bases = [[haplotype[:-1] + [pos], ref_seq[pos]]] - prev_id = None - if len(haplotype) > 2: - prev_id = haplotype[-2] - - var_i = lower_bound(Var_list, pos) - for var_j in range(var_i, len(Var_list)): - _, var_id = Var_list[var_j] - var_type, var_pos, var_data = Vars[var_id] - if var_pos < pos: - continue - if var_pos > pos: - break - if var_id in exclude_list: - continue - if prev_id: - second_ht = prev_id + "-" + var_id - if second_ht not in second_order_haplotypes: - continue - - if var_type == "single": - bases.append([haplotype[:-1] + [var_id, var_pos], var_data]) - elif var_type == "deletion": - bases2 = nextbases(haplotype[:-1] + [var_id, var_pos + int(var_data) - 1], - left, - exclude_list) - bases += bases2 - else: - assert var_type == "insertion" - - return bases - - def get_haplotype_seq(haplotype): - seq = "" - pos = int(haplotype[0]) - for i in range(1, len(haplotype) - 1): - var_id = haplotype[i] - var_type, var_pos, var_data = Vars[var_id] - if pos < var_pos: - seq += ref_seq[pos:var_pos] - if var_type == "single": - seq += var_data - pos = var_pos + 1 - elif var_type == "deletion": - pos = var_pos + int(var_data) - else: - assert var_type == "insertion" - seq += var_data - pos = var_pos - - last_pos = int(haplotype[-1]) + 1 - assert pos <= last_pos - if pos < last_pos: - seq += ref_seq[pos:last_pos] - return seq - - def get_alternative_recur(var_orig_id, - haplotype, - haplotype_alt, - left = True, - dep = 0): - bases1 = nextbases(haplotype, - left) - bases2 = nextbases(haplotype_alt, - left, - [var_orig_id]) # exclude - - found = False - for base1 in bases1: - next_haplotype, bp = base1 - for base2 in bases2: - next_haplotype_alt, bp2 = base2 - if bp != bp2: - continue - - # Todo: implement a routine to handle haplotypes ending with the same coordinate - if left: - left1, left2 = int(next_haplotype[0]), int(next_haplotype_alt[0]) - if left1 == left2: - continue - else: - right1, right2 = int(next_haplotype[-1]), int(next_haplotype_alt[-1]) - if right1 == right2: - continue - - found = True - get_alternative_recur(var_orig_id, - next_haplotype, - next_haplotype_alt, - left, - dep + 1) - - if dep > 0: - if not found: - def to_haplotype_str(haplotype): - if len(haplotype) <= 2: - haplotype = "%d-%d" % (haplotype[0], haplotype[1]) - else: - haplotype = "%d-%s-%d" % (haplotype[0], '-'.join(haplotype[1:-1]), haplotype[-1]) - return haplotype - - haplotype, haplotype_alt = to_haplotype_str(haplotype), to_haplotype_str(haplotype_alt) - haplotype_alts = haplotype_alts_left if left else haplotype_alts_right - if haplotype not in haplotype_alts: - haplotype_alts[haplotype] = set() - haplotype_alts[haplotype].add(haplotype_alt) - - if haplotype_alt not in haplotype_alts: - haplotype_alts[haplotype_alt] = set() - haplotype_alts[haplotype_alt].add(haplotype) - - # Search alternative haplotypes in both left and right directions - for var_i in range(len(Var_list)): - _, var_id = Var_list[var_i] - var_type, var_pos, var_data = Vars[var_id] - if var_pos == 0: - continue - if var_type != "deletion": - continue - del_len = int(var_data) - if var_pos + del_len >= len(ref_seq): - continue - - # Left direction - get_alternative_recur(var_id, - [var_pos, var_id, var_pos + del_len - 1], - [var_pos + del_len, var_pos + del_len - 1]) - - # Right direction - get_alternative_recur(var_id, - [var_pos, var_id, var_pos + del_len - 1], - [var_pos, var_pos - 1], - False) - - # Print alternative haplotypes / Sanity check - def print_haplotype_alts(haplotype_alts): - for haplotype, haplotype_set in haplotype_alts.items(): - if verbose: print "\t%s:" % haplotype, haplotype_set - haplotype_seq = get_haplotype_seq(haplotype.split('-')) - for haplotype_alt in haplotype_set: - haplotype_alt_seq = get_haplotype_seq(haplotype_alt.split('-')) - assert haplotype_seq == haplotype_alt_seq - - if verbose: print "number of left haplotypes:", len(haplotype_alts_left) - print_haplotype_alts(haplotype_alts_left) - if verbose: print "number of right haplotypes:", len(haplotype_alts_right) - print_haplotype_alts(haplotype_alts_right) - - return haplotype_alts_left, haplotype_alts_right - - -""" -Identify ambigious differences that may account for other alleles, - given a list of differences (cmp_list) between a read and a potential allele -""" -def identify_ambigious_diffs(ref_seq, - Vars, - Alts_left, - Alts_right, - Alts_left_list, - Alts_right_list, - cmp_list, - verbose, - debug = False): - cmp_left, cmp_right = 0, len(cmp_list) - 1 - left, right = cmp_list[0][1], cmp_list[-1][1] + cmp_list[-1][2] - 1 - left_alt_set, right_alt_set = set(), set() - - def get_haplotype_and_seq(cmp_list): - ht, seq = [], "" - for i in range(len(cmp_list)): - cmp_i = cmp_list[i] - type, pos, length = cmp_i[:3] - if len(cmp_i) <= 3: - var_id = "" - else: - var_id = cmp_i[3] - if type == "match": - seq += ref_seq[pos:pos+length] - elif type == "mismatch": - seq += ref_seq[pos] - elif type == "insertion": - None - # seq += data - else: - assert type == "deletion" - - if var_id != "" and var_id != "unknown": - ht.append(var_id) - return ht, seq - - # Left direction - found = False - for i in reversed(range(len(cmp_list))): - i_found = False - cmp_i = cmp_list[i] - type, cur_left, length = cmp_i[:3] - var_id = cmp_i[3] if type in ["mismatch", "deletion"] else "" - - # DK - debugging purposes - if type in ["mismatch", "deletion", "insertion"]: - if not var_id.startswith("hv"): - continue - - if type in ["match", "deletion"]: - cur_right = cur_left + length - 1 - else: - cur_right = cur_left - - cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[:i+1]) - if len(cur_ht) == 0: - cur_ht_str = str(left) - else: - cur_ht_str = "%d-%s" % (left, '-'.join(cur_ht)) - ht_i = lower_bound(Alts_left_list, cur_right + 1) - for ht_j in reversed(range(0, min(ht_i + 1, len(Alts_left_list)))): - ht_pos, ht = Alts_left_list[ht_j] - if ht_pos < cur_left: - break - if ht_pos > cur_right: - continue - - if len(cur_ht) > 0: - if ht.find('-'.join(cur_ht)) == -1: - continue - - ht = ht.split('-')[:-1] - if len(cur_ht) + 1 == len(ht): - ht_pos = int(ht[0]) - if left < ht_pos: - continue - else: - var_id2 = ht[len(ht) - len(cur_ht) - 1] - ht_type, ht_pos, ht_data = Vars[var_id2] - if ht_type == "deletion": - ht_pos = ht_pos + int(ht_data) - 1 - if left <= ht_pos: - continue - - i_found = True - if debug: - print cmp_list[:i+1] - print "\t", cur_ht, "vs", Alts_left_list[ht_j] - - _, rep_ht = Alts_left_list[ht_j] - - if debug: - print "DK1:", cmp_i, cmp_list - print "DK2:", rep_ht, Alts_left[rep_ht] - print "DK3:", left, right - - for alt_ht_str in Alts_left[rep_ht]: - alt_ht = alt_ht_str.split('-') - alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1]) - assert alt_ht_right <= cur_right - seq_pos = cur_right - alt_ht_right - cur_pos = alt_ht_right - part_alt_ht = [] - alt_ht = alt_ht[1:-1] - for var_id_ in reversed(alt_ht): - var_type_, var_pos_, var_data_ = Vars[var_id_] - if var_type_ == "deletion": - del_len = int(var_data_) - var_pos_ = var_pos_ + del_len - 1 - assert var_pos_ <= cur_pos - next_seq_pos = seq_pos + (cur_pos - var_pos_) - if next_seq_pos >= len(cur_seq): - break - if var_type_ == "single": - next_seq_pos += 1 - next_cur_pos = var_pos_ - 1 - elif var_type_ == "deletion": - next_cur_pos = var_pos_ - del_len - else: - assert var_type_ == "insertion" - assert False - - part_alt_ht.insert(0, var_id_) - if next_seq_pos >= len(cur_seq): - break - seq_pos, cur_pos = next_seq_pos, next_cur_pos - - if len(part_alt_ht) > 0: - seq_left = len(cur_seq) - seq_pos - 1 - part_alt_ht_str = "" - if found: - var_id_list = [] - for j in range(i + 1, cmp_left): - cmp_j = cmp_list[j] - if cmp_j[0] in ["mismatch", "deletion", "insertion"]: - var_id_ = cmp_j[3] - if var_id_.startswith("hv"): - var_id_list.append(var_id_) - if len(var_id_list) > 0: - part_alt_ht_str = '-' + '-'.join(var_id_list) - part_alt_ht_str = ("%d-%s" % (cur_pos - seq_left, '-'.join(part_alt_ht))) + part_alt_ht_str - left_alt_set.add(part_alt_ht_str) - - if debug: - print "\t\t", cur_left, alt_ht_str - - if i_found: - if not found: - cmp_left = i + 1 - left_alt_set.add(cur_ht_str) - found = True - - if not found: - left_alt_set.add(str(left)) - - # Right direction - found = False - for i in range(0, len(cmp_list)): - i_found = False - cmp_i = cmp_list[i] - type, cur_left, length = cmp_i[:3] - var_id = cmp_i[3] if type in ["mismatch", "deletion"] else "" - - # DK - debugging purpose - if type in ["mismatch", "deletion", "insertion"]: - if not var_id.startswith("hv"): - continue - - if type in ["match", "deletion"]: - cur_right = cur_left + length - 1 - else: - cur_right = cur_left - - cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[i:]) - if len(cur_ht) == 0: - cur_ht_str = str(right) - else: - cur_ht_str = "%s-%d" % ('-'.join(cur_ht), right) - - ht_i = lower_bound(Alts_right_list, cur_left) - for ht_j in range(ht_i, len(Alts_right_list)): - ht_pos, ht = Alts_right_list[ht_j] - if ht_pos > cur_right: - break - if ht_pos < cur_left: - continue - - if len(cur_ht) > 0: - if ht.find('-'.join(cur_ht)) == -1: - continue - - ht = ht.split('-')[1:] - if len(cur_ht) + 1 == len(ht): - ht_pos = int(ht[-1]) - if right > ht_pos: - continue - else: - var_id2 = ht[len(cur_ht)] - var_type, ht_pos, _ = Vars[var_id2] - if right >= ht_pos: - continue - - i_found = True - _, rep_ht = Alts_right_list[ht_j] - - if debug: - print "DK1:", cmp_i, cmp_list - print "DK2:", rep_ht, Alts_right[rep_ht] - print "DK3:", left, right, ht_pos - - for alt_ht_str in Alts_right[rep_ht]: - alt_ht = alt_ht_str.split('-') - alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1]) - assert cur_left <= alt_ht_left - seq_pos = alt_ht_left - cur_left - cur_pos = alt_ht_left - part_alt_ht = [] - alt_ht = alt_ht[1:-1] - for var_id_ in alt_ht: - var_type_, var_pos_, var_data_ = Vars[var_id_] - assert var_pos_ >= cur_pos - next_seq_pos = seq_pos + (var_pos_ - cur_pos) - if next_seq_pos >= len(cur_seq): - break - - if var_type_ == "single": - next_seq_pos += 1 - next_cur_pos = var_pos_ + 1 - elif var_type_ == "deletion": - next_cur_pos = var_pos_ + int(var_data_) - else: - assert var_type_ == "insertion" - assert False - - part_alt_ht.append(var_id_) - if next_seq_pos >= len(cur_seq): - break - seq_pos, cur_pos = next_seq_pos, next_cur_pos - - if len(part_alt_ht) > 0: - seq_left = len(cur_seq) - seq_pos - 1 - assert seq_left >= 0 - part_alt_ht_str = "" - if found: - var_id_list = [] - for j in range(cmp_right + 1, i): - cmp_j = cmp_list[j] - if cmp_j[0] in ["mismatch", "deletion", "insertion"]: - var_id_ = cmp_j[3] - if var_id_.startswith("hv"): - var_id_list.append(var_id_) - if len(var_id_list) > 0: - part_alt_ht_str = '-'.join(var_id_list) + '-' - part_alt_ht_str += ("%s-%d" % ('-'.join(part_alt_ht), cur_pos + seq_left)) - right_alt_set.add(part_alt_ht_str) - - if i_found: - if not found: - cmp_right = i - 1 - right_alt_set.add(cur_ht_str) - found = True - - if not found: - right_alt_set.add(str(right)) - - if cmp_right < cmp_left: - cmp_left = 0 - left_alt_set = set([str(left)]) - - # Sanity check - ht_set_ = set() - for ht in left_alt_set: - ht = '-'.join(ht.split('-')[1:]) - if ht == "": - continue - if ht in ht_set_: - print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_ - - # DK - debugging purposes - print "DK: cmp_list_range: [%d, %d]" % (cmp_left, cmp_right) - print "DK: cmp_list:", cmp_list - print "DK: left_alt_set:", left_alt_set, "right_alt_set:", right_alt_set - - assert False - ht_set_.add(ht) - for ht in right_alt_set: - ht = '-'.join(ht.split('-')[:-1]) - if ht == "": - continue - if ht in ht_set_: - print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_ - assert False - ht_set_.add(ht) - - if debug: - print "cmp_list_range: [%d, %d]" % (cmp_left, cmp_right) - print "left alt set:", left_alt_set - print "right alt set:", right_alt_set - - return cmp_left, cmp_right, list(left_alt_set), list(right_alt_set) - diff --git a/hisatgenotype_scripts/compare_HLA.py b/hisatgenotype_scripts/compare_HLA.py deleted file mode 100755 index d32b593c..00000000 --- a/hisatgenotype_scripts/compare_HLA.py +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from argparse import ArgumentParser, FileType -use_message = ''' -''' - -def compare(hisatgenotype_fname, - utsw_fname): - hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"] - for level in [1,2]: - print >> sys.stderr, "Level: %d" % level - def read_hla_types(fname): - hla, hla_orig = {}, {} - for line in open(fname): - line = line.strip() - fields = line.split('\t') - if len(fields) == 2: - sample, allele = fields - abundance, vars_covered = 0.0, "" - elif len(fields) == 3: - sample, allele, abundance = fields - vars_covered = "" - else: - assert len(fields) == 4 - sample, allele, abundance, vars_covered = fields - # sample = sample.split('_')[0] - abundance = float(abundance) - if sample not in hla: - hla[sample] = {} - hla_orig[sample] = {} - gene, allele = allele.split('*') - if gene not in hla[sample]: - hla[sample][gene] = [] - hla_orig[sample][gene] = [] - hla_orig[sample][gene].append([allele, abundance]) - - if level == 1: - allele = allele.split(':')[0] - else: - assert level == 2 - allele = ':'.join(allele.split(':')[:2]) - - found = False - for i in range(len(hla[sample][gene])): - cmp_allele, cmp_abundance = hla[sample][gene][i] - if level == 1 or allele.find(':') == -1: - one = two = allele - cmp_one = cmp_two = cmp_allele - else: - one, two = allele.split(':') - cmp_one, cmp_two = cmp_allele.split(':') - if one == cmp_one and two == cmp_two: - found = True - hla[sample][gene][i][1] = cmp_abundance + abundance - break - - if not found: - hla[sample][gene].append([allele, abundance]) - - for sample_hla in hla.values(): - for gene, allele_list in sample_hla.items(): - sample_hla[gene] = sorted(allele_list, key=lambda a: a[1], reverse=True) - - return hla, hla_orig - - hla1, hla1_orig = read_hla_types(hisatgenotype_fname) - hla2, hla2_orig = read_hla_types(utsw_fname) - - for gene in hla_list: - count, count_10 = [0, 0, 0], [0, 0, 0] - print >> sys.stderr, "\t%s" % gene - for sample in hla2.keys(): - if sample not in hla1: - continue - hla1_sample = hla1[sample] - hla2_sample = hla2[sample] - if gene not in hla1_sample or gene not in hla2_sample: - continue - hla1_gene = hla1_sample[gene] - hla2_gene = hla2_sample[gene] - num_match, num_match_10 = 0, 0 - for hla2_allele, _ in hla2_gene: - hla2_allele = hla2_allele.split(':') - for allele_idx in range(len(hla1_gene)): - hla1_allele = hla1_gene[allele_idx][0] - hla1_allele = hla1_allele.split(':') - equal = True - for i in range(min(len(hla1_allele), len(hla2_allele), level)): - hla1_num = hla1_allele[i] - hla2_num = hla2_allele[i] - if hla1_num != hla2_num: - equal = False - break - - if equal: - if allele_idx < 2: - num_match += 1 - if len(hla2_gene) == 1: - num_match += 1 - num_match_10 += 1 - if len(hla2_gene) == 1: - num_match_10 += 1 - break - - # DK - for debugging purposes - # """ - # if gene in ["A", "B", "C", "DQA1", "DQB1", "DRB1"] and num_match < 2: - if level == 3 and gene in ["B"] and num_match < 2: - print sample - print "\t", hla1_gene, "orig:", hla1_orig[sample][gene] - print "\t", hla2_gene, "orig:", hla2_orig[sample][gene] - # sys.exit(1) - # """ - - # DK - debugging purposes - if num_match >= len(count) or num_match_10 >= len(count_10): - print sample, num_match, num_match_10 - - assert num_match < len(count) and num_match_10 < len(count_10) - count[num_match] += 1 - count_10[num_match_10] += 1 - - if sum(count) <= 0: - continue - - print >> sys.stderr, "\t\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0) - print >> sys.stderr, "\t\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0) - - -if __name__ == "__main__": - parser = ArgumentParser( - description='Compare HISAT-genotype and Utsw HLA typing results') - parser.add_argument('hisatgenotype_fname', - nargs='?', - type=str, - help='hisatgenotype file name (e.g. cp_hla.txt)') - parser.add_argument('utsw_fname', - nargs='?', - type=str, - help='utsw file name (e.g. utsw_caapa_hla.txt)') - - args = parser.parse_args() - - compare(args.hisatgenotype_fname, - args.utsw_fname) - diff --git a/hisatgenotype_scripts/compare_HLA_Omixon.py b/hisatgenotype_scripts/compare_HLA_Omixon.py deleted file mode 100755 index ad79c19e..00000000 --- a/hisatgenotype_scripts/compare_HLA_Omixon.py +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env python - -import sys, os -from argparse import ArgumentParser, FileType -use_message = ''' -''' - -def compare(hisatgenotype_fname, omixon_fname): - hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"] - - # Read HISAT-genotype predicted HLA alleles for the CAAPA genomes - hisat_hla = {} - for line in open(hisatgenotype_fname): - line = line.strip() - fields = line.split('\t') - if len(fields) == 2: - sample, allele = fields - abundance, vars_covered = 0.0, "" - elif len(fields) == 3: - sample, allele, abundance = fields - vars_covered = "" - else: - assert len(fields) == 4 - sample, allele, abundance, vars_covered = fields - abundance = float(abundance) - if sample not in hisat_hla: - hisat_hla[sample] = {} - gene, allele = allele.split('*') - if gene not in hisat_hla[sample]: - hisat_hla[sample][gene] = [] - hisat_hla[sample][gene].append([allele, abundance]) - - # Read Omixon predicted HLA alleles for the CAAPA genomes - omixon_hla = {} - for line in open(omixon_fname): - line = line.strip() - sample, allele1, allele2 = line.split('\t') - gene1, allele1 = allele1.split('*') - gene2, allele2 = allele2.split('*') - - assert gene1 == gene2 - if sample not in omixon_hla: - omixon_hla[sample] = {} - if gene1 not in omixon_hla[sample]: - omixon_hla[sample][gene1] = [] - - if len(omixon_hla[sample][gene1]) >= 2: - continue - - omixon_hla[sample][gene1].append(allele1) - omixon_hla[sample][gene1].append(allele2) - - for gene in hla_list: - count, count_10 = [0, 0, 0], [0, 0, 0] - print >> sys.stderr, gene - for sample in omixon_hla.keys(): - if sample not in hisat_hla: - continue - hisat_sample = hisat_hla[sample] - omixon_sample = omixon_hla[sample] - if gene not in omixon_sample or gene not in hisat_sample: - continue - hisat_gene = hisat_sample[gene] - omixon_gene = omixon_sample[gene] - num_match, num_match_10 = 0, 0 - for omixon_allele in omixon_gene: - omixon_allele = omixon_allele.split(':') - for hisat_allele_idx in range(len(hisat_gene)): - hisat_allele = hisat_gene[hisat_allele_idx] - hisat_allele = hisat_allele[0].split(':') - equal = True - for i in range(min(len(omixon_allele), len(hisat_allele), 2)): - omixon_num = omixon_allele[i] - hisat_num = hisat_allele[i] - """ - if not omixon_num[-1].isdigit(): - omixon_num = omixon_num[:-1] - if not hisat_num[-1].isdigit(): - hisat_num = hisat_num[:-1] - if int(hisat_num) != int(omixon_num): - equal = False - break - """ - if hisat_num != omixon_num: - equal = False - break - if equal: - if hisat_allele_idx < 2: - num_match += 1 - num_match_10 += 1 - break - - # DK - for debugging purposes - """ - if gene in ["A", "B", "C", "DQA1", "DQB1", "DRB1"] and num_match < 2: - print sample - print "\t", omixon_gene - print "\t", hisat_gene - # sys.exit(1) - """ - - assert num_match < len(count) - count[num_match] += 1 - count_10[num_match_10] += 1 - - if sum(count) <= 0: - continue - - print >> sys.stderr, "\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0) - print >> sys.stderr, "\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0) - - -if __name__ == "__main__": - parser = ArgumentParser( - description='Compare HISAT-genotype and Omixon HLA typing results') - parser.add_argument('hisatgenotype_fname', - nargs='?', - type=str, - help='hisatgenotype file name (e.g. cp_hla.txt)') - parser.add_argument('omixon_fname', - nargs='?', - type=str, - help='omixon file name (e.g. omixon_caapa_hla.txt)') - - args = parser.parse_args() - - compare(args.hisatgenotype_fname, - args.omixon_fname) - diff --git a/hisatgenotype_scripts/extract_Omixon_HLA.py b/hisatgenotype_scripts/extract_Omixon_HLA.py deleted file mode 100755 index 23aaa045..00000000 --- a/hisatgenotype_scripts/extract_Omixon_HLA.py +++ /dev/null @@ -1,115 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2016, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import sys, os, subprocess, glob - -if __name__ == '__main__': - hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"] - gen_alleles = {} - for hla in hla_list: - for line in open("IMGTHLA/fasta/%s_gen.fasta" % hla): - if line.startswith(">"): - allele = line.split()[1] - gene = allele.split('*')[0] - if gene not in gen_alleles: - gen_alleles[gene] = set() - gen_alleles[gene].add(allele) - - nuc_alleles = {} - for hla in hla_list: - for line in open("IMGTHLA/fasta/%s_nuc.fasta" % hla): - if line.startswith(">"): - allele = line.split()[1] - gene = allele.split('*')[0] - if gene not in nuc_alleles: - nuc_alleles[gene] = set() - nuc_alleles[gene].add(allele) - - print >> sys.stderr, "IMGTHLA" - for gene, alleles in nuc_alleles.items(): - print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles)) - - # Read HLA alleles from Omixon data - omixon_alleles = {} - omixon_fnames = glob.glob("HLAresults/*.gz") - for fname in omixon_fnames: - genome = fname.split("/")[1].split("_HLA")[0] - view_cmd = ["gzip", "-cd", fname] - proc = subprocess.Popen(view_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) - allele_count = {} - prev_allele1, prev_allele2 = "", "" - for line in proc.stdout: - if not line.startswith("HLA"): - continue - - fields = line.strip().split() - if len(fields) > 6: - allele1, allele2 = fields[0][4:-1], fields[6][4:-1] - else: - allele1 = allele2 = fields[0][4:-1] - - gene = allele1.split("*")[0] - if gene not in hla_list: - continue - if gene not in omixon_alleles: - omixon_alleles[gene] = set() - if gene not in allele_count: - allele_count[gene] = 0 - if allele_count[gene] >= 10: - continue - - if allele2 == "": - allele2 = prev_allele2 - assert allele1 != "" and allele2 != "" - - def update_allele(allele): - if allele == "DRB1*08:01:03": - allele = "DRB1*08:01:01" - elif allele == "DRB1*11:11:02": - allele = "DRB1*11:11:01" - return allele - - allele1, allele2 = update_allele(allele1), update_allele(allele2) - - allele_count[gene] += 1 - omixon_alleles[gene].add(allele1) - omixon_alleles[gene].add(allele2) - prev_allele1, prev_allele2 = allele1, allele2 - - print "%s\t%s\t%s" % (genome, allele1, allele2) - - print >> sys.stderr, "Omixon" - for gene, alleles in omixon_alleles.items(): - print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles)) - for allele in alleles: - if allele in nuc_alleles[gene]: - continue - found = False - for allele_cmp in nuc_alleles[gene]: - if allele_cmp.find(allele) != -1: - found = True - break - - if not found: - print >> sys.stderr, "\t\t%s is missing" % allele - - diff --git a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py b/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py deleted file mode 100755 index 34cd4ecf..00000000 --- a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2015, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import sys, os, subprocess, re -import inspect -import random -from argparse import ArgumentParser, FileType - -# Gold Standard (experimentally verified, a lot of literature, ...) -gold_allele_info = { - "NA12877" : {"A" : ["03:01", "02:01"], "B" : ["15:01", "44:02"], "C" : ["05:01", "03:04"], "DQA1" : ["03:03", "03:01"], "DQB1" : ["03:02", "03:01"], "DRB1" : ["04:01", "04:01"]}, - "NA12878" : {"A" : ["01:01", "11:01"], "B" : ["08:01", "56:01"], "C" : ["01:02", "07:01"], "DQA1" : ["05:01", "01:01"], "DQB1" : ["02:01", "05:01"], "DRB1" : ["03:01", "01:01"]}, - "NA12879" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12880" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12881" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["05:01", "01:02"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["04:01", "01:01"]}, - "NA12882" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["04:01", "01:01"]}, - "NA12883" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["01:02", "05:01"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["01:01", "04:01"]}, - "NA12884" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:01"]}, - "NA12885" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["05:01", "07:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["03:01", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12886" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["07:01", "05:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["02:01", "03:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12887" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12888" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["07:01", "03:04"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]}, - "NA12889" : {"A" : ["03:01", "03:01"], "B" : ["07:02", "44:02"], "C" : ["05:01", "07:02"], "DQA1" : ["03:03", "01:02"], "DQB1" : ["03:01", "06:02"], "DRB1" : ["15:01", "04:01"]}, - "NA12890" : {"A" : ["03:01", "02:01"], "B" : ["44:03", "15:01"], "C" : ["16:01", "03:04"], "DQA1" : ["03:01", "02:01"], "DQB1" : ["03:02", "02:02"], "DRB1" : ["04:03", "07:01"]}, - "NA12891" : {"A" : ["24:02", "01:01"], "B" : ["08:01", "07:02"], "C" : ["07:02", "07:01"], "DQA1" : ["05:01", "01:02"], "DQB1" : ["06:02", "02:01"], "DRB1" : ["03:01", "15:01"]}, - "NA12892" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "04:01"], "DQA1" : ["01:01", "01:01"], "DQB1" : ["05:01", "05:01"], "DRB1" : ["01:01", "01:01"]}, - "NA12893" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:01"]} - } - -# CEPH pedigree (17 family members) -pedigree = { - "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]}, - "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]}, - "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]}, - "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]}, - "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - } - - -""" -""" -def test_HLA_genotyping(reference_type, - hla_list, - aligners, - query_genomes, - exclude_allele_list, - num_mismatch, - verbose): - # Current script directory - curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping)) - ex_path = os.path.dirname(curr_script) - - if not os.path.exists("illumina/HLA"): - print >> sys.stderr, "Error: illumina/HLA data is needed (please send an email to infphilo@gmail.com for getting the data)" - sys.exit(1) - - num_test, num_success = 0, 0 - for genome in sorted(gold_allele_info.keys()): - if not genome in query_genomes: - continue - genes = gold_allele_info[genome] - read_fname_1, read_fname_2 = "illumina/HLA/%s.fished_1.fq" % genome, "illumina/HLA/%s.fished_2.fq" % genome - if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2): - continue - print >> sys.stderr, genome - cmd_aligners = ['.'.join(aligners[i]) for i in range(len(aligners))] - test_hla_script = os.path.join(ex_path, "hisat2_test_HLA_genotyping.py") - for gene in sorted(genes.keys()): - if not gene in hla_list: - continue - alleles = genes[gene] - print >> sys.stderr, "\t%s - %s" % (gene, ' / '.join(alleles)) - test_hla_cmd = [test_hla_script, - "--reference-type", reference_type, - "--hla-list", gene, - "--aligner-list", ','.join(cmd_aligners), - "--reads", "%s,%s" % (read_fname_1, read_fname_2), - "--best-alleles", - "--exclude-allele-list", ','.join(exclude_allele_list), - "--num-mismatch", str(num_mismatch)] - - if verbose: - print >> sys.stderr, ' '.join(test_hla_cmd) - - proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) - num_test += 2 - test_alleles = set() - for line in proc.stdout: - print "\t\t", line, - model, allele = line.split()[:2] - if model != "SingleModel": - continue - allele = allele.split('*')[1] - allele = ':'.join(allele.split(':')[:2]) - test_alleles.add(allele) - proc.communicate() - for allele in alleles: - if allele in test_alleles: - num_success += 1 - - print >> sys.stderr, "%d/%d (%.2f%%)" % (num_success, num_test, num_success * 100.0 / num_test) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='test HLA genotyping for Platinum Genomes') - parser.add_argument("--reference-type", - dest="reference_type", - type=str, - default="gene", - help="Reference type: gene, chromosome, and genome (default: gene)") - parser.add_argument("--hla-list", - dest="hla_list", - type=str, - default="A,B,C,DQA1,DQB1,DRB1", - help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)") - parser.add_argument("--aligner-list", - dest="aligners", - type=str, - default="hisat2.graph", - help="A comma-separated list of aligners (default: hisat2.graph)") - genomes_default = ','.join(gold_allele_info.keys()) - parser.add_argument("--genome-list", - dest="genome_list", - type=str, - default=genomes_default, - help="A comma-separated list of genomes (default: %s)" % genomes_default) - parser.add_argument("--exclude-allele-list", - dest="exclude_allele_list", - type=str, - default="", - help="A comma-separated list of allleles to be excluded") - parser.add_argument("--num-mismatch", - dest="num_mismatch", - type=int, - default=0, - help="Maximum number of mismatches per read alignment to be considered (default: 0)") - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - - args = parser.parse_args() - - if not args.reference_type in ["gene", "chromosome", "genome"]: - print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) - sys.exit(1) - args.hla_list = args.hla_list.split(',') - if args.aligners == "": - print >> sys.stderr, "Error: --aligners must be non-empty." - sys.exit(1) - args.aligners = args.aligners.split(',') - for i in range(len(args.aligners)): - args.aligners[i] = args.aligners[i].split('.') - args.genome_list = args.genome_list.split(',') - args.exclude_allele_list = args.exclude_allele_list.split(',') - - test_HLA_genotyping(args.reference_type, - args.hla_list, - args.aligners, - args.genome_list, - args.exclude_allele_list, - args.num_mismatch, - args.verbose) diff --git a/hisatgenotype_scripts/hisatgenotype_convert_codis.py b/hisatgenotype_scripts/hisatgenotype_convert_codis.py deleted file mode 100755 index 415a42c8..00000000 --- a/hisatgenotype_scripts/hisatgenotype_convert_codis.py +++ /dev/null @@ -1,654 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import os, sys, subprocess, re -import inspect, operator -from copy import deepcopy -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common -try: - import openpyxl -except ImportError: - print >> sys.stderr, "Error: please install openpyxl by running 'pip install openpyxl'." - sys.exit(1) - - -# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm - -orig_CODIS_seq = { - "CSF1PO" : - # http://www.cstl.nist.gov/biotech/strbase/str_CSF1PO.htm - # allele 13: 5:150076172-150076490 - (samtools faidx genome.fa - GRCh38) - ["[AGAT]13", - "AACCTGAGTCTGCCAAGGACTAGCAGGTTGCTAACCACCCTGTGTCTCAGTTTTCCTACCTGTAAAATGAAGATATTAACAGTAACTGCCTTCATAGATAGAAGATAGATAGATT", # left flanking sequence - "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", # STR - "AGGAAGTACTTAGAACAGGGTCTGACACAGGAAATGCTGTCCAAGTGTGCACCAGGAGATAGTATCTGAGAAGGCTCAGTCTGGCACCATGTGGGTTGGGTGGGAACCTGGAGGCTGGAGAATGGGCTGAAGATGGCCAGTGGTGTGTGGAA"], # right flanking sequence - - "FGA" : - # http://www.cstl.nist.gov/biotech/strbase/str_FGA.htm - # allele 22: 4:154587696-154587891 - - ["[TTTC]3TTTTTTCT[CTTT]14CTCC[TTCC]2", - "GCCCCATAGGTTTTGAACTCACAGATTAAACTGTAACCAAAATAAAATTAGGCATATTTACAAGCTAG", - "TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC", - "TTTCTTCCTTTCTTTTTTGCTGGCAATTACAGACAAATCA"], - - "TH01" : - # http://www.cstl.nist.gov/biotech/strbase/str_TH01.htm - # allele 7: 11:2170990-2171176 + - ["[AATG]7", - "GTGGGCTGAAAAGCTCCCGATTATCCAGCCTGGCCCACACAGTCCCCTGTACACAGGGCTTCCGAGTGCAGGTCACAGGGAACACAGACTCCATGGTG", - "AATGAATGAATGAATGAATGAATGAATG", - "AGGGAAATAAGGGAGGAACAGGCCAATGGGAATCACCCCAGAGCCCAGATACCCTTTGAAT"], - - "TPOX" : - # http://www.cstl.nist.gov/biotech/strbase/str_TPOX.htm - # allele 8: 2:1489617-1489848 - ["[AATG]8", - "ACTGGCACAGAACAGGCACTTAGGGAACCCTCACTG", - "AATGAATGAATGAATGAATGAATGAATGAATG", - "TTTGGGCAAATAAACGCTGACAAGGACAGAAGGGCCTAGCGGGAAGGGAACAGGAGTAAGACCAGCGCACAGCCCGACTTGTGTTCAGAAGACCTGGGATTGGACCTGAGGAGTTCAATTTTGGATGAATCTCTTAATTAACCTGTGGGGTTCCCAGTTCCTCC"], - - "VWA" : - # http://www.cstl.nist.gov/biotech/strbase/str_VWA.htm - # allele unknown: 12:5983938-5984087 - - ["TCTA[TCTG]5[TCTA]11TCCA TCTA", - "CCCTAGTGGATGATAAGAATAATCAGTATGTGACTTGGATTGA", - "TCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA", - "TCCATCCATCCTATGTATTTATCATCTGTCC"], - - "D3S1358" : - # http://www.cstl.nist.gov/biotech/strbase/str_D3S1358.htm - # allele unknown: 3:45540713-45540843 + - ["TCTATCTG[TCTA]14", - "ATGAAATCAACAGAGGCTTGCATGTA", - "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", - "TGAGACAGGGTCTTGCTCTGTCACCCAGATTGGACTGCAGT"], - - "D5S818" : - # http://www.cstl.nist.gov/biotech/strbase/str_D5S818.htm - # allele 11: 5:123775504-123775638 - - ["[AGAT]11", - "GGTGATTTTCCTCTTTGGTATCCTTATGTAATATTTTGA", - "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", - "AGAGGTATAAATAAGGATACAGATAAAGATACAAATGTTGTAAACTGTGGCT"], - - "D7S820" : - # http://www.cstl.nist.gov/biotech/strbase/str_D7S820.htm - # allele 13: 7:84160125-84160367 - - ["[GATA]13", - "ATGTTGGTCAGGCTGACTATGGAGTTATTTTAAGGTTAATATATATAAAGGGTATGATAGAACACTTGTCATAGTTTAGAACGAACTAAC", - "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA", - "GACAGATTGATAGTTTTTTTTAATCTCACTAAATAGTCTATAGTAAACATTTAATTACCAATATTTGGTGCAATTCTGTCAATGAGGATAAATGTGGAATC"], - - "D8S1179" : - # http://www.cstl.nist.gov/biotech/strbase/str_D8S1179.htm - # allele 13: 8:124894838-124895018 + - ["[TCTA]1[TCTG]1[TCTA]11", - "TTTTTGTATTTCATGTGTACATTCGTA", - "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", - "TTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAAATGAACTAATTATAGCTACG"], - - "D13S317" : - # http://www.cstl.nist.gov/biotech/strbase/str_D13S317.htm - # Perhaps, allele 11: 13:82147921-82148112 + - ["[TATC]11A", - "ATCACAGAAGTCTGGGATGTGGAGGAGAGTTCATTTCTTTAGTGGGCATCCGTGACTCTCTGGACTCTGACCCATCTAACGCCTATCTGTATTTACAAATACAT", - "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCA", - "ATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGGCTGCC"], - - "D16S539" : - # http://www.cstl.nist.gov/biotech/strbase/str_D16S539.htm - # allele 11: 16:86352518-86352805 + - ["[GATA]11", - "GGGGGTCTAAGAGCTTGTAAAAAGTGTACAAGTGCCAGATGCTCGTTGTGCACAAATCTAAATGCAGAAAAGCACTGAAAGAAGAATCCAGAAAACCACAGTTCCCATTTTTATATGGGAGCAAACAAAGGCAGATCCCAAGCTCTTCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTG", - "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA", - "TCATTGAAAGACAAAACAGAGATGGATGATAGATACATGCTTACAGATGCACACACAAAC"], - - "D18S51" : - # http://www.cstl.nist.gov/biotech/strbase/str_D18S51.htm - # allele 18: 18:63281611-63281916 + - ["[AGAA]18", - "GAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTC", - "AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA", - "AAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTAAGACATCTCCACACACCAGAGAAGTTAATTTTAATTTTAACATGTTAAGAACAGAGAGAAGCCAACATGTCCACCTTAGGCTGACGGTTTGTTTATTTGTGTTGTTGCTGGTAGTCGGGTTTG"], - - "D21S11" : - # http://www.cstl.nist.gov/biotech/strbase/str_D21S11.htm - # Perhaps, allele 29: 21:19181945-19182165 + - ["[TCTA]4[TCTG]6[TCTA]3TA[TCTA]3TCA[TCTA]2TCCATA[TCTA]11", - "GTGAGTCAATTCCCCAAGTGAATTGCCT", - "TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA", - "TCGTCTATCTATCCAGTCTATCTACCTCCTATTAGTCTGTCTCTGGAGAACATTGACTAATACAAC"], - - # "AMEL" - http://www.cstl.nist.gov/biotech/strbase/jpg_amel.htm - # X chromosome has 6 bp deletion and Y chromosome doesn't - "AMELX" : - ["", - "TGTTGATTCTTTATCCCAGATGTTTCTCAAGTGG", # chromosome X at 11296898 - "", - ""], - - "AMELY" : - ["", - "AGAAACCACTTTATTTGGGATGAAGAATCCACC", # chromosome Y at 6869902 - "", - ""] -} - -CODIS_ref_name = {} - - -""" -""" -def get_flanking_seqs(seq, - flank_len = 500): - def align_seq(seq): - aligner_cmd = ["hisat2", - "--score-min", "C,0", - "--no-unal", - "-x", "grch38/genome", - "-c", seq] - align_proc = subprocess.Popen(aligner_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - chr, left, right, strand = "", -1, -1, '+' - for line in align_proc.stdout: - if line.startswith('@'): - continue - line = line.strip() - cols = line.split() - allele_id, flag, chr, left, _, cigar_str = cols[:6] - assert cigar_str[-1] == 'M' - left = int(left) - flag = int(flag) - strand = '-' if flag & 0x10 else '+' - assert cigar_str == ("%dM" % len(seq)) - right = left + len(seq) - break - - assert chr != "" and left >= 0 and right > left - return chr, left, right, strand - - chr, left, right, strand = align_seq(seq) - left_flank_seq, right_flank_seq = "", "" - if left > 1: - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, max(1, left - flank_len), left - 1)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - left_flank_seq += line - extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + flank_len - 1)] - extract_seq_proc = subprocess.Popen(extract_seq_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - for line in extract_seq_proc.stdout: - if line.startswith('>'): - continue - line = line.strip() - right_flank_seq += line - - if strand == '-': - left_flank_seq, right_flank_seq = typing_common.reverse_complement(right_flank_seq), typing_common.reverse_complement(left_flank_seq) - - chr, _, _, _ = align_seq(left_flank_seq + seq + right_flank_seq) - assert chr != "" - - return left_flank_seq, right_flank_seq - - - -""" -""" -def get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j): - if repeat_i == repeat_j: - # DK - experimental SW alignment - min_diff = sys.maxint - for repeat_num_i in repeat_nums_i: - for repeat_num_j in repeat_nums_j: - min_diff = min(abs(repeat_num_i - repeat_num_j), min_diff) - equal_score = -min_diff / 10.0 + (len(repeat_nums_i) + len(repeat_nums_j)) / 100.0 - equal_score = max(min(0.0 if min_diff == 0 else -0.1, equal_score), -0.9) - - # DK - just for now - equal_score = 0 - - return equal_score - elif repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1]): - return -1 - else: - return -2 - - -""" -Smith Waterman Algorithm -""" -def SW_alignment(allele_i, allele_j): - n, m = len(allele_i), len(allele_j) - a = [[-(i+j) if i == 0 or j == 0 else 0 for j in range(m + 1)] for i in range(n + 1)] - - # Fill 2D array - for i in range(n): - repeat_i, repeat_nums_i = allele_i[i] - for j in range(m): - repeat_j, repeat_nums_j = allele_j[j] - equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j) - a[i+1][j+1] = max(a[i][j+1] - 1, a[i+1][j] - 1, a[i][j] + equal_score) - - return a, n, m - - -""" -""" -def combine_alleles(backbone_allele, add_allele): - allele_i, allele_j = backbone_allele, add_allele - a, n, m = SW_alignment(allele_i, allele_j) - - # Back tracking - new_backbone_allele = [] - i, j = n - 1, m - 1 - while i >= 0 or j >= 0: - if i < 0: - repeat_j, repeat_nums_j = allele_j[j] - new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])]) - j -= 1 - elif j < 0: - repeat_i, repeat_nums_i = allele_i[i] - new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])]) - i -= 1 - else: - repeat_i, repeat_nums_i = allele_i[i] - repeat_j, repeat_nums_j = allele_j[j] - equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j) - if a[i][j+1] - 1 == a[i+1][j+1]: - new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])]) - i -= 1 - elif a[i+1][j] - 1 == a[i+1][j+1]: - new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])]) - j -= 1 - else: - assert a[i][j] + equal_score == a[i+1][j+1] - if repeat_i == repeat_j: - new_backbone_allele.append([repeat_i, repeat_nums_i | repeat_nums_j]) - else: - assert repeat_nums_i == repeat_nums_j - assert repeat_nums_i == set([1]) - new_backbone_allele.append([repeat_i | repeat_j, repeat_nums_i | repeat_nums_j]) - i -= 1 - j -= 1 - - new_backbone_allele = new_backbone_allele[::-1] - return new_backbone_allele - - -""" -""" -def msf_alignment(backbone_allele, allele): - allele_i, allele_j = backbone_allele, allele - a, n, m = SW_alignment(allele_i, allele_j) - - # Back tracking - allele_seq, backbone_seq = "", "" - i, j = n - 1, m - 1 - while i >= 0 or j >= 0: - assert i >= 0 - repeats_i, repeat_nums_i = allele_i[i] - repeat_i = "" - max_repeat = "" - for repeat_str in repeats_i: - if len(repeat_str) > len(repeat_i): - repeat_i = repeat_str - repeat_num_i = max(repeat_nums_i) - if j < 0: - allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq - backbone_seq = repeat_i * repeat_num_i + backbone_seq - i -= 1 - else: - repeats_j, repeat_nums_j = allele_j[j] - assert len(repeats_j) == 1 and len(repeat_nums_j) == 1 - repeat_j, repeat_num_j = list(repeats_j)[0], list(repeat_nums_j)[0] - equal_score = get_equal_score(repeats_i, repeat_nums_i, repeats_j, repeat_nums_j) - if a[i][j+1] - 1 == a[i+1][j+1]: - allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq - backbone_seq = repeat_i * repeat_num_i + backbone_seq - i -= 1 - else: - assert a[i][j] + equal_score == a[i+1][j+1] - if repeat_i == repeat_j: - add_seq = repeat_i * repeat_num_j - dot_seq = '.' * (len(repeat_i) * (repeat_num_i - repeat_num_j)) - allele_seq = add_seq + dot_seq + allele_seq - add_seq = repeat_i * repeat_num_i - backbone_seq = add_seq + backbone_seq - else: - assert repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1]) - dot_seq = '.' * (len(repeat_i) - len(repeat_j)) - allele_seq = repeat_j + dot_seq + allele_seq - backbone_seq = repeat_i + backbone_seq - i -= 1 - j -= 1 - - return allele_seq, backbone_seq - - -""" -Extract multiple sequence alignments -""" -def extract_msa(base_dname, - base_fname, - locus_list, - min_freq, - verbose): - # Download human genome and HISAT2 index - HISAT2_fnames = ["grch38", - "genome.fa", - "genome.fa.fai"] - if not typing_common.check_files(HISAT2_fnames): - typing_common.download_genome_and_index(ex_path) - - # Load allele frequency information - allele_freq = {} - if min_freq > 0.0: - excel = openpyxl.load_workbook("hisatgenotype_db/CODIS/NIST-US1036-AlleleFrequencies.xlsx") - sheet = excel.get_sheet_by_name(u'All data, n=1036') - for col in range(2, 100): - locus_name = sheet.cell(row = 3, column = col).value - if not locus_name: - break - locus_name = locus_name.encode('ascii','ignore') - locus_name = locus_name.upper() - assert locus_name not in allele_freq - allele_freq[locus_name] = {} - - for row in range(4, 101): - allele_id = sheet.cell(row = row, column = 1).value - allele_id = str(allele_id) - freq = sheet.cell(row = row, column = col).value - if not freq: - continue - allele_freq[locus_name][allele_id] = float(freq) - excel.close() - - CODIS_seq = orig_CODIS_seq - if len(locus_list) > 0: - new_CODIS_seq = {} - for locus_name, fields in CODIS_seq.items(): - if locus_name in locus_list: - new_CODIS_seq[locus_name] = fields - CODIS_seq = new_CODIS_seq - - # Add some additional sequences to allele sequences to make them reasonably long for typing and assembly - for locus_name, fields in CODIS_seq.items(): - _, left_seq, repeat_seq, right_seq = fields - allele_seq = left_seq + repeat_seq + right_seq - left_flank_seq, right_flank_seq = get_flanking_seqs(allele_seq) - CODIS_seq[locus_name][1] = left_flank_seq + left_seq - CODIS_seq[locus_name][3] = right_seq + right_flank_seq - - print >> sys.stderr, "%s is found on the reference genome (GRCh38)" % locus_name - - for locus_name in CODIS_seq.keys(): - alleles = [] - for line in open("hisatgenotype_db/CODIS/codis.dat"): - locus_name2, allele_id, repeat_st = line.strip().split('\t') - if locus_name != locus_name2: - continue - if min_freq > 0.0: - assert locus_name in allele_freq - if allele_id not in allele_freq[locus_name] or \ - allele_freq[locus_name][allele_id] < min_freq: - continue - - alleles.append([allele_id, repeat_st]) - - # From [TTTC]3TTTTTTCT[CTTT]20CTCC[TTCC]2 - # To [['TTTC', [3]], ['TTTTTTCT', [1]], ['CTTT', [20]], ['CTCC', [1]], ['TTCC', [2]]] - def read_allele(repeat_st): - allele = [] - s = 0 - while s < len(repeat_st): - ch = repeat_st[s] - if ch == ' ': - s += 1 - continue - assert ch in "[ACGT" - if ch == '[': - s += 1 - repeat = "" - while s < len(repeat_st): - nt = repeat_st[s] - if nt in "ACGT": - repeat += nt - s += 1 - else: - assert nt == ']' - s += 1 - break - assert s < len(repeat_st) - num = 0 - while s < len(repeat_st): - digit = repeat_st[s] - if digit.isdigit(): - num = num * 10 + int(digit) - s += 1 - else: - break - assert num > 0 - allele.append([set([repeat]), set([num])]) - else: - repeat = "" - while s < len(repeat_st): - nt = repeat_st[s] - if nt in "ACGT": - repeat += nt - s += 1 - else: - assert nt == ' ' or nt == '[' - break - allele.append([set([repeat]), set([1])]) - - # Sanity check - cmp_repeat_st = "" - for repeats, repeat_nums in allele: - repeat = list(repeats)[0] - repeat_num = list(repeat_nums)[0] - if repeat_num > 1 or locus_name == "D8S1179": - cmp_repeat_st += "[" - cmp_repeat_st += repeat - if repeat_num > 1 or locus_name == "D8S1179": - cmp_repeat_st += "]%d" % repeat_num - - assert repeat_st.replace(' ', '') == cmp_repeat_st.replace(' ', '') - return allele - - alleles = [[allele_id, read_allele(repeat_st)] for allele_id, repeat_st in alleles] - - def to_sequence(repeat_st): - sequence = "" - for repeats, repeat_nums in repeat_st: - repeat = list(repeats)[0] - repeat_num = list(repeat_nums)[0] - sequence += (repeat * repeat_num) - return sequence - - def remove_redundant_alleles(alleles): - seq_to_ids = {} - new_alleles = [] - for allele_id, repeat_st in alleles: - allele_seq = to_sequence(repeat_st) - if allele_seq in seq_to_ids: - print >> sys.stderr, "Warning) %s: %s has the same sequence as %s" % \ - (locus_name, allele_id, seq_to_ids[allele_seq]) - continue - if allele_seq not in seq_to_ids: - seq_to_ids[allele_seq] = [allele_id] - else: - seq_to_ids[allele_seq].append(allele_id) - new_alleles.append([allele_id, repeat_st]) - - return new_alleles - - alleles = remove_redundant_alleles(alleles) - - allele_seqs = [[allele_id, to_sequence(repeat_st)] for allele_id, repeat_st in alleles] - - ref_allele_st, ref_allele_left, ref_allele, ref_allele_right = CODIS_seq[locus_name] - ref_allele_st = read_allele(ref_allele_st) - for allele_id, allele_seq in allele_seqs: - if ref_allele == allele_seq: - CODIS_ref_name[locus_name] = allele_id - break - - # Add GRCh38 allele - if locus_name not in CODIS_ref_name: - allele_id = "GRCh38" - CODIS_ref_name[locus_name] = allele_id - allele_seqs = [[allele_id, ref_allele]] + allele_seqs - alleles = [[allele_id, ref_allele_st]] + alleles - - print >> sys.stderr, "%s: %d alleles with reference allele as %s" % (locus_name, len(alleles), CODIS_ref_name[locus_name]) - if verbose: - print >> sys.stderr, "\t", ref_allele_left, ref_allele, ref_allele_right - for allele_id, allele in alleles: - print >> sys.stderr, allele_id, "\t", allele - - # Create a backbone sequence - assert len(alleles) > 0 - backbone_allele = deepcopy(alleles[-1][1]) - for allele_id, allele_st in reversed(alleles[:-1]): - if verbose: - print >> sys.stderr - print >> sys.stderr, allele_id - print >> sys.stderr, "backbone :", backbone_allele - print >> sys.stderr, "allele :", allele_st - backbone_allele = combine_alleles(backbone_allele, allele_st) - msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st) - if verbose: - print >> sys.stderr, "combined backbone:", backbone_allele - print >> sys.stderr, "msf_allele_seq :", msf_allele_seq - print >> sys.stderr, "msf_backbone_seq:", msf_backbone_seq - print >> sys.stderr - - allele_dic = {} - for allele_id, allele_seq in allele_seqs: - allele_dic[allele_id] = allele_seq - - allele_repeat_msf = {} - for allele_id, allele_st in alleles: - msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st) - allele_repeat_msf[allele_id] = msf_allele_seq - - # Sanity check - assert len(allele_dic) == len(allele_repeat_msf) - repeat_len = None - for allele_id, repeat_msf in allele_repeat_msf.items(): - if not repeat_len: - repeat_len = len(repeat_msf) - else: - assert repeat_len == len(repeat_msf) - - # Creat full multiple sequence alignment - ref_allele_id = CODIS_ref_name[locus_name] - allele_msf = {} - for allele_id, repeat_msf in allele_repeat_msf.items(): - allele_msf[allele_id] = ref_allele_left + repeat_msf + ref_allele_right - - # Make sure the length of allele ID is short, less than 20 characters - max_allele_id_len = max([len(allele_id) for allele_id in allele_dic.keys()]) - assert max_allele_id_len < 20 - - # Write MSF (multiple sequence alignment file) - msf_len = len(ref_allele_left) + len(ref_allele_right) + repeat_len - msf_fname = "%s_gen.msf" % locus_name - msf_file = open(msf_fname, 'w') - for s in range(0, msf_len, 50): - for allele_id, msf in allele_msf.items(): - assert len(msf) == msf_len - allele_name = "%s*%s" % (locus_name, allele_id) - print >> msf_file, "%20s" % allele_name, - for s2 in range(s, min(msf_len, s + 50), 10): - print >> msf_file, " %s" % msf[s2:s2+10], - print >> msf_file - - if s + 50 >= msf_len: - break - print >> msf_file - msf_file.close() - - # Write FASTA file - fasta_fname = "%s_gen.fasta" % locus_name - fasta_file = open(fasta_fname, 'w') - for allele_id, allele_seq in allele_seqs: - gen_seq = ref_allele_left + allele_seq + ref_allele_right - print >> fasta_file, ">%s*%s %d bp" % (locus_name, allele_id, len(gen_seq)) - for s in range(0, len(gen_seq), 60): - print >> fasta_file, gen_seq[s:s+60] - fasta_file.close() - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Extract multiple sequence alignments for DNA Fingerprinting loci") - parser.add_argument("-b", "--base", - dest="base_fname", - type=str, - default="codis", - help="base filename (default: codis)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="base filename (default: empty)") - parser.add_argument("--min-freq", - dest="min_freq", - type=float, - default=0.0, - help="minimum allele frequency (default: 0.0)") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.base_fname.find('/') != -1: - elems = args.base_fname.split('/') - base_fname = elems[-1] - base_dname = '/'.join(elems[:-1]) - else: - base_fname = args.base_fname - base_dname = "" - if args.locus_list != "": - locus_list = args.locus_list.split(',') - else: - locus_list = [] - - extract_msa(base_dname, - base_fname, - locus_list, - args.min_freq, - args.verbose) - diff --git a/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py b/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py deleted file mode 100755 index c17d86c5..00000000 --- a/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py +++ /dev/null @@ -1,166 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import inspect, operator -from argparse import ArgumentParser, FileType - -# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm - -CODIS_loci = ["CSF1PO", "FGA", "TH01", "TPOX", "VWA", "D3S1358", "D5S818", "D7S820", "D8S1179", "D13S317", "D16S539", "D18S51", "D21S11"] - - -""" -## Download variant information from website -""" -def get_html(url): - download_cmd = ["wget", - "-O", "-", - url] - proc = subprocess.Popen(download_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - output = "" - for line in proc.stdout: - output += line - - return output - - -""" -Download CODIS data -""" -def download_codis(base_dname, - base_fname, - locus_list, - verbose): - # CODIS database base URL - base_url = "http://www.cstl.nist.gov/biotech/strbase" - - # Refer to Python's regular expression at https://docs.python.org/2/library/re.html - # 47.2 - allele_re = re.compile('>(\d+\.?\d?\"?\'*\(?\d*\.?\d?\"?\'*\)?\*?)[TTTC]4TTTT TT [CTTT]14[CTGT]3[CTTT]14 [CTTC]4[CTTT]3CTCC[TTCC]4 - # repeat_re = re.compile('^(\[[ACGT]+\]\d+|[ACGT]+)+$') - repeat_re = re.compile('^(\[[ACGT]+\]\d+|\[[ACGT]+\]|[ACGT]+|\s)+$') - # Remove extra tags - tag_re = re.compile('(<[^>]*>)') - nbsp_re = re.compile(' ') - quot_re = re.compile('"') - codis_data_file = open(base_fname + ".dat", 'w') - for locus_name in CODIS_loci: - if len(locus_list) > 0 and locus_name not in locus_list: - continue - url = "%s/str_%s.htm" % (base_url, locus_name) - content = get_html(url).split("\r\n") - content = map(lambda x: x.strip(), content) - content2 = [] - for line in content: - if line.startswith("> codis_data_file, "%s\t%s\t%s" % (locus_name, allele_id, repeat_st) - - codis_data_file.close() - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description="Extract multiple sequence alignments for DNA Fingerprinting loci") - parser.add_argument("-b", "--base", - dest="base_fname", - type=str, - default="codis", - help="base filename (default: codis)") - parser.add_argument("--locus-list", - dest="locus_list", - type=str, - default="", - help="base filename (default: empty)") - parser.add_argument("-v", "--verbose", - dest="verbose", - action="store_true", - help="also print some statistics to stderr") - - args = parser.parse_args() - if args.base_fname.find('/') != -1: - elems = args.base_fname.split('/') - base_fname = elems[-1] - base_dname = '/'.join(elems[:-1]) - else: - base_fname = args.base_fname - base_dname = "" - if args.locus_list != "": - locus_list = args.locus_list.split(',') - else: - locus_list = [] - - download_codis(base_dname, - base_fname, - locus_list, - args.verbose) - diff --git a/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py b/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py deleted file mode 100755 index b0b4d039..00000000 --- a/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py +++ /dev/null @@ -1,1061 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2016, Raymon Cao and Daehwan Kim -# -# This file is part of HISAT 2. -# -# HISAT 2 is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT 2 is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT 2. If not, see . -# - - -import os, sys, subprocess, re -import inspect, operator -import glob -from argparse import ArgumentParser, FileType - - -global gene_names -gene_names = ['cyp1a1','cyp1a2','cyp1b1','cyp2a6', - 'cyp2a13','cyp2b6','cyp2c8','cyp2c9', - 'cyp2c19','cyp2d6','cyp2e1','cyp2f1', - 'cyp2j2','cyp2r1','cyp2S1','cyp2w1', - 'cyp3a4','cyp3a5','cyp3a7','cyp3a43', - 'cyp4a11','cyp4a22','cyp4b1','cyp4f2', - 'cyp5a1','cyp8a1','cyp19a1','cyp21a2', - 'cyp26a1'] - -""" -Download variant information from website database -""" - -def get_html(url): - download_cmd = ["wget", - "-O", "-", - url] - proc = subprocess.Popen(download_cmd, - stdout=subprocess.PIPE, - stderr=open("/dev/null", 'w')) - - output = "" - for line in proc.stdout: - output += line - - return output - - -def download_CYP(verbose): - print("Downloading data from:") - - # CYP database base URL - base_url = "http://www.cypalleles.ki.se" - - # Current script directory - curr_script = os.path.realpath(inspect.getsourcefile(download_CYP)) - ex_path = os.path.dirname(curr_script) - - # Refer to Python's regular expression at https://docs.python.org/2/library/re.html - cyp_re = re.compile('http://www.cypalleles.ki.se/cyp\w+.htm') - output = get_html(base_url) - cyp_urls = cyp_re.findall(output) - # Original list had duplicate urls, removes duplicates - cyp_urls = set(cyp_urls) - - os.system('mkdir cyp_var_files') - for cyp_url in cyp_urls: - cyp_gene_name = cyp_url.split('/')[-1] - cyp_gene_name = cyp_gene_name.split('.')[0] - - # Hardcoded for cyp21 database (has inconsistant url naming) - if cyp_gene_name.lower() == "cyp21".lower(): - cyp_gene_name = cyp_gene_name + "a2" - - # Changed to match all instances of "cyp" - if not re.compile("cyp[\d\w]+", re.IGNORECASE).search(cyp_gene_name): - continue - - # Open file to write on - cyp_file = open("cyp_var_files/%s.var" % (cyp_gene_name), 'w') - - print >> sys.stderr, cyp_url, cyp_gene_name - print >> cyp_file, cyp_url, cyp_gene_name - - cyp_output = get_html(cyp_url) - if cyp_output == "": - continue - - listA = cyp_output.split("") - for ind in range(len(tabRow)): - tabRow[ind] = tabRow[ind].replace("\r\n","") - - allele_name_re = re.compile(cyp_gene_name.upper() + '\*[\w\d]+') - varInfo_re = re.compile('-?\d+[ACGT]\>[ACGT]|-?\d+_?-?\d+?del[ACGT]+|-?\d+_?-?\d+?ins[ACGT]+|None') - - alleleName = allele_name_re.findall(tabRow[0]) - if len(alleleName) > 0: - alleleName = alleleName[0] - - # @RaymonFix - some databases have extra table, ignores headers (CYP2A6) - # @Daehwan - some databases (e.g. http://www.cypalleles.ki.se/cyp3a4.htm) - # have 2 rows of Nucleotide changes (cDNA and Gene), might need - # to look at all rows for snps - # - # @RaymonFix - look in 4th column for "Gene" nt changes first, then consider cDNA if applicable; updated re to remove "<>" formating expressions - - if cyp_url == 'http://www.cypalleles.ki.se/cyp21.htm': # Hardcoded for special format for cyp21a2 - try: - varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[1])) - except IndexError: - continue - - else: - try: - varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[3])) - if len(varInfo) == 0: - varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[2])) - except IndexError: - continue - - for varInd in range(len(varInfo)): - varInfo[varInd] = varInfo[varInd].replace('>','>') - - if 'None' in varInfo: - try: - assert len(varInfo) == 1 - except: - varInfo = filter(lambda a: a != 'None', varInfo) - - - if isinstance(alleleName, basestring): - print >> cyp_file, (str(alleleName) + "\t" + ','.join(varInfo)) - - cyp_file.close() - - -""" -Make MSF files from variants -""" - -def checkNTloc(fasta_fileName,var_fileName,gene_name): - print "\nGene: %s" % gene_name - seq = "" - for line in open(fasta_fileName,'r'): - if line[0] == '>': - continue - seq += line.strip() - - cyp_var_file = open(var_fileName,'r') - cyp_var_dict = makeVarDict(cyp_var_file) - cyp_var_file.close() - - print "len:", len(seq) - varsPos = set() - varsNeg = set() - - for varList in cyp_var_dict.values(): - for var in varList: - if ">" in var: # is SNP - posNt = int(var[:-3]) - ntChange = var[-3:].replace('>','') - assert len(ntChange) == 2 - for nt in ntChange: - assert nt in "ACGT" - - if posNt > 0: - varsPos.add(str(posNt) + '->' + ntChange[0]) - else: - assert posNt < 0 - varsNeg.add(str(posNt) + '->' + ntChange[0]) - - elif "del" in var: # is deletion - posNt = var.split('del')[0].split('_') - posNt = [int(p) for p in posNt] - ntDel = var.split('del')[1] - for nt in ntDel: - assert nt in "ACGT" - - if len(posNt) == 1: # single nt deletion - assert len(ntDel) == 1 - if posNt[0] > 0: - varsPos.add(str(posNt[0]) + '->' + ntDel) - else: - assert posNt[0] < 0 - varsNeg.add(str(posNt[0]) + '->' + ntDel) - - else: # mutliple nt deletion - assert len(posNt) == 2 - try: - assert posNt[1] - posNt[0] + 1 == len(ntDel) - except AssertionError: - print "Incorrect deletion format: %s , skipping variation" % (var) - '''sys.exit(1)''' - continue - ntDelList = list(ntDel) - for i in range(posNt[0],posNt[1] + 1): - if i > 0: - varsPos.add(str(i) + '->' + ntDelList.pop(0)) - else: - assert i < 0 - varsNeg.add(str(i) + '->' + ntDelList.pop(0)) - assert len(ntDelList) == 0 - - else: - assert ("ins" in var) or ("None" in var) - continue - - scorePos = {} # { position offset : number of alignments } for positive positions - for i in range(-len(seq), len(seq)): - align_score = 0 - for var in varsPos: - pos, base = var.split('->') - pos = int(pos) - - try: - seq[pos+i] - except IndexError: - continue - - if seq[pos+i] == base: - align_score += 1 - - scorePos[i] = align_score - oSetPos = max(scorePos.iteritems(), key=operator.itemgetter(1))[0] - print "Positive postitions offset: %d" % oSetPos - print "Score: %d out of %d\n" % (scorePos[oSetPos], len(varsPos)) - - - print "Checking negative position offset: %d" % (oSetPos + 1) - align_score = 0 - oSetNeg = oSetPos + 1 - for var in varsNeg: - pos, base = var.split('->') - pos = int(pos) - - try: - seq[pos + oSetNeg] - except IndexError: - continue - - if seq[pos + oSetNeg] == base: - align_score += 1 - print "Score: %d out of %d\n\n" % (align_score, len(varsNeg)) - - if len(varsNeg) == 0 and len(varsPos) != 0: - return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), 1.0, float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg)) - elif len(varsNeg) != 0 and len(varsPos) == 0: - return oSetPos, oSetNeg, 1.0, float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg)) - elif len(varsNeg) == 0 and len(varsPos) == 0: - return oSetPos, oSetNeg, 1.0, 1.0, 1.0 - else: - assert len(varsNeg) != 0 and len(varsPos) != 0 - return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg)) - - -def create_map(seq): - seq_map = {} - count = 0 - for i in range(len(seq)): - bp = seq[i] - if bp == '.': - continue - assert bp.upper() in "ACGT" - seq_map[count] = i - count += 1 - return seq_map - -def splitString(someStr,posList): - posList.insert(0,-1) - posList.append(len(someStr) - 1) - splitStr = [] - for i in range(len(posList) - 1): - left = posList[i] + 1 - right = posList[i+1] + 1 - splitStr.append(someStr[left:right]) - - return splitStr - -def extractSeq(faFile): - seq = "" - for line in faFile: - if line.startswith(">"): - continue - - seq += line.strip() - - return seq - -def makeVarDict(fname): - alleleVarDict = {} - - allLines = [line.strip() for line in fname] - - ref_al_id_present = False - for line in allLines[1:]: - if 'None' in line: - ref_al_id_present = True - - line_num = 0 - for line in allLines[1:]: - line_num += 1 - assert line.upper().startswith("CYP") - alleleName = line.split("\t")[0].upper() - - if (not ref_al_id_present) and line_num == 1: - varList = ['None'] - else: - try: - varList = line.split("\t")[1].split(',') - except IndexError: - continue - - try: - assert not alleleName in alleleVarDict - alleleVarDict[alleleName] = set(varList) - except: - print >> sys.stdout, ("Warning, %s allele is already represented" % alleleName) - alleleVarDict[alleleName] = alleleVarDict[alleleName] | set(varList) - - return alleleVarDict - -def makeSnp(oldSeq, pos, oldNt, newNt): - assert oldSeq[pos] == oldNt - newSeq = oldSeq[:pos] + newNt + oldSeq[pos+1:] - assert len(newSeq) == len(oldSeq) - return newSeq - -def makeDel(oldSeq, left, right, toDel): - assert right - left + 1 == len(toDel) - assert oldSeq[left:right + 1] == toDel - newSeq = oldSeq[:left] + '.'*len(toDel) + oldSeq[right + 1:] - assert len(newSeq) == len(oldSeq) - return newSeq - -def makeIns(oldSeq,left,right,toIns): - assert right - left - 1 >= len(toIns) - for nt in oldSeq[left + 1:right]: - assert nt == '.' - remDots = right - left - 1 - len(toIns) - newSeq = oldSeq[:left + 1] + toIns + '.'*remDots + oldSeq[right:] - assert len(newSeq) == len(oldSeq) - return newSeq - - -def makeMSF(gene_name, oSetPos, oSetNeg): - cyp_var_file = open("cyp_var_files/%s.var" % gene_name,'r') - cyp_var_dict = makeVarDict(cyp_var_file) - cyp_var_file.close() - - if len(cyp_var_dict) < 2: - print('\tOnly reference allele included, skipping gene') - return - - try: - blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name) - if len(blast_allele_var) > 0: - cyp_var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var) - except IOError: - print('\t%s blast file was skipped.' % gene_name) - - cyp_faFile = open("cyp_fasta/%s.fasta" % gene_name,'r') - cyp_seq = extractSeq(cyp_faFile) - cyp_faFile.close() - preBackbone_seq = '' - - - msfTable = {} - - # Building backbone structure (augment length with insertions) - longestIns = {} # { key = position : value = length } - for allele,varList in cyp_var_dict.items(): - for var in varList: - if not "ins" in var: - continue - pos = var.split('ins')[0].split('_') - pos = [int(p) for p in pos] - ntIns = var.split('ins')[1] - correctFormat = len(pos) == 2 and pos[1] - pos[0] == 1 - if not correctFormat: - correctFormat = len(pos) == 1 - try: - assert correctFormat - except: - print >> sys.stdout, "\tIncorrect format for insertion: variation %s on allele %s" % (var, allele) - continue - - # convert to position in string - if not 'GRCH38' in allele: - if pos[0] > 0: - pos = pos[0] + oSetPos - else: - pos = pos[0] + oSetNeg - else: - pos = pos[0] - - # Make dictionary of longest insertions - if not pos in longestIns: - longestIns[pos] = len(ntIns) - else: - if len(ntIns) > longestIns[pos]: - longestIns[pos] = len(ntIns) - - posInsList = sorted(longestIns.keys()) - - splitSeq = splitString(cyp_seq,posInsList) - posInsList = posInsList[1:-1] - - for i in range(len(posInsList)): - splitSeq[i] += '.' * longestIns[posInsList[i]] - - for subseq in splitSeq: - try: - assert len(subseq) > 0 and not subseq.startswith('.') - preBackbone_seq += subseq - except: - continue - # pre-backbone built - - - map_cyp = create_map(preBackbone_seq) # { Index of bp in original seq : Actual index in string } - - - for allele,varList in cyp_var_dict.items(): - for var in varList: - isSnp = False - isDel = False - isIns = False - - if ">" in var: - isSnp = True - elif "del" in var: - isDel = True - elif "ins" in var: - isIns = True - else: - assert("None" in var) - isRef = True - - if isSnp: - pos = int(var[:-3]) - dbPos = pos - ntChange = var[-3:].replace('>','') - assert len(ntChange) == 2 - for nt in ntChange: - assert nt in "ACGT" - - if not 'GRCH38' in allele: - if pos > 0: - pos = pos + oSetPos - else: - pos = pos + oSetNeg - - if pos < 0 or pos > len(cyp_seq) - 1: - print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - continue - - try: - assert(preBackbone_seq[map_cyp[pos]] == ntChange[0]) # nt at pos in seq must match database - except: - print >> sys.stdout, "\tWarning: position %d in sequence contains %s, but expected %s from database" % (dbPos, preBackbone_seq[map_cyp[pos]], ntChange[0]) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - continue - - # Adding to msf table - if not allele in msfTable: - msfTable[allele] = makeSnp(preBackbone_seq, map_cyp[pos], ntChange[0], ntChange[1]) - else: - msfTable[allele] = makeSnp(msfTable[allele], map_cyp[pos], ntChange[0], ntChange[1]) - - elif isDel: - pos = var.split('del')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707]) - pos.append(pos[0]) - assert len(pos) == 2 - dbPos = pos - ntDel = var.split('del')[1] - for nt in ntDel: - assert nt in "ACGT" - - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - - skipDel = False - for i in range(len(pos)): - if pos[i] < 0 or pos[i] > len(cyp_seq) - 1: - print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos[i]) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - skipDel = True - - if skipDel: - continue - - - try: - assert pos[1] - pos[0] + 1 == len(ntDel) - except: - print >> sys.stdout, "\tIncorrect deletion data with %s on allele %s. Skipping variation." % (var, allele) - continue - - try: - assert preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ] == ntDel - except: - print >> sys.stdout, "\tWarning, positions %d to %d in sequence contains %s, but expected %s from database" % \ - (dbPos[0], dbPos[1], preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ], ntDel) - print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - continue - - - # Adding to msf table - if not allele in msfTable: - msfTable[allele] = makeDel(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntDel) - else: - msfTable[allele] = makeDel(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntDel) - - - elif isIns: - pos = var.split('ins')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: - pos.append(pos[0] + 1) - assert len(pos) == 2 - dbPos = pos - try: - assert pos[1] - pos[0] == 1 - except AssertionError: - print >> sys.stdout, "\tIncorrect insertion data with %s on allele %s. Skipping variation." % (var, allele) - continue - ntIns = var.split('ins')[1] - for nt in ntIns: - assert nt in "ACGT" - - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - - skipIns = False - for i in range(len(pos)): - if pos[i] < 0 or pos[i] > len(cyp_seq) - 1: - print >> sys.stdout, "Warning: position %d out of bounds" % (dbPos[i]) - print >> sys.stdout, "\tError occured on variation %s on allele %s. Skipping variation." % (var, allele) - skipIns = True - - if skipIns: - continue - - - # Adding to msf table - if not allele in msfTable: - msfTable[allele] = makeIns(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntIns) - else: - msfTable[allele] = makeIns(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntIns) - - - else: - assert isRef - assert not allele in msfTable - msfTable[allele] = preBackbone_seq - - # Sanity checking - seq_len = 0 - for allele, msf_seq in msfTable.items(): - if seq_len == 0: - seq_len = len(msf_seq) - else: - assert seq_len == len(msf_seq) - assert seq_len > 0 - - # Follow MSF style of IMGT/HLA database - msfFile = open('cyp_msf/%s_gen.msf' % gene_name[3:].upper(),'w') - for i in range(0, seq_len, 50): - for allele, msf_seq in msfTable.items(): - output = "%12s" % allele[3:].upper() - for j in range(i, i+50, 10): - if j >= seq_len: - break - if j == i: - output += "\t" - else: - output += " " - output += msf_seq[j:j+10] - print >> msfFile, output - print >> msfFile - - msfFile.close() - - -def build_msf_files(): - os.system('mkdir cyp_msf') - - oSetPos = 0 - oSetNeg = 0 - oSetScorePos = 0.0 - oSetScoreNeg = 0.0 - tot_score = 0.0 - - print('\nBuilding MSF files:') - for gene_name in gene_names: - oSetPos, oSetNeg, oSetScorePos, oSetScoreNeg, tot_score = checkNTloc("cyp_fasta/%s.fasta" % gene_name,"cyp_var_files/%s.var" % gene_name,gene_name) - if not (tot_score >= 0.95): - print "\tLess than 95% match, skipping gene." - continue - - makeMSF(gene_name, oSetPos, oSetNeg) - - -''' -Check MSF files against variants files -''' - -global incorrect_msf_entries -incorrect_msf_entries = [] - -def create_inv_map(seq): - seq_map = {} - count = 0 - for i in range(len(seq)): - bp = seq[i] - if bp == '.': - continue - assert bp.upper() in "ACGT" - seq_map[i] = count - count += 1 - return seq_map - -def readMSF(msf_fname): # { Allele name : MSF sequence } - msf_dict = {} - all_lines = [line for line in msf_fname] - for line in all_lines: - line = line.strip().replace(' ','') - if len(line) == 0 : continue - allele_name = 'CYP' + line.split('\t')[0] - msf_seq = line.split('\t')[1] - if not allele_name in msf_dict: - msf_dict[allele_name] = msf_seq - else: - msf_dict[allele_name] = msf_dict[allele_name] + msf_seq - - return msf_dict - -def msf_removeIns(ref_seq, al_seq): - assert len(ref_seq) == len(al_seq) - ins_ind_list = [] - for i in range(len(ref_seq)): - if ref_seq[i] == '.': - ins_ind_list.append(i) - - ori_ref_seq = ref_seq.replace('.','') - ori_al_seq = list(al_seq) - - for i in ins_ind_list: - ori_al_seq[i] = '-' - - ori_al_seq = ''.join(ori_al_seq).replace('-','') - - assert len(ori_ref_seq) == len(ori_al_seq) - return ori_ref_seq, ori_al_seq - -def msfToVarList(ref_seq, al_seq): - var_list = [] - - assert len(ref_seq) == len(al_seq) - for bp in ref_seq: assert bp in "ACGT." - for bp in al_seq: assert bp in "ACGT." - inv_map = create_inv_map(ref_seq) - - ins_re = re.compile('[ACGT]\.+') - ins_subStrPos = [(m.start(0), m.end(0)) for m in re.finditer(ins_re, ref_seq)] # list of duples of indicies of insertions in ref_seq - ins_pos_length = [(tup[0], tup[1] - tup[0] - 1) for tup in ins_subStrPos] - - for tup in ins_pos_length: - ins_pos, ins_length = tup[0], tup[1] - ins_seq = al_seq[ins_pos + 1: ins_pos + ins_length + 1] - ins_seq = ins_seq.replace('.','') - if len(ins_seq) == 0: - continue - ins_str_data = str(inv_map[tup[0]]) + '_' + str(inv_map[tup[0]] + 1) + 'ins' + ins_seq - var_list.append(ins_str_data) - - # insertions finished - - ori_ref_seq, ori_al_seq = msf_removeIns(ref_seq, al_seq) - - for i in range(len(ori_ref_seq)): - if ori_al_seq[i] == '.': - continue - elif ori_al_seq[i] != ori_ref_seq[i]: # snp - var_list.append(str(i) + ori_ref_seq[i] + '>' + ori_al_seq[i]) - - del_subStrPos = [(m.start(0), m.end(0)) for m in re.finditer(ins_re, ori_al_seq)] # list of duples of indicies of deletions in ori_al_seq - del_pos_length = [(tup[0], tup[1] - tup[0] - 1) for tup in del_subStrPos] - - for tup in del_pos_length: - del_pos, del_length = tup[0], tup[1] - del_seq = ori_ref_seq[del_pos + 1 : del_pos + del_length + 1] - if del_length == 1: - assert len(del_seq) == 1 - del_str_data = str(tup[0] + 1) + 'del' + del_seq - else: - del_str_data = str(tup[0] + 1) + '_' + str(tup[0] + tup[1]) + 'del' + del_seq - var_list.append(del_str_data) - - # deletions finished - - return var_list - -def checkMSFfile(gene_name, msf_fname, var_fname, fasta_filename): - oSetPos, oSetNeg, oSet_pos_score, oSet_neg_score, tot_score = checkNTloc(fasta_filename, var_fname, gene_name) - - try: - msf_file = open(msf_fname,'r') - msf_dict = readMSF(msf_file) # { Allele name : MSF sequence } - msf_file.close() - except IOError: - print("\t%s msf file was skipped.\n" % (gene_name)) - return - - var_file = open(var_fname,'r') - var_dict = makeVarDict(var_file) - var_file.close() - - try: - blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name) - if len(blast_allele_var) > 0: - var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var) - except IOError: - print('\t%s blast file was skipped.' % gene_name) - - fa_file = open(fasta_filename,'r') - oriSeq = extractSeq(fa_file) - fa_file.close() - - - # Find reference allele - ref_allele = '' - for allele_name in var_dict.keys(): - if len(var_dict[allele_name]) == 1 and list(var_dict[allele_name])[0] == "None": - assert ref_allele == '' - ref_allele = allele_name - assert not ref_allele == '' - - - # Check if ref allele seq in msf matches fasta - assert ref_allele in msf_dict - - try: - assert msf_dict[ref_allele].replace('.','') == oriSeq - print("Sequences match for reference allele %s" % ref_allele) - except AssertionError: - print("Warning: sequences do not match for reference allele %s" % ref_allele) - sys.exit(1) - - - # Check all alleles are included - try: - assert set([k.upper() for k in msf_dict.keys()]).issubset(set([k.upper() for k in var_dict.keys()])) - except AssertionError: - print("Extra alleles in MSF!\n") - print(sorted(msf_dict.keys())) - print("\n\n") - print(sorted(var_dict.keys())) - sys.exit(1) - - - # Convert from database positions to sequence positions (using offset) - for allele, var_list in var_dict.items(): - oSet_var_list = [] - for var in var_list: - if '>' in var: # snp - pos = int(var.split('>')[0][:-1]) - ntSnp = [var.split('>')[0][-1]] - ntSnp.append(var.split('>')[1]) - assert len(ntSnp) == 2 - if not 'GRCH38' in allele: - if pos > 0: - pos = pos + oSetPos - else: - pos = pos + oSetNeg - - if pos < 0 or pos > len(oriSeq) - 1: # out of bounds - continue - if oriSeq[pos] != ntSnp[0]: # mismatch - print('\tMismatch on variation %s' % var) - continue - - oSet_var = str(pos) + ntSnp[0] + '>' + ntSnp[1] - oSet_var_list.append(oSet_var) - - elif 'del' in var: # deletion - pos = var.split('del')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707]) - pos.append(pos[0]) - assert len(pos) == 2 - ntDel = var.split('del')[1] - for nt in ntDel: - assert nt in "ACGT" - - skipDel = False - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds - skipDel = True - if (oriSeq[ pos[0] : pos[1] + 1 ] != ntDel): # mismatch - print('\tMismatch on variation %s' % var) - continue - - if skipDel: - continue - - assert pos[1] - pos[0] + 1 == len(ntDel) - - oSet_var = 'del' + ntDel - if pos[0] == pos[1]: - oSet_var = str(pos[0]) + oSet_var - else: - oSet_var = str(pos[0]) + '_' + str(pos[1]) + oSet_var - - oSet_var_list.append(oSet_var) - - elif 'ins' in var: # insertion - pos = var.split('ins')[0].split('_') - pos = [int(p) for p in pos] - if len(pos) == 1: - pos.append(pos[0] + 1) - assert len(pos) == 2 - try: - assert pos[1] - pos[0] == 1 - except AssertionError: - print('\tIncorrect insertion format on variation %s' % var) - continue - ntIns = var.split('ins')[1] - for nt in ntIns: - assert nt in "ACGT" - - skipIns = False - if not 'GRCH38' in allele: - for i in range(len(pos)): - if pos[i] > 0: - pos[i] = pos[i] + oSetPos - else: - pos[i] = pos[i] + oSetNeg - if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds - skipIns = True - - if skipIns: - continue - - oSet_var = str(pos[0]) + '_' + str(pos[1]) + 'ins' + ntIns - oSet_var_list.append(oSet_var) - - else: - assert allele == ref_allele - assert var == 'None' - assert len(oSet_var_list) == 0 - oSet_var_list.append('None') - - var_dict[allele] = set(oSet_var_list) - - # Check variants created from MSF file against variants list - num_correct_alleles = 0 - for allele, msf_seq in msf_dict.items(): - if allele == ref_allele: - num_correct_alleles += 1 - continue - msf_var_list = msfToVarList(msf_dict[ref_allele], msf_seq) - '''print('\t' + str(var_dict[allele] == set(msf_var_list)) + '\t' + str(allele) + '\t' + str(msf_var_list))''' - - try: - assert var_dict[allele] == set(msf_var_list) - num_correct_alleles += 1 - except AssertionError: - incorrect_msf_entries.append(allele) - print('\n') - print('\t\tVar File:\t' + str(var_dict[allele])) - print('\t\tMSF File:\t' + str(set(msf_var_list))) - print('\t\tDifference:\t' + str(var_dict[allele] - set(msf_var_list)) + '\n') - '''sys.exit(1)''' - - print("\t%d out of %d alleles have correct msf sequences\n" % (num_correct_alleles, len(msf_dict))) - -def check_msf_files(): - print("\nChecking MSF files:") - - for gene_name in gene_names: - checkMSFfile(gene_name, 'cyp_msf/%s_gen.msf' % gene_name[3:].upper(), 'cyp_var_files/%s.var' % gene_name, 'cyp_fasta/%s.fasta' % gene_name) - - print('\n\n%d incorrect msf entries on alleles %s\n' % (len(incorrect_msf_entries), str(incorrect_msf_entries))) - - -""" -Write allele sequences to fasta for each gene -""" - -def writeGenFasta(gene_name, msf_fname, line_length): - try: - msf_file = open(msf_fname,'r') - msf_seq_dict = readMSF(msf_file) - msf_file.close() - except IOError: - print("\t%s msf file was skipped." % (gene_name)) - return - - gen_fasta_file = open('gen_fasta/%s_gen.fasta' % gene_name[3:].upper(), 'w') - - for allele, seq in msf_seq_dict.items(): - seq = seq.replace('.','') - print >> gen_fasta_file, ('>' + allele[3:].upper() + ' ' + str(len(seq)) + ' bp') - seq_lines = [seq[i:i+line_length] for i in range(0, len(seq), line_length)] - print >> gen_fasta_file, ('\n'.join(seq_lines)) - - gen_fasta_file.close() - print('%s_gen.fasta completed' % gene_name) - -def build_gen_fasta_files(): - os.system('mkdir gen_fasta') - - print("\nBuilding alleles sequence fasta files:") - for gene_name in gene_names: - writeGenFasta(gene_name, 'cyp_msf/%s_gen.msf' % gene_name[3:].upper(), 60) - - -""" -Run script -""" - -def extract_cyp_data(): - download_CYP(True) - build_msf_files() - check_msf_files() - build_gen_fasta_files() - -#################################################################################################### -## Debuging BLASTN alignment ref alleles - -def adjust_blast_vars(blast_vars_list,qry_pos): - if len(blast_vars_list) == 0: - return [] - - qry_pos = qry_pos - 1 - adj_blst_var_list = [] - - for var in blast_vars_list: - if '>' in var: # SNP - old_pos = int(var[:-3]) - adj_var = str(old_pos + qry_pos) + var[-3:] - adj_blst_var_list.append(adj_var) - elif 'del' in var: # deletion - old_pos = var.split('del')[0].split('_') - old_pos = [int(i) for i in old_pos] - old_pos = [i + qry_pos for i in old_pos] - if len(old_pos) == 1: - adj_var = str(old_pos[0]) + 'del' + var.split('del')[1] - else: - assert len(old_pos) == 2 - adj_var = str(old_pos[0]) + '_' + str(old_pos[1]) + 'del' + var.split('del')[1] - adj_blst_var_list.append(adj_var) - else: # insertion - assert 'ins' in var - old_pos = var.split('ins')[0].split('_') - old_pos = [int(i) for i in old_pos] - old_pos = [i + qry_pos for i in old_pos] - assert len(old_pos) == 2 and (old_pos[1] - old_pos[0] == 1) - adj_var = str(old_pos[0]) + '_' + str(old_pos[1]) + 'ins' + var.split('ins')[1] - adj_blst_var_list.append(adj_var) - - return adj_blst_var_list - -def extract_var_from_blast(cyp_blast_fname): - blastn_file = open(cyp_blast_fname,'r') - all_lines = [line.strip() for line in blastn_file if not (len(line.strip()) == 0 or line.strip().startswith('|'))] - blastn_file.close() - - id_match = [m.group(0) for l in all_lines[0:25] for m in [re.compile('.*(Identities.*).*').search(l)] if m][0] - id_match = id_match.split('%')[0].split(' (')[0].split('= ')[1].split('/') - id_match = [int(i) for i in id_match] - - # print(id_match) - assert len(id_match) == 2 and id_match[1] - id_match[0] >= 0 - if id_match[1] - id_match[0] == 0: - print('\tPerfect match using blastn') - return [] - - - start = -1 - end = -1 - for i in range(len(all_lines)): # Get rid of headers and footers - if all_lines[i].startswith('Score ='): - assert start == -1 - start = i - - if all_lines[i].startswith('Lambda'): - assert start != -1 and end == -1 - end = i - break - - all_lines = all_lines[start + 3 : end] - # print('\n'.join(all_lines)) - - blastn_var_list = [] - for i in range(0,len(all_lines),2): - qry_seq = '\t'.join(all_lines[i].split()) - qry_seq_pos = int(qry_seq.split('\t')[1]) - sbj_seq = '\t'.join(all_lines[i + 1].split()) - qry_seq = qry_seq.split('\t')[2].replace('-','.').upper() - sbj_seq = sbj_seq.split('\t')[2].replace('-','.').upper() - #print(qry_seq) - #print(sbj_seq) - - temp_var_list = msfToVarList(qry_seq, sbj_seq) - #print(str(qry_seq_pos) + '\t' + str(temp_var_list) + '\t' + str(adjust_blast_vars(temp_var_list,qry_seq_pos))) - temp_var_list = adjust_blast_vars(temp_var_list,qry_seq_pos) - blastn_var_list = blastn_var_list + temp_var_list - - return blastn_var_list - -# extract_var_from_blast('cyp_blast_alignment/cyp2d6_blast.align') - -extract_cyp_data() diff --git a/hisatgenotype_scripts/hisatgenotype_locus_samples.py b/hisatgenotype_scripts/hisatgenotype_locus_samples.py deleted file mode 100755 index 3de636a0..00000000 --- a/hisatgenotype_scripts/hisatgenotype_locus_samples.py +++ /dev/null @@ -1,354 +0,0 @@ -#!/usr/bin/env python - -# -# Copyright 2017, Daehwan Kim -# -# This file is part of HISAT-genotype. -# -# HISAT-genotype is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# HISAT-genotype is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with HISAT-genotype. If not, see . -# - - -import sys, os, subprocess, re, threading -import inspect -import random -import glob -from argparse import ArgumentParser, FileType -import hisatgenotype_typing_common as typing_common - - -# Platinum genomes - CEPH pedigree (17 family members) -CEPH_pedigree = { - "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]}, - "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]}, - "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]}, - "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]}, - "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]}, - - "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"}, - "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"}, - } - - - -""" -""" -class myThread(threading.Thread): - def __init__(self, - lock, - paths, - reference_type, - region_list, - num_editdist, - max_sample, - assembly, - out_dir, - genotype_results, - verbose): - threading.Thread.__init__(self) - self.lock = lock - self.paths = paths - self.reference_type = reference_type - self.region_list = region_list - self.num_editdist = num_editdist - self.max_sample = max_sample - self.assembly = assembly - self.out_dir = out_dir - self.genotype_results = genotype_results - self.verbose = verbose - - def run(self): - global work_idx - while True: - self.lock.acquire() - my_work_idx = work_idx - work_idx += 1 - self.lock.release() - if my_work_idx >= len(self.paths) or \ - my_work_idx >= self.max_sample: - return - worker(self.lock, - self.paths[my_work_idx], - self.reference_type, - self.region_list, - self.num_editdist, - self.assembly, - self.out_dir, - self.genotype_results, - self.verbose) - - -""" -""" -work_idx = 0 -def worker(lock, - path, - reference_type, - region_list, - num_editdist, - assembly, - out_dir, - genotype_results, - verbose): - fq_name = path.split('/')[-1] - read_dir = '/'.join(path.split('/')[:-1]) - genome = fq_name.split('.')[0] - if not fq_name.endswith("extracted.1.fq.gz"): - return - read_basename = fq_name[:fq_name.find("extracted.1.fq.gz")] - read_fname_1, read_fname_2 = "%s/%sextracted.1.fq.gz" % \ - (read_dir, read_basename), "%s/%sextracted.2.fq.gz" % (read_dir, read_basename) - - if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2): - return - lock.acquire() - print >> sys.stderr, genome - lock.release() - - for family, loci in region_list.items(): - test_hla_cmd = ["hisatgenotype_locus.py", - "--base", family] - if len(loci) > 0: - test_hla_cmd += ["--locus", ','.join(loci)] - test_hla_cmd += ["--num-editdist", str(num_editdist)] - test_hla_cmd += ["-1", read_fname_1, "-2", read_fname_2] - if assembly: - test_hla_cmd += ["--assembly"] - test_hla_cmd += ["--assembly-base"] - if out_dir != "": - test_hla_cmd += ["%s/%s" % (out_dir, genome)] - else: - test_hla_cmd += [genome] - - if verbose: - lock.acquire() - print >> sys.stderr, ' '.join(test_hla_cmd) - lock.release() - - proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - test_alleles = set() - output_list = [] - for line in proc.stdout: - line = line.strip() - if line.find("abundance") == -1: - continue - - rank, _, allele, _, abundance = line.split() - output_list.append([allele, float(abundance[:-2])]) - - lock.acquire() - for allele, abundance in output_list: - print >> sys.stdout, "%s\t%s\t%.2f" % (genome, allele, abundance) - genotype_results.append([genome, allele, abundance]) - sys.stdout.flush() - lock.release() - - -""" -""" -def genotyping(read_dir, - reference_type, - region_list, - num_editdist, - nthreads, - max_sample, - assembly, - out_dir, - verbose, - platinum_check): - for database_name in region_list: - # Extract variants, backbone sequence, and other sequeces - typing_common.extract_database_if_not_exists(database_name, - []) # locus_list - # Build HISAT2's graph index - typing_common.build_index_if_not_exists(database_name, - "hisat2", - "graph", - 1, # threads - verbose) - - if not os.path.exists(read_dir): - print >> sys.stderr, "Error: %s does not exist." % read_dir - sys.exit(1) - - if out_dir != "" and not os.path.exists(out_dir): - os.mkdir(out_dir) - - # fastq files - fq_fnames = glob.glob("%s/*.extracted.1.fq.gz" % read_dir) - - genotype_results = [] - - lock = threading.Lock() - threads = [] - for t in range(nthreads): - thread = myThread(lock, - fq_fnames, - reference_type, - region_list, - num_editdist, - max_sample, - assembly, - out_dir, - genotype_results, - verbose) - thread.start() - threads.append(thread) - - for thread in threads: - thread.join() - - - if platinum_check: - genotype_dic = {} - for genome, allele, abundance in genotype_results: - region, _ = allele.split('*') - if region not in genotype_dic: - genotype_dic[region] = {} - if genome not in genotype_dic[region]: - genotype_dic[region][genome] = [] - if len(genotype_dic[region][genome]) >= 2: - continue - # DK - debugging purposes - # if abundance < 0.15 * 100: - # continue - genotype_dic[region][genome].append([allele, abundance]) - - for region, region_genotype in genotype_dic.items(): - print >> sys.stderr, region - included, total = 0, 0 - for genome, genome_alleles in region_genotype.items(): - genome_alleles = set([allele for allele, _ in genome_alleles]) - if "father" in CEPH_pedigree[genome]: - assert "mother" in CEPH_pedigree[genome] - parents = [CEPH_pedigree[genome]["father"], CEPH_pedigree[genome]["mother"]] - else: - parents = [] - parent_allele_sets = [] - assert len(parents) in [0, 2] - if len(parents) == 2 and \ - parents[0] in region_genotype and \ - parents[1] in region_genotype: - for parent_allele, _ in region_genotype[parents[0]]: - for parent_allele2, _ in region_genotype[parents[1]]: - parent_allele_sets.append(set([parent_allele, parent_allele2])) - print >> sys.stderr, "\t", genome, genome_alleles, parent_allele_sets - if len(parent_allele_sets) > 0: - total += 1 - if genome_alleles in parent_allele_sets: - included += 1 - print >> sys.stderr, "\t%d / %d" % (included, total) - - -""" -""" -if __name__ == '__main__': - parser = ArgumentParser( - description='genotyping on many samples') - parser.add_argument("--reference-type", - dest="reference_type", - type=str, - default="gene", - help="Reference type: gene, chromosome, and genome (default: gene)") - parser.add_argument("--region-list", - dest="region_list", - type=str, - default="", - help="A comma-separated list of regions (default: empty)") - parser.add_argument('--read-dir', - dest="read_dir", - type=str, - default="", - help='read directory (e.g. read_input)') - parser.add_argument("--num-editdist", - dest="num_editdist", - type=int, - default=2, - help="Maximum number of mismatches per read alignment to be considered (default: 2)") - parser.add_argument("-p", "--threads", - dest="threads", - type=int, - default=1, - help="Number of threads") - parser.add_argument('--assembly', - dest='assembly', - action='store_true', - help='Perform assembly') - parser.add_argument("--max-sample", - dest="max_sample", - type=int, - default=sys.maxint, - help="Number of samples to be analyzed (default: sys.maxint)") - parser.add_argument("--out-dir", - dest="out_dir", - type=str, - default="", - help='Output directory (default: (empty))') - parser.add_argument('-v', '--verbose', - dest='verbose', - action='store_true', - help='also print some statistics to stderr') - parser.add_argument('--platinum-check', - dest='platinum_check', - action='store_true', - help='Check for concordance of platinum genomes') - - args = parser.parse_args() - - if args.read_dir == "": - print >> sys.stderr, "Error: please specify --read-dir." - sys.exit(1) - - if not args.reference_type in ["gene", "chromosome", "genome"]: - print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type) - sys.exit(1) - - region_list = {} - if args.region_list != "": - for region in args.region_list.split(','): - region = region.split('.') - if len(region) < 1 or len(region) > 2: - print >> sys.stderr, "Error: --region-list is incorrectly formatted." - sys.exit(1) - - family = region[0].lower() - if len(region) == 2: - locus_name = region[1].upper() - if family not in region_list: - region_list[family] = set() - if len(region) == 2: - region_list[family].add(locus_name) - - genotyping(args.read_dir, - args.reference_type, - region_list, - args.num_editdist, - args.threads, - args.max_sample, - args.assembly, - args.out_dir, - args.verbose, - args.platinum_check) - diff --git a/hisatgenotype_scripts/run_extract_CP.sh b/hisatgenotype_scripts/run_extract_CP.sh deleted file mode 100755 index ceca077e..00000000 --- a/hisatgenotype_scripts/run_extract_CP.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.HLA.CP.extract.genome -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=40 -#SBATCH --mem=400G -#SBATCH --partition=lrgmem -#SBATCH --time=166:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/scripts/extract_reads.py --base-fname genotype_genome --reference-type genome --read-dir /home-1/dkim136@jhu.edu/aszalay1/genomes --out-dir CP_80 -p 40 --max-sample 80 --job-range 0,2 - diff --git a/hisatgenotype_scripts/run_extract_ILMN.sh b/hisatgenotype_scripts/run_extract_ILMN.sh deleted file mode 100755 index 3aaf0cbb..00000000 --- a/hisatgenotype_scripts/run_extract_ILMN.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.HLA.ILMN.extract.genome -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=17 -#SBATCH --mem=120G -#SBATCH --partition=shared -#SBATCH --time=166:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/scripts/extract_reads.py --base-fname genotype_genome --reference-type genome --read-dir /home-1/dkim136@jhu.edu/ssalzbe1/users/infphilo/platinum_genomes --out-dir ILMN -p 17 - diff --git a/hisatgenotype_scripts/run_genotype_build.sh b/hisatgenotype_scripts/run_genotype_build.sh deleted file mode 100755 index ac2a3363..00000000 --- a/hisatgenotype_scripts/run_genotype_build.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.genotype -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=4 -#SBATCH --mem=400G -#SBATCH --partition=lrgmem -#SBATCH --time=168:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/hisatgenotype_build_genome.py -p 4 --verbose --commonvar genome.fa genotype_genome diff --git a/hisatgenotype_scripts/run_hisat2_build.sh b/hisatgenotype_scripts/run_hisat2_build.sh deleted file mode 100755 index 15d25611..00000000 --- a/hisatgenotype_scripts/run_hisat2_build.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.genotype.hisat2-build -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=4 -#SBATCH --mem=400G -#SBATCH --partition=lrgmem -#SBATCH --time=168:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/hisat2/evaluation/tests/genotype - -/home-1/dkim136@jhu.edu/infphilo/hisat2/hisat2/hisat2-build -p 4 --snp genotype_genome.snp --haplotype genotype_genome.haplotype genotype_genome.fa genotype_genome diff --git a/hisatgenotype_scripts/run_type_CP.sh b/hisatgenotype_scripts/run_type_CP.sh deleted file mode 100755 index 4fd54ffd..00000000 --- a/hisatgenotype_scripts/run_type_CP.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -l -#SBATCH --job-name=infphio.HLA.CP -#SBATCH --nodes=1 -#SBATCH --cpus-per-task=24 -#SBATCH --mem=64G -#SBATCH --partition=shared -#SBATCH --time=12:0:0 -#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_CP_extract_genome_partial - -/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_CP_extract_genome_partial/hisat2_test_HLA_genotyping_CP.py CP --num-editdist 2 -p 24 > cp_hla.txt From 3ce220da6ac1bc949ce21359406f2b42b6dfa5b5 Mon Sep 17 00:00:00 2001 From: Christopher Bennett Date: Mon, 17 Feb 2020 14:13:58 -0600 Subject: [PATCH 3/5] Remove genotype in Makefule --- Makefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/Makefile b/Makefile index b8399819..60445ce6 100644 --- a/Makefile +++ b/Makefile @@ -237,8 +237,6 @@ HT2LIB_PKG_SRC = \ GENERAL_LIST = $(wildcard scripts/*.sh) \ $(wildcard scripts/*.pl) \ $(wildcard *.py) \ - $(wildcard hisatgenotype_modules/*.py) \ - $(wildcard hisatgenotype_scripts/*.py) \ $(wildcard example/index/*.ht2) \ $(wildcard example/reads/*.fa) \ example/reference/22_20-21M.fa \ From 640d0804cde16be94e4b0ba0c70c77100ff77cc6 Mon Sep 17 00:00:00 2001 From: Chanhee Park Date: Fri, 27 Mar 2020 11:58:34 -0500 Subject: [PATCH 4/5] Support python3 --- hisat2_extract_exons.py | 4 +- hisat2_extract_snps_haplotypes_UCSC.py | 48 ++++++------ hisat2_extract_snps_haplotypes_VCF.py | 44 ++++++----- hisat2_extract_splice_sites.py | 12 ++- hisat2_read_statistics.py | 36 +++++---- hisat2_simulate_reads.py | 104 +++++++++++++++---------- 6 files changed, 137 insertions(+), 111 deletions(-) diff --git a/hisat2_extract_exons.py b/hisat2_extract_exons.py index 201f8328..50602f2f 100755 --- a/hisat2_extract_exons.py +++ b/hisat2_extract_exons.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Copyright 2015, Daehwan Kim @@ -19,8 +19,6 @@ # along with HISAT 2. If not, see . # -from __future__ import print_function - from sys import stderr, exit from collections import defaultdict as dd, Counter from argparse import ArgumentParser, FileType diff --git a/hisat2_extract_snps_haplotypes_UCSC.py b/hisat2_extract_snps_haplotypes_UCSC.py index f2a5aba3..f90b2dcb 100755 --- a/hisat2_extract_snps_haplotypes_UCSC.py +++ b/hisat2_extract_snps_haplotypes_UCSC.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Copyright 2015, Daehwan Kim @@ -23,6 +23,7 @@ import sys, subprocess import re from argparse import ArgumentParser, FileType +from functools import cmp_to_key """ @@ -80,8 +81,8 @@ def compare_vars(a, b): # daehwan - for debugging purposes if a_chr != b_chr: - print a - print b + print(a) + print(b) assert a_chr == b_chr if a_pos != b_pos: @@ -129,7 +130,7 @@ def generate_haplotypes(snp_file, assert len(vars) > 0 # Sort variants and remove redundant variants - vars = sorted(vars, cmp=compare_vars) + vars = sorted(vars, key=cmp_to_key(compare_vars)) tmp_vars = [] v = 0 while v < len(vars): @@ -223,8 +224,8 @@ def generate_haplotypes(snp_file, else: assert type == 'I' type = "insertion" - print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \ - (varID, type, chr, pos, data) + print("%s\t%s\t%s\t%s\t%s" % (varID, type, chr, pos, data), + file=snp_file) # genotypes_list looks like # Var0: 0 @@ -270,7 +271,7 @@ def split_haplotypes(haplotypes): split_haplotypes.add('#'.join(haplotype[prev_s:s])) return split_haplotypes - haplotypes2 = split_haplotypes(haplotypes) + haplotypes2 = sorted(list(split_haplotypes(haplotypes))) def cmp_haplotype(a, b): a = a.split('#') @@ -288,8 +289,8 @@ def cmp_haplotype(a, b): if a_begin != b_begin: return a_begin - b_begin return a_end - b_end - - haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype) + + haplotypes = sorted(list(haplotypes2), key=cmp_to_key(cmp_haplotype)) # Write haplotypes for h_i in range(len(haplotypes)): @@ -317,8 +318,8 @@ def cmp_haplotype(a, b): for id in h: var_dic = vars[int(id)][4] h_add.append(var_dic["id2"]) - print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)) + print("ht%d\t%s\t%d\t%d\t%s" % (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)), + file=haplotype_file) num_haplotypes += 1 return num_haplotypes @@ -352,6 +353,7 @@ def main(genome_file, else: snp_cmd = ["cat", snp_fname] snp_proc = subprocess.Popen(snp_cmd, + text=True, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) ids_seen = set() @@ -447,10 +449,10 @@ def main(genome_file, if testset: ref_seq = chr_seq[start-50:start+50] alt_seq = chr_seq[start-50:start] + allele + chr_seq[start+1:start+50] - print >> ref_testset_file, ">%s_single_%d" % (rs_id, start - 50) - print >> ref_testset_file, ref_seq - print >> alt_testset_file, ">%s_single_%d_%s" % (rs_id, start - 50, ref_seq) - print >> alt_testset_file, alt_seq + print(">%s_single_%d" % (rs_id, start - 50), file=ref_testset_file) + print(ref_seq, file=ref_testset_file) + print(">%s_single_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file) + print(alt_seq, file=alt_testset_file) elif classType == "deletion": if start > 0: @@ -475,10 +477,10 @@ def main(genome_file, if testset and delLen > 0 and delLen <= 10: ref_seq = chr_seq[start-50:start+50] alt_seq = chr_seq[start-50:start] + chr_seq[start+delLen:start+50+delLen] - print >> ref_testset_file, ">%s_deletion_%d" % (rs_id, start - 50) - print >> ref_testset_file, ref_seq - print >> alt_testset_file, ">%s_deletion_%d_%s" % (rs_id, start - 50, ref_seq) - print >> alt_testset_file, alt_seq + print(">%s_deletion_%d" % (rs_id, start - 50), file=ref_testset_file) + print(ref_seq, file=ref_testset_file) + print(">%s_deletion_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file) + print(alt_seq, file=alt_testset_file) else: assert classType == "insertion" if start > 0: @@ -497,10 +499,10 @@ def main(genome_file, if testset and insLen > 0 and insLen <= 10: ref_seq = chr_seq[start-50:start+50] alt_seq = chr_seq[start-50:start] + allele + chr_seq[start:start+50-insLen] - print >> ref_testset_file, ">%s_insertion_%d" % (rs_id, start - 50) - print >> ref_testset_file, ref_seq - print >> alt_testset_file, ">%s_insertion_%d_%s" % (rs_id, start - 50, ref_seq) - print >> alt_testset_file, alt_seq + print(">%s_insertion_%d" % (rs_id, start - 50), file=ref_testset_file) + print(ref_seq, file=ref_testset_file) + print(">%s_insertion_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file) + print(alt_seq, file=alt_testset_file) if curr_right < end: curr_right = end diff --git a/hisat2_extract_snps_haplotypes_VCF.py b/hisat2_extract_snps_haplotypes_VCF.py index c365821a..873b1ede 100755 --- a/hisat2_extract_snps_haplotypes_VCF.py +++ b/hisat2_extract_snps_haplotypes_VCF.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Copyright 2016, Daehwan Kim # @@ -21,6 +21,7 @@ import sys, os, subprocess from argparse import ArgumentParser, FileType +from functools import cmp_to_key digit2str = [str(i) for i in range(10)] @@ -100,11 +101,11 @@ def extract_vars(chr_dic, chr, pos, ref_allele, alt_alleles, varID): ref_allele2, pos2 = ref_allele, pos if chr_seq[pos:pos+len(ref_allele)] != ref_allele: - print >> sys.stderr, "Error: the reference genome you provided seems to be incompatible with the VCF file at %d of chromosome %s where %s is in the reference genome while %s is in the VCF file" % (pos, chr, chr_seq[pos:pos+len(ref_allele)], ref_allele) + print("Error: the reference genome you provided seems to be incompatible with the VCF file at %d of chromosome %s where %s is in the reference genome while %s is in the VCF file" % (pos, chr, chr_seq[pos:pos+len(ref_allele)], ref_allele), file=sys.stderr) def warning_msg(): - print >> sys.stderr, "Warning) ref allele (%s) and alt allele (%s in %s) at chr%s:%d are excluded." % \ - (ref_allele, alt_allele, ','.join(alt_alleles), chr, pos + 1) + print("Warning) ref allele (%s) and alt allele (%s in %s) at chr%s:%d are excluded." % \ + (ref_allele, alt_allele, ','.join(alt_alleles), chr, pos + 1), file=sys.stderr) min_len = min(len(ref_allele2), len(alt_allele2)) if min_len >= 2: @@ -170,7 +171,7 @@ def generate_haplotypes(snp_file, assert len(vars) > 0 # Sort variants and remove redundant variants - vars = sorted(vars, cmp=compare_vars) + vars = sorted(vars, key=cmp_to_key(compare_vars)) tmp_vars = [] v = 0 while v < len(vars): @@ -203,8 +204,8 @@ def generate_haplotypes(snp_file, else: assert type == 'I' type = "insertion" - print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \ - (varID, type, chr, pos, data) + print("%s\t%s\t%s\t%s\t%s" % \ + (varID, type, chr, pos, data), file=snp_file) # variant compatibility vars_cmpt = [-1 for i in range(len(vars))] @@ -363,7 +364,7 @@ def split_haplotypes(haplotypes): split_haplotypes.add('#'.join(haplotype[prev_s:s])) return split_haplotypes - haplotypes2 = split_haplotypes(haplotypes) + haplotypes2 = sorted(list(split_haplotypes(haplotypes))) def cmp_haplotype(a, b): a = a.split('#') @@ -382,7 +383,7 @@ def cmp_haplotype(a, b): return a_begin - b_begin return a_end - b_end - haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype) + haplotypes = sorted(list(haplotypes2), key=cmp_to_key(cmp_haplotype)) # daehwan - for debugging purposes """ @@ -424,8 +425,8 @@ def cmp_haplotype(a, b): for id in h: var_dic = vars[int(id)][4] h_add.append(var_dic["id2"]) - print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \ - (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)) + print("ht%d\t%s\t%d\t%d\t%s" % \ + (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)), file=haplotype_file) num_haplotypes += 1 return num_haplotypes @@ -464,6 +465,7 @@ def main(genome_file, else: vcf_cmd = ["cat", genotype_vcf] vcf_proc = subprocess.Popen(vcf_cmd, + text=True, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) for line in vcf_proc.stdout: @@ -525,17 +527,17 @@ def main(genome_file, var_set.add(var_str) - print >> sys.stderr, "Number of variants in %s is:" % (genotype_vcf) + print("Number of variants in %s is:" % (genotype_vcf), file=sys.stderr) for chr, vars in genotype_var_list.items(): vars = sorted(vars, cmp=compare_vars) - print >> sys.stderr, "\tChromosome %s: %d variants" % (chr, len(vars)) + print("\tChromosome %s: %d variants" % (chr, len(vars)), file=sys.stderr) for chr, gene_ranges in genotype_ranges.items(): for gene, value in gene_ranges.items(): gene_ranges[gene] = [value[0] - 100, value[1] + 100] value = genotype_ranges[chr][gene] if verbose: - print >> sys.stderr, "%s\t%s\t%d-%d" % (chr, gene, value[0], value[1]) + print("%s\t%s\t%d-%d" % (chr, gene, value[0], value[1]), file=sys.stderr) if extra_files or True: clnsig_file = open("%s.clnsig" % base_fname, 'w') @@ -544,7 +546,7 @@ def main(genome_file, varID = var[4]["id2"] CLNSIG = var[4]["CLNSIG"] gene = var[4]["gene"] - print >> clnsig_file, "%s\t%s\t%s" % (varID, gene, CLNSIG) + print("%s\t%s\t%s" % (varID, gene, CLNSIG), file=clnsig_file) clnsig_file.close() SNP_file = open("%s.snp" % base_fname, 'w') @@ -558,7 +560,7 @@ def main(genome_file, left, right = value if reference_type == "gene": left, right = 0, right - left - print >> ref_file, "%s\t%s\t%d\t%d" % (gene, chr, left, right) + print("%s\t%s\t%d\t%d" % (gene, chr, left, right), file=ref_file) ref_file.close() if reference_type == "gene": @@ -567,10 +569,10 @@ def main(genome_file, for gene, value in gene_ranges.items(): left, right = value left, right = 0, right - left - print >> backbone_file, ">%s" % (gene) + print(">%s" % (gene), file=backbone_file) backbone_seq = chr_dic[chr][value[0]:value[1]+1] for s in range(0, len(backbone_seq), 60): - print >> backbone_file, backbone_seq[s:s+60] + print(backbone_seq[s:s+60], file=backbone_file) backbone_file.close() elif reference_type == "chromosome": first = True @@ -604,6 +606,7 @@ def main(genome_file, else: vcf_cmd = ["cat", VCF_fname] vcf_proc = subprocess.Popen(vcf_cmd, + text=True, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) @@ -665,7 +668,7 @@ def main(genome_file, offset = 0 gene = None if num_lines % 10000 == 1: - print >> sys.stderr, "\t%s:%d\r" % (chr, pos), + print("\t%s:%d\r" % (chr, pos), file=sys.stderr) if chr_genotype_ranges: skip = True @@ -883,6 +886,7 @@ def add_vars(pos, else: vcf_cmd = ["cat", args.genotype_vcf] vcf_proc = subprocess.Popen(vcf_cmd, + text=True, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w')) for line in vcf_proc.stdout: @@ -900,7 +904,7 @@ def add_vars(pos, args.genotype_gene_list = args.genotype_gene_list.split(',') if len(args.genotype_gene_list) == 0: - print >> sys.stderr, "Error: please specify --genotype-gene-list." + print("Error: please specify --genotype-gene-list.", file=sys.stderr) sys.exit(1) else: diff --git a/hisat2_extract_splice_sites.py b/hisat2_extract_splice_sites.py index 59862882..cba92347 100755 --- a/hisat2_extract_splice_sites.py +++ b/hisat2_extract_splice_sites.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Copyright 2015, Daehwan Kim @@ -19,8 +19,6 @@ # along with HISAT 2. If not, see . # -from __future__ import print_function - from sys import stderr, exit from collections import defaultdict as dd, Counter from argparse import ArgumentParser, FileType @@ -105,18 +103,18 @@ def extract_splice_sites(gtf_file, verbose=False): len(genes), sum(len(v) > 1 for v in genes.values())), file=stderr) print('transcripts: {}, transcript avg. length: {:.0f}'.format( - len(trans), sum(trans_lengths.elements())/len(trans)), + len(trans), sum(trans_lengths.elements())//len(trans)), file=stderr) print('exons: {}, exon avg. length: {:.0f}'.format( sum(exon_lengths.values()), - sum(exon_lengths.elements())/sum(exon_lengths.values())), + sum(exon_lengths.elements())//sum(exon_lengths.values())), file=stderr) print('introns: {}, intron avg. length: {:.0f}'.format( sum(intron_lengths.values()), - sum(intron_lengths.elements())/sum(intron_lengths.values())), + sum(intron_lengths.elements())//sum(intron_lengths.values())), file=stderr) print('average number of exons per transcript: {:.0f}'.format( - sum(exon_lengths.values())/len(trans)), + sum(exon_lengths.values())//len(trans)), file=stderr) diff --git a/hisat2_read_statistics.py b/hisat2_read_statistics.py index e6d0d1c3..d4a40d6f 100755 --- a/hisat2_read_statistics.py +++ b/hisat2_read_statistics.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Copyright 2018, Chanhee Park and Daehwan Kim @@ -42,7 +42,7 @@ def parser_FQ(fp): return if line[0] == '@': - break; + break while True: id = line[1:].split()[0] @@ -55,9 +55,9 @@ def parser_FQ(fp): seq = line.strip() yield id, seq - line = fp.readline() # '+' - line = fp.readline() # quality - line = fp.readline() # next ID + line = fp.readline() # '+' + line = fp.readline() # quality + line = fp.readline() # next ID if line == "": return @@ -73,7 +73,7 @@ def parser_FA(fp): return if line[0] == '>': - break; + break while True: id = line[1:].split()[0] @@ -119,10 +119,10 @@ def parse_type(fname): """ """ def generate_stats(length_map): - mn = 0 # minimun read length - mx = 0 # maximum read length - cnt = 0 # number of reads - avg = 0 # average read length + mn = 0 # minimun read length + mx = 0 # maximum read length + cnt = 0 # number of reads + avg = 0 # average read length sum = 0 @@ -135,11 +135,11 @@ def generate_stats(length_map): mn = sorted_map[0] mx = sorted_map[-1] - for k in sorted(length_map): - sum += int(k) * length_map[k] - cnt += length_map[k] + for k, v in length_map.items(): + sum += k * v + cnt += v - avg = sum / cnt + avg = sum // cnt return cnt, mn, mx, avg @@ -179,10 +179,12 @@ def reads_stat(read_file, read_count): fp.close() cnt, mn, mx, avg = generate_stats(length_map) - length_map = sorted(length_map.iteritems(), key=lambda (k,v):(v,k), reverse=True) + # sort by (read count, read length) + length_map = sorted(length_map.items(), key=lambda t: (t[1], t[0]), reverse=True) if len(length_map) == 0: - length_map.append((0,0)) - print cnt, mn, mx, avg, ",".join([str(k) for (k,v) in length_map]) + length_map.append((0, 0)) + print(cnt, mn, mx, avg, ",".join([str(k) for (k,v) in length_map])) + if __name__ == '__main__': diff --git a/hisat2_simulate_reads.py b/hisat2_simulate_reads.py index e5c63e6b..8e8561e5 100755 --- a/hisat2_simulate_reads.py +++ b/hisat2_simulate_reads.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # # Copyright 2015, Daehwan Kim # @@ -51,6 +51,13 @@ def reverse_complement(seq): return result +""" +python2 style randint +""" +def myrandint(m, x): + s = x - m + 1 + return m + int(random.random() * s) + """ Random source for sequencing errors """ @@ -91,7 +98,7 @@ def read_genome(genome_file): chr_dic[chr_name] = sequence - chr_filter = [str(x) for x in range(1, 23) + ['X', 'Y']] + chr_filter = [str(x) for x in list(range(1, 23)) + ['X', 'Y']] #chr_filter = None if chr_filter: @@ -120,7 +127,7 @@ def read_transcript(genome_seq, gtf_file, frag_len): strand, frame, values = line.split('\t') except ValueError: continue - if not chrom in genome_seq: + if chrom not in genome_seq: continue # Zero-based offset @@ -214,7 +221,7 @@ def sanity_check_input(genome_seq, genes, transcripts, snps, frag_len): num_ss += 1 if num_ss > 0: - print >> sys.stderr, "GT/AG splice sites: {}/{} ({:.2%})".format(num_canon_ss, num_ss, (float(num_canon_ss) / num_ss)) + print("GT/AG splice sites: {}/{} ({:.2%})".format(num_canon_ss, num_ss, (float(num_canon_ss) / num_ss)), file=sys.stderr) num_alt_single, num_single = 0, 0 for chr, chr_snps in snps.items(): @@ -235,7 +242,7 @@ def sanity_check_input(genome_seq, genes, transcripts, snps, frag_len): num_single += 1 if num_single > 0: - print >> sys.stderr, "Alternative bases: {}/{} ({:.2%})".format(num_alt_single, num_single, (float(num_alt_single) / num_single)) + print("Alternative bases: {}/{} ({:.2%})".format(num_alt_single, num_single, (float(num_alt_single) / num_single)), file=sys.stderr) """ @@ -260,6 +267,7 @@ def calc_expr(x, a): expr_sum = sum(expr_profile) expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))] assert abs(sum(expr_profile) - 1.0) < 0.001 + #print(expr_sum, expr_profile, file=sys.stderr) return expr_profile @@ -280,7 +288,7 @@ def generate_dna_expr_profile(genome_seq): def getSNPs(chr_snps, left, right): low, high = 0, len(chr_snps) while low < high: - mid = (low + high) / 2 + mid = (low + high) // 2 snpID, type, pos, data = chr_snps[mid] if pos < left: low = mid + 1 @@ -370,7 +378,8 @@ def getSamAlignment(rna, exons, chr_seq, trans_seq, frag_pos, read_len, chr_snps if err_rand_src.getRand() == 1: assert i < len(chr_seq) err_base = "A" - rand = random.randint(0, 2) + #rand = random.randint(0, 2) + rand = myrandint(0, 2) if chr_seq[i] == "A": err_base = "GCT"[rand] elif chr_seq[i] == "C": @@ -382,10 +391,10 @@ def getSamAlignment(rna, exons, chr_seq, trans_seq, frag_pos, read_len, chr_snps mms.append(["", "single", i, err_base]) tmp_diffs = snps + mms - def diff_sort(a , b): - return a[2] - b[2] +# def diff_sort(a , b): +# return a[2] - b[2] - tmp_diffs = sorted(tmp_diffs, cmp=diff_sort) + tmp_diffs = sorted(tmp_diffs, key=lambda t: t[2]) diffs = [] if len(tmp_diffs) > 0: diffs = tmp_diffs[:1] @@ -545,8 +554,8 @@ def diff_sort(a , b): MD += ("{}".format(MD_match_len)) if len(read_seq) != read_len: - print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len - print >> sys.stderr, pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs + print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr) + print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr) assert False return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq @@ -676,25 +685,27 @@ def samRepOk(genome_seq, read_seq, chr, pos, cigar, XM, NM, MD, Zs, max_mismatch tMD += ("{}".format(match_len)) if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM: - print >> sys.stderr, chr, pos, cigar, MD, XM, NM, Zs - print >> sys.stderr, tMD, tXM, tNM + print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr) + print(tMD, tXM, tNM, file=sys.stderr) assert False """ """ -def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ - rna, paired_end, read_len, frag_len, \ +def simulate_reads(genome_file, gtf_file, snp_file, base_fname, + rna, paired_end, read_len, frag_len, num_frag, expr_profile_type, repeat_fname, - error_rate, max_mismatch, \ + error_rate, max_mismatch, random_seed, snp_prob, sanity_check, verbose): - random.seed(random_seed) + print('random seed', random_seed, file=sys.stderr) + random.seed(random_seed, version=1) err_rand_src = ErrRandomSource(error_rate / 100.0) if read_len > frag_len: frag_len = read_len genome_seq = read_genome(genome_file) + #print(genome_seq) if rna: genes, transcripts = read_transcript(genome_seq, gtf_file, frag_len) else: @@ -716,6 +727,8 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ for i in range(min(num_frag - sum(expr_profile), len(expr_profile))): expr_profile[i] += 1 assert num_frag == sum(expr_profile) + + #print(expr_profile) repeat_loci = {} if repeat_fname != "" and os.path.exists(repeat_fname): @@ -730,18 +743,23 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ repeat_loci[chr].append([int(pos), strand]) if rna: - transcript_ids = transcripts.keys() - random.shuffle(transcript_ids) + transcript_ids = sorted(list(transcripts.keys())) + #transcript_ids = list(transcripts.keys()) + #random.shuffle(transcript_ids) assert len(transcript_ids) >= len(expr_profile) else: - chr_ids = genome_seq.keys() + transcript_ids = list() + chr_ids = list(genome_seq.keys()) + + for k in transcript_ids: + print(k, file=sys.stderr) sam_file = open(base_fname + ".sam", "w") # Write SAM header - print >> sam_file, "@HD\tVN:1.0\tSO:unsorted" + print("@HD\tVN:1.0\tSO:unsorted", file=sam_file) for chr in genome_seq.keys(): - print >> sam_file, "@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr])) + print("@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr])), file=sam_file) read_file = open(base_fname + "_1.fa", "w") if paired_end: @@ -753,10 +771,10 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ if rna: transcript_id = transcript_ids[t] chr, strand, transcript_len, exons = transcripts[transcript_id] - print >> sys.stderr, transcript_id, t_num_frags + print(transcript_id, t_num_frags, file=sys.stderr) else: chr = chr_ids[t] - print >> sys.stderr, chr, t_num_frags + print(chr, t_num_frags, file=sys.stderr) assert chr in genome_seq chr_seq = genome_seq[chr] @@ -783,14 +801,17 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ for f in range(t_num_frags): if rna: - frag_pos = random.randint(0, transcript_len - frag_len) + #frag_pos = random.randint(0, transcript_len - frag_len) + frag_pos = myrandint(0, transcript_len - frag_len) else: while True: if len(chr_repeat_loci): - locus_id = random.randint(0, len(chr_repeat_loci) - 1) + #locus_id = random.randint(0, len(chr_repeat_loci) - 1) + locus_id = myrandint(0, len(chr_repeat_loci) - 1) frag_pos = chr_repeat_loci[locus_id][0] else: - frag_pos = random.randint(0, chr_len - frag_len) + #frag_pos = random.randint(0, chr_len - frag_len) + frag_pos = myrandint(0, chr_len - frag_len) if 'N' not in chr_seq[frag_pos:frag_pos + frag_len]: break @@ -801,7 +822,8 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ pos2, cigars2, cigar2_descs, MD2, XM2, NM2, Zs2, read2_seq = getSamAlignment(rna, exons, chr_seq, t_seq, frag_pos+frag_len-read_len, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch) swapped = False if paired_end: - if random.randint(0, 1) == 1: + #if random.randint(0, 1) == 1: + if myrandint(0, 1) == 1: swapped = True if swapped: flag, flag2 = flag - 16, flag2 - 16 @@ -830,19 +852,19 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ else: XS, TI = "", "" - print >> read_file, ">{}".format(cur_read_id) + print(">{}".format(cur_read_id), file=read_file) if swapped: - print >> read_file, reverse_complement(read_seq) + print(reverse_complement(read_seq), file=read_file) else: - print >> read_file, read_seq - print >> sam_file, "{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag, chr, pos + 1, cigar_str, chr, pos2 + 1, read_seq, XM, NM, MD, Zs, XS, TI) + print(read_seq, file=read_file) + print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag, chr, pos + 1, cigar_str, chr, pos2 + 1, read_seq, XM, NM, MD, Zs, XS, TI), file=sam_file) if paired_end: - print >> read2_file, ">{}".format(cur_read_id) + print(">{}".format(cur_read_id), file=read2_file) if swapped: - print >> read2_file, read2_seq + print(read2_seq, file=read2_file) else: - print >> read2_file, reverse_complement(read2_seq) - print >> sam_file, "{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, XM2, NM2, MD2, Zs2, XS, TI) + print(reverse_complement(read2_seq), file=read2_file) + print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, XM2, NM2, MD2, Zs2, XS, TI), file=sam_file) cur_read_id += 1 @@ -952,8 +974,8 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \ exit(1) if not args.rna: args.expr_profile = "constant" - simulate_reads(args.genome_file, args.gtf_file, args.snp_file, args.base_fname, \ - args.rna, args.paired_end, args.read_len, args.frag_len, \ - args.num_frag, args.expr_profile, args.repeat_fname, \ - args.error_rate, args.max_mismatch, \ + simulate_reads(args.genome_file, args.gtf_file, args.snp_file, args.base_fname, + args.rna, args.paired_end, args.read_len, args.frag_len, + args.num_frag, args.expr_profile, args.repeat_fname, + args.error_rate, args.max_mismatch, args.random_seed, args.snp_prob, args.sanity_check, args.verbose) From b7180c3f4edfa151ad7d5ffe96d1f7fb63ead71f Mon Sep 17 00:00:00 2001 From: Chanhee Park Date: Tue, 31 Mar 2020 15:40:06 -0500 Subject: [PATCH 5/5] removed unnecessary codes --- hisat2_read_statistics.py | 1 - hisat2_simulate_reads.py | 12 +----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/hisat2_read_statistics.py b/hisat2_read_statistics.py index d4a40d6f..5f5d7c1f 100755 --- a/hisat2_read_statistics.py +++ b/hisat2_read_statistics.py @@ -185,7 +185,6 @@ def reads_stat(read_file, read_count): length_map.append((0, 0)) print(cnt, mn, mx, avg, ",".join([str(k) for (k,v) in length_map])) - if __name__ == '__main__': parser = ArgumentParser( diff --git a/hisat2_simulate_reads.py b/hisat2_simulate_reads.py index 8e8561e5..c503522a 100755 --- a/hisat2_simulate_reads.py +++ b/hisat2_simulate_reads.py @@ -267,7 +267,6 @@ def calc_expr(x, a): expr_sum = sum(expr_profile) expr_profile = [expr_profile[i] / expr_sum for i in range(len(expr_profile))] assert abs(sum(expr_profile) - 1.0) < 0.001 - #print(expr_sum, expr_profile, file=sys.stderr) return expr_profile @@ -697,7 +696,6 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, num_frag, expr_profile_type, repeat_fname, error_rate, max_mismatch, random_seed, snp_prob, sanity_check, verbose): - print('random seed', random_seed, file=sys.stderr) random.seed(random_seed, version=1) err_rand_src = ErrRandomSource(error_rate / 100.0) @@ -705,7 +703,6 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, frag_len = read_len genome_seq = read_genome(genome_file) - #print(genome_seq) if rna: genes, transcripts = read_transcript(genome_seq, gtf_file, frag_len) else: @@ -728,8 +725,6 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, expr_profile[i] += 1 assert num_frag == sum(expr_profile) - #print(expr_profile) - repeat_loci = {} if repeat_fname != "" and os.path.exists(repeat_fname): for line in open(repeat_fname): @@ -744,16 +739,11 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, if rna: transcript_ids = sorted(list(transcripts.keys())) - #transcript_ids = list(transcripts.keys()) - #random.shuffle(transcript_ids) + random.shuffle(transcript_ids, random=random.random) assert len(transcript_ids) >= len(expr_profile) else: - transcript_ids = list() chr_ids = list(genome_seq.keys()) - for k in transcript_ids: - print(k, file=sys.stderr) - sam_file = open(base_fname + ".sam", "w") # Write SAM header