diff --git a/Makefile b/Makefile
index b8399819..60445ce6 100644
--- a/Makefile
+++ b/Makefile
@@ -237,8 +237,6 @@ HT2LIB_PKG_SRC = \
 GENERAL_LIST = $(wildcard scripts/*.sh) \
 	$(wildcard scripts/*.pl) \
 	$(wildcard *.py) \
-	$(wildcard hisatgenotype_modules/*.py) \
-	$(wildcard hisatgenotype_scripts/*.py) \
 	$(wildcard example/index/*.ht2) \
 	$(wildcard example/reads/*.fa) \
 	example/reference/22_20-21M.fa \
diff --git a/hisat2_extract_exons.py b/hisat2_extract_exons.py
index 201f8328..50602f2f 100755
--- a/hisat2_extract_exons.py
+++ b/hisat2_extract_exons.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <infphilo@gmail.com>
@@ -19,8 +19,6 @@
 # along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-from __future__ import print_function
-
 from sys import stderr, exit
 from collections import defaultdict as dd, Counter
 from argparse import ArgumentParser, FileType
diff --git a/hisat2_extract_snps_haplotypes_UCSC.py b/hisat2_extract_snps_haplotypes_UCSC.py
index f2a5aba3..f90b2dcb 100755
--- a/hisat2_extract_snps_haplotypes_UCSC.py
+++ b/hisat2_extract_snps_haplotypes_UCSC.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <infphilo@gmail.com>
@@ -23,6 +23,7 @@
 import sys, subprocess
 import re
 from argparse import ArgumentParser, FileType
+from functools import cmp_to_key
 
 
 """
@@ -80,8 +81,8 @@ def compare_vars(a, b):
 
     # daehwan - for debugging purposes
     if a_chr != b_chr:
-        print a
-        print b
+        print(a)
+        print(b)
     
     assert a_chr == b_chr
     if a_pos != b_pos:
@@ -129,7 +130,7 @@ def generate_haplotypes(snp_file,
     assert len(vars) > 0
 
     # Sort variants and remove redundant variants
-    vars = sorted(vars, cmp=compare_vars)
+    vars = sorted(vars, key=cmp_to_key(compare_vars))
     tmp_vars = []
     v = 0
     while v < len(vars):
@@ -223,8 +224,8 @@ def generate_haplotypes(snp_file,
         else:
             assert type == 'I'
             type = "insertion"
-        print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \
-            (varID, type, chr, pos, data)
+        print("%s\t%s\t%s\t%s\t%s" % (varID, type, chr, pos, data),
+                file=snp_file)
 
     # genotypes_list looks like
     #    Var0: 0
@@ -270,7 +271,7 @@ def split_haplotypes(haplotypes):
                     split_haplotypes.add('#'.join(haplotype[prev_s:s]))
         return split_haplotypes
 
-    haplotypes2 = split_haplotypes(haplotypes)
+    haplotypes2 = sorted(list(split_haplotypes(haplotypes)))
 
     def cmp_haplotype(a, b):
         a = a.split('#')
@@ -288,8 +289,8 @@ def cmp_haplotype(a, b):
         if a_begin != b_begin:
             return a_begin - b_begin
         return a_end - b_end
-    
-    haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype)
+
+    haplotypes = sorted(list(haplotypes2), key=cmp_to_key(cmp_haplotype))
 
     # Write haplotypes
     for h_i in range(len(haplotypes)):
@@ -317,8 +318,8 @@ def cmp_haplotype(a, b):
         for id in h:
             var_dic = vars[int(id)][4]
             h_add.append(var_dic["id2"])
-        print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \
-            (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add))
+        print("ht%d\t%s\t%d\t%d\t%s" % (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)),
+                file=haplotype_file)
         num_haplotypes += 1
 
     return num_haplotypes
@@ -352,6 +353,7 @@ def main(genome_file,
     else:
         snp_cmd = ["cat", snp_fname]
     snp_proc = subprocess.Popen(snp_cmd,
+                                text=True,
                                 stdout=subprocess.PIPE,
                                 stderr=open("/dev/null", 'w'))
     ids_seen = set()
@@ -447,10 +449,10 @@ def main(genome_file,
                 if testset:
                     ref_seq = chr_seq[start-50:start+50]
                     alt_seq = chr_seq[start-50:start] + allele + chr_seq[start+1:start+50]
-                    print >> ref_testset_file, ">%s_single_%d" % (rs_id, start - 50)
-                    print >> ref_testset_file, ref_seq
-                    print >> alt_testset_file, ">%s_single_%d_%s" % (rs_id, start - 50, ref_seq)
-                    print >> alt_testset_file, alt_seq
+                    print(">%s_single_%d" % (rs_id, start - 50), file=ref_testset_file)
+                    print(ref_seq, file=ref_testset_file)
+                    print(">%s_single_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file)
+                    print(alt_seq, file=alt_testset_file)
                 
         elif classType == "deletion":
             if start > 0:
@@ -475,10 +477,10 @@ def main(genome_file,
             if testset and delLen > 0 and delLen <= 10:
                 ref_seq = chr_seq[start-50:start+50]
                 alt_seq = chr_seq[start-50:start] + chr_seq[start+delLen:start+50+delLen]
-                print >> ref_testset_file, ">%s_deletion_%d" % (rs_id, start - 50)
-                print >> ref_testset_file, ref_seq
-                print >> alt_testset_file, ">%s_deletion_%d_%s" % (rs_id, start - 50, ref_seq)
-                print >> alt_testset_file, alt_seq
+                print(">%s_deletion_%d" % (rs_id, start - 50), file=ref_testset_file)
+                print(ref_seq, file=ref_testset_file)
+                print(">%s_deletion_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file)
+                print(alt_seq, file=alt_testset_file)
         else:
             assert classType == "insertion"
             if start > 0:
@@ -497,10 +499,10 @@ def main(genome_file,
                     if testset and insLen > 0 and insLen <= 10:
                         ref_seq = chr_seq[start-50:start+50]
                         alt_seq = chr_seq[start-50:start] + allele + chr_seq[start:start+50-insLen]
-                        print >> ref_testset_file, ">%s_insertion_%d" % (rs_id, start - 50)
-                        print >> ref_testset_file, ref_seq
-                        print >> alt_testset_file, ">%s_insertion_%d_%s" % (rs_id, start - 50, ref_seq)
-                        print >> alt_testset_file, alt_seq
+                        print(">%s_insertion_%d" % (rs_id, start - 50), file=ref_testset_file)
+                        print(ref_seq, file=ref_testset_file)
+                        print(">%s_insertion_%d_%s" % (rs_id, start - 50, ref_seq), file=alt_testset_file)
+                        print(alt_seq, file=alt_testset_file)
 
         if curr_right < end:
             curr_right = end
diff --git a/hisat2_extract_snps_haplotypes_VCF.py b/hisat2_extract_snps_haplotypes_VCF.py
index c365821a..873b1ede 100755
--- a/hisat2_extract_snps_haplotypes_VCF.py
+++ b/hisat2_extract_snps_haplotypes_VCF.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Copyright 2016, Daehwan Kim <infphilo@gmail.com>
 #
@@ -21,6 +21,7 @@
 
 import sys, os, subprocess
 from argparse import ArgumentParser, FileType
+from functools import cmp_to_key
 
 digit2str = [str(i) for i in range(10)]
 
@@ -100,11 +101,11 @@ def extract_vars(chr_dic, chr, pos, ref_allele, alt_alleles, varID):
         ref_allele2, pos2 = ref_allele, pos
 
         if chr_seq[pos:pos+len(ref_allele)] != ref_allele:
-            print >> sys.stderr, "Error: the reference genome you provided seems to be incompatible with the VCF file at %d of chromosome %s where %s is in the reference genome while %s is in the VCF file" % (pos, chr, chr_seq[pos:pos+len(ref_allele)], ref_allele)
+            print("Error: the reference genome you provided seems to be incompatible with the VCF file at %d of chromosome %s where %s is in the reference genome while %s is in the VCF file" % (pos, chr, chr_seq[pos:pos+len(ref_allele)], ref_allele), file=sys.stderr)
 
         def warning_msg():
-            print >> sys.stderr, "Warning) ref allele (%s) and alt allele (%s in %s) at chr%s:%d are excluded." % \
-                (ref_allele, alt_allele, ','.join(alt_alleles), chr, pos + 1)
+            print("Warning) ref allele (%s) and alt allele (%s in %s) at chr%s:%d are excluded." % \
+                (ref_allele, alt_allele, ','.join(alt_alleles), chr, pos + 1), file=sys.stderr)
             
         min_len = min(len(ref_allele2), len(alt_allele2))
         if min_len >= 2:
@@ -170,7 +171,7 @@ def generate_haplotypes(snp_file,
     assert len(vars) > 0
 
     # Sort variants and remove redundant variants
-    vars = sorted(vars, cmp=compare_vars)
+    vars = sorted(vars, key=cmp_to_key(compare_vars))
     tmp_vars = []
     v = 0
     while v < len(vars):
@@ -203,8 +204,8 @@ def generate_haplotypes(snp_file,
         else:
             assert type == 'I'
             type = "insertion"
-        print >> snp_file, "%s\t%s\t%s\t%s\t%s" % \
-            (varID, type, chr, pos, data)
+        print("%s\t%s\t%s\t%s\t%s" % \
+            (varID, type, chr, pos, data), file=snp_file)
 
     # variant compatibility
     vars_cmpt = [-1 for i in range(len(vars))]
@@ -363,7 +364,7 @@ def split_haplotypes(haplotypes):
                     split_haplotypes.add('#'.join(haplotype[prev_s:s]))
         return split_haplotypes
 
-    haplotypes2 = split_haplotypes(haplotypes)
+    haplotypes2 = sorted(list(split_haplotypes(haplotypes)))
 
     def cmp_haplotype(a, b):
         a = a.split('#')
@@ -382,7 +383,7 @@ def cmp_haplotype(a, b):
             return a_begin - b_begin
         return a_end - b_end
     
-    haplotypes = sorted(list(haplotypes2), cmp=cmp_haplotype)
+    haplotypes = sorted(list(haplotypes2), key=cmp_to_key(cmp_haplotype))
 
     # daehwan - for debugging purposes
     """
@@ -424,8 +425,8 @@ def cmp_haplotype(a, b):
         for id in h:
             var_dic = vars[int(id)][4]
             h_add.append(var_dic["id2"])
-        print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \
-            (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add))
+        print("ht%d\t%s\t%d\t%d\t%s" % \
+            (num_haplotypes, chr, h_new_begin, h_end, ','.join(h_add)), file=haplotype_file)
         num_haplotypes += 1
 
     return num_haplotypes
@@ -464,6 +465,7 @@ def main(genome_file,
         else:
             vcf_cmd = ["cat", genotype_vcf]
         vcf_proc = subprocess.Popen(vcf_cmd,
+                                    text=True,
                                     stdout=subprocess.PIPE,
                                     stderr=open("/dev/null", 'w'))
         for line in vcf_proc.stdout:
@@ -525,17 +527,17 @@ def main(genome_file,
                     
                 var_set.add(var_str)
 
-        print >> sys.stderr, "Number of variants in %s is:" % (genotype_vcf)
+        print("Number of variants in %s is:" % (genotype_vcf), file=sys.stderr)
         for chr, vars in genotype_var_list.items():
             vars = sorted(vars, cmp=compare_vars)
-            print >> sys.stderr, "\tChromosome %s: %d variants" % (chr, len(vars))
+            print("\tChromosome %s: %d variants" % (chr, len(vars)), file=sys.stderr)
 
         for chr, gene_ranges in genotype_ranges.items():
             for gene, value in gene_ranges.items():
                 gene_ranges[gene] = [value[0] - 100, value[1] + 100]
                 value = genotype_ranges[chr][gene]
                 if verbose:
-                    print >> sys.stderr, "%s\t%s\t%d-%d" % (chr, gene, value[0], value[1])
+                    print("%s\t%s\t%d-%d" % (chr, gene, value[0], value[1]), file=sys.stderr)
 
         if extra_files or True:
             clnsig_file = open("%s.clnsig" % base_fname, 'w')
@@ -544,7 +546,7 @@ def main(genome_file,
                     varID = var[4]["id2"]
                     CLNSIG = var[4]["CLNSIG"]
                     gene = var[4]["gene"]
-                    print >> clnsig_file, "%s\t%s\t%s" % (varID, gene, CLNSIG)
+                    print("%s\t%s\t%s" % (varID, gene, CLNSIG), file=clnsig_file)
             clnsig_file.close()
 
     SNP_file = open("%s.snp" % base_fname, 'w')
@@ -558,7 +560,7 @@ def main(genome_file,
                 left, right = value
                 if reference_type == "gene":
                     left, right = 0, right - left
-                print >> ref_file, "%s\t%s\t%d\t%d" % (gene, chr, left, right)
+                print("%s\t%s\t%d\t%d" % (gene, chr, left, right), file=ref_file)
         ref_file.close()
 
         if reference_type == "gene":
@@ -567,10 +569,10 @@ def main(genome_file,
                 for gene, value in gene_ranges.items():
                     left, right = value
                     left, right = 0, right - left
-                    print >> backbone_file, ">%s" % (gene)
+                    print(">%s" % (gene), file=backbone_file)
                     backbone_seq = chr_dic[chr][value[0]:value[1]+1]
                     for s in range(0, len(backbone_seq), 60):
-                        print >> backbone_file, backbone_seq[s:s+60]
+                        print(backbone_seq[s:s+60], file=backbone_file)
             backbone_file.close()
         elif reference_type == "chromosome":
             first = True
@@ -604,6 +606,7 @@ def main(genome_file,
             else:
                 vcf_cmd = ["cat", VCF_fname]
             vcf_proc = subprocess.Popen(vcf_cmd,
+                                        text=True,
                                         stdout=subprocess.PIPE,
                                         stderr=open("/dev/null", 'w'))
 
@@ -665,7 +668,7 @@ def main(genome_file,
                 offset = 0
                 gene = None
                 if num_lines % 10000 == 1:
-                    print >> sys.stderr, "\t%s:%d\r" % (chr, pos),
+                    print("\t%s:%d\r" % (chr, pos), end="", file=sys.stderr)
 
                 if chr_genotype_ranges:
                     skip = True
@@ -883,6 +886,7 @@ def add_vars(pos,
             else:
                 vcf_cmd = ["cat", args.genotype_vcf]
             vcf_proc = subprocess.Popen(vcf_cmd,
+                                        text=True,
                                         stdout=subprocess.PIPE,
                                         stderr=open("/dev/null", 'w'))
             for line in vcf_proc.stdout:
@@ -900,7 +904,7 @@ def add_vars(pos,
             args.genotype_gene_list = args.genotype_gene_list.split(',')
 
         if len(args.genotype_gene_list) == 0:
-            print >> sys.stderr, "Error: please specify --genotype-gene-list."
+            print("Error: please specify --genotype-gene-list.", file=sys.stderr)
             sys.exit(1)
 
     else:
diff --git a/hisat2_extract_splice_sites.py b/hisat2_extract_splice_sites.py
index 59862882..cba92347 100755
--- a/hisat2_extract_splice_sites.py
+++ b/hisat2_extract_splice_sites.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2015, Daehwan Kim <infphilo@gmail.com>
@@ -19,8 +19,6 @@
 # along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-from __future__ import print_function
-
 from sys import stderr, exit
 from collections import defaultdict as dd, Counter
 from argparse import ArgumentParser, FileType
@@ -105,18 +103,18 @@ def extract_splice_sites(gtf_file, verbose=False):
                 len(genes), sum(len(v) > 1 for v in genes.values())),
               file=stderr)
         print('transcripts: {}, transcript avg. length: {:.0f}'.format(
-                len(trans), sum(trans_lengths.elements())/len(trans)),
+                len(trans), sum(trans_lengths.elements())//len(trans)),
               file=stderr)
         print('exons: {}, exon avg. length: {:.0f}'.format(
                 sum(exon_lengths.values()),
-                sum(exon_lengths.elements())/sum(exon_lengths.values())),
+                sum(exon_lengths.elements())//sum(exon_lengths.values())),
               file=stderr)
         print('introns: {}, intron avg. length: {:.0f}'.format(
                 sum(intron_lengths.values()),
-                sum(intron_lengths.elements())/sum(intron_lengths.values())),
+                sum(intron_lengths.elements())//sum(intron_lengths.values())),
               file=stderr)
         print('average number of exons per transcript: {:.0f}'.format(
-                sum(exon_lengths.values())/len(trans)),
+                sum(exon_lengths.values())//len(trans)),
               file=stderr)
 
 
diff --git a/hisat2_read_statistics.py b/hisat2_read_statistics.py
index e6d0d1c3..5f5d7c1f 100755
--- a/hisat2_read_statistics.py
+++ b/hisat2_read_statistics.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 #
 # Copyright 2018, Chanhee Park <parkchanhee@gmail.com> and Daehwan Kim <infphilo@gmail.com>
@@ -42,7 +42,7 @@ def parser_FQ(fp):
             return
 
         if line[0] == '@':
-            break;
+            break
 
     while True:
         id = line[1:].split()[0]
@@ -55,9 +55,9 @@ def parser_FQ(fp):
         seq = line.strip()
         yield id, seq
 
-        line = fp.readline() # '+'
-        line = fp.readline() # quality
-        line = fp.readline() # next ID
+        line = fp.readline()  # '+'
+        line = fp.readline()  # quality
+        line = fp.readline()  # next ID
         if line == "":
             return
 
@@ -73,7 +73,7 @@ def parser_FA(fp):
             return
 
         if line[0] == '>':
-            break;
+            break
 
     while True:
         id = line[1:].split()[0]
@@ -119,10 +119,10 @@ def parse_type(fname):
 """
 """
 def generate_stats(length_map):
-    mn = 0 # minimun read length
-    mx = 0 # maximum read length
-    cnt = 0 # number of reads
-    avg = 0 # average read length
+    mn = 0  # minimum read length
+    mx = 0  # maximum read length
+    cnt = 0  # number of reads
+    avg = 0  # average read length
 
     sum = 0
 
@@ -135,11 +135,11 @@ def generate_stats(length_map):
     mn = sorted_map[0]
     mx = sorted_map[-1]
 
-    for k in sorted(length_map):
-        sum += int(k) * length_map[k]
-        cnt += length_map[k]
+    for k, v in length_map.items():
+        sum += k * v
+        cnt += v
 
-    avg = sum / cnt
+    avg = sum // cnt
 
     return cnt, mn, mx, avg
 
@@ -179,10 +179,11 @@ def reads_stat(read_file, read_count):
     fp.close()
 
     cnt, mn, mx, avg =  generate_stats(length_map)
-    length_map = sorted(length_map.iteritems(), key=lambda (k,v):(v,k), reverse=True)
+    # sort by (read count, read length)
+    length_map = sorted(length_map.items(), key=lambda t: (t[1], t[0]), reverse=True)
     if len(length_map) == 0:
-        length_map.append((0,0))
-    print cnt, mn, mx, avg, ",".join([str(k) for (k,v) in length_map])
+        length_map.append((0, 0))
+    print(cnt, mn, mx, avg, ",".join([str(k) for (k,v) in length_map]))
 
 if __name__ == '__main__':
 
diff --git a/hisat2_simulate_reads.py b/hisat2_simulate_reads.py
index e5c63e6b..c503522a 100755
--- a/hisat2_simulate_reads.py
+++ b/hisat2_simulate_reads.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 #
 # Copyright 2015, Daehwan Kim <infphilo@gmail.com>
 #
@@ -51,6 +51,13 @@ def reverse_complement(seq):
     return result
 
 
+"""
+Python 2 compatible randint: pick an integer in the closed range [m, x]
+"""
+def myrandint(m, x):
+    width = x - m + 1
+    return m + int(random.random() * width)
+
 """
 Random source for sequencing errors
 """
@@ -91,7 +98,7 @@ def read_genome(genome_file):
         chr_dic[chr_name] = sequence
 
 
-    chr_filter = [str(x) for x in range(1, 23) + ['X', 'Y']]
+    chr_filter = [str(x) for x in list(range(1, 23)) + ['X', 'Y']]
     #chr_filter = None
 
     if chr_filter:
@@ -120,7 +127,7 @@ def read_transcript(genome_seq, gtf_file, frag_len):
                 strand, frame, values = line.split('\t')
         except ValueError:
             continue
-        if not chrom in genome_seq:
+        if chrom not in genome_seq:
             continue
         
         # Zero-based offset
@@ -214,7 +221,7 @@ def sanity_check_input(genome_seq, genes, transcripts, snps, frag_len):
             num_ss += 1
 
     if num_ss > 0:
-        print >> sys.stderr, "GT/AG splice sites: {}/{} ({:.2%})".format(num_canon_ss, num_ss, (float(num_canon_ss) / num_ss))
+        print("GT/AG splice sites: {}/{} ({:.2%})".format(num_canon_ss, num_ss, (float(num_canon_ss) / num_ss)), file=sys.stderr)
 
     num_alt_single, num_single = 0, 0
     for chr, chr_snps in snps.items():
@@ -235,7 +242,7 @@ def sanity_check_input(genome_seq, genes, transcripts, snps, frag_len):
             num_single += 1
 
     if num_single > 0:
-        print >> sys.stderr, "Alternative bases: {}/{} ({:.2%})".format(num_alt_single, num_single, (float(num_alt_single) / num_single))
+        print("Alternative bases: {}/{} ({:.2%})".format(num_alt_single, num_single, (float(num_alt_single) / num_single)), file=sys.stderr)
 
 
 """
@@ -280,7 +287,7 @@ def generate_dna_expr_profile(genome_seq):
 def getSNPs(chr_snps, left, right):
     low, high = 0, len(chr_snps)
     while low < high:
-        mid = (low + high) / 2
+        mid = (low + high) // 2
         snpID, type, pos, data = chr_snps[mid]
         if pos < left:
             low = mid + 1
@@ -370,7 +377,8 @@ def getSamAlignment(rna, exons, chr_seq, trans_seq, frag_pos, read_len, chr_snps
             if err_rand_src.getRand() == 1:
                 assert i < len(chr_seq)
                 err_base = "A"
-                rand = random.randint(0, 2)
+                #rand = random.randint(0, 2)
+                rand = myrandint(0, 2)
                 if chr_seq[i] == "A":
                     err_base = "GCT"[rand]
                 elif chr_seq[i] == "C":
@@ -382,10 +390,10 @@ def getSamAlignment(rna, exons, chr_seq, trans_seq, frag_pos, read_len, chr_snps
                 mms.append(["", "single", i, err_base])
 
         tmp_diffs = snps + mms
-        def diff_sort(a , b):
-            return a[2] - b[2]
+#        def diff_sort(a , b):
+#            return a[2] - b[2]
 
-        tmp_diffs = sorted(tmp_diffs, cmp=diff_sort)
+        tmp_diffs = sorted(tmp_diffs, key=lambda t: t[2])
         diffs = []
         if len(tmp_diffs) > 0:
             diffs = tmp_diffs[:1]
@@ -545,8 +553,8 @@ def diff_sort(a , b):
         MD += ("{}".format(MD_match_len))
 
     if len(read_seq) != read_len:
-        print >> sys.stderr, "read length differs:", len(read_seq), "vs.", read_len
-        print >> sys.stderr, pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs
+        print("read length differs:", len(read_seq), "vs.", read_len, file=sys.stderr)
+        print(pos, "".join(cigars), cigar_descs, MD, XM, NM, Zs, file=sys.stderr)
         assert False
 
     return pos, cigars, cigar_descs, MD, XM, NM, Zs, read_seq
@@ -676,19 +684,19 @@ def samRepOk(genome_seq, read_seq, chr, pos, cigar, XM, NM, MD, Zs, max_mismatch
         tMD += ("{}".format(match_len))
 
     if tMD != MD or tXM != XM or tNM != NM or XM > max_mismatch or XM != NM:
-        print >> sys.stderr, chr, pos, cigar, MD, XM, NM, Zs
-        print >> sys.stderr, tMD, tXM, tNM
+        print(chr, pos, cigar, MD, XM, NM, Zs, file=sys.stderr)
+        print(tMD, tXM, tNM, file=sys.stderr)
         assert False
         
         
 """
 """
-def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
-                   rna, paired_end, read_len, frag_len, \
+def simulate_reads(genome_file, gtf_file, snp_file, base_fname,
+                   rna, paired_end, read_len, frag_len,
                    num_frag, expr_profile_type, repeat_fname,
-                   error_rate, max_mismatch, \
+                   error_rate, max_mismatch,
                    random_seed, snp_prob, sanity_check, verbose):
-    random.seed(random_seed)
+    random.seed(random_seed, version=1)
     err_rand_src = ErrRandomSource(error_rate / 100.0)
     
     if read_len > frag_len:
@@ -716,7 +724,7 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
         for i in range(min(num_frag - sum(expr_profile), len(expr_profile))):
             expr_profile[i] += 1
     assert num_frag == sum(expr_profile)
-
+    
     repeat_loci = {}
     if repeat_fname != "" and os.path.exists(repeat_fname):
         for line in open(repeat_fname):
@@ -730,18 +738,21 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
                 repeat_loci[chr].append([int(pos), strand])
 
     if rna:
-        transcript_ids = transcripts.keys()
-        random.shuffle(transcript_ids)
+        transcript_ids = sorted(list(transcripts.keys()))
+        # Python 2 style shuffle; random.shuffle() lost its `random` argument in Python 3.11
+        for i in reversed(range(1, len(transcript_ids))):
+            j = int(random.random() * (i + 1))
+            transcript_ids[i], transcript_ids[j] = transcript_ids[j], transcript_ids[i]
         assert len(transcript_ids) >= len(expr_profile)
     else:
-        chr_ids = genome_seq.keys()
+        chr_ids = list(genome_seq.keys())
 
     sam_file = open(base_fname + ".sam", "w")
 
     # Write SAM header
-    print >> sam_file, "@HD\tVN:1.0\tSO:unsorted"
+    print("@HD\tVN:1.0\tSO:unsorted", file=sam_file)
     for chr in genome_seq.keys():
-        print >> sam_file, "@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr]))
+        print("@SQ\tSN:%s\tLN:%d" % (chr, len(genome_seq[chr])), file=sam_file)
     
     read_file = open(base_fname + "_1.fa", "w")
     if paired_end:
@@ -753,10 +761,10 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
         if rna:
             transcript_id = transcript_ids[t]
             chr, strand, transcript_len, exons = transcripts[transcript_id]
-            print >> sys.stderr, transcript_id, t_num_frags
+            print(transcript_id, t_num_frags, file=sys.stderr)
         else:
             chr = chr_ids[t]
-            print >> sys.stderr, chr, t_num_frags
+            print(chr, t_num_frags, file=sys.stderr)
 
         assert chr in genome_seq
         chr_seq = genome_seq[chr]
@@ -783,14 +791,17 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
 
         for f in range(t_num_frags):
             if rna:
-                frag_pos = random.randint(0, transcript_len - frag_len)
+                #frag_pos = random.randint(0, transcript_len - frag_len)
+                frag_pos = myrandint(0, transcript_len - frag_len)
             else:
                 while True:
                     if len(chr_repeat_loci):
-                        locus_id = random.randint(0, len(chr_repeat_loci) - 1)
+                        #locus_id = random.randint(0, len(chr_repeat_loci) - 1)
+                        locus_id = myrandint(0, len(chr_repeat_loci) - 1)
                         frag_pos = chr_repeat_loci[locus_id][0]
                     else:
-                        frag_pos = random.randint(0, chr_len - frag_len)
+                        #frag_pos = random.randint(0, chr_len - frag_len)
+                        frag_pos = myrandint(0, chr_len - frag_len)
                     if 'N' not in chr_seq[frag_pos:frag_pos + frag_len]:
                         break
 
@@ -801,7 +812,8 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
             pos2, cigars2, cigar2_descs, MD2, XM2, NM2, Zs2, read2_seq = getSamAlignment(rna, exons, chr_seq, t_seq, frag_pos+frag_len-read_len, read_len, chr_snps, snp_prob, err_rand_src, max_mismatch)
             swapped = False
             if paired_end:
-                if random.randint(0, 1) == 1:
+                #if random.randint(0, 1) == 1:
+                if myrandint(0, 1) == 1:
                     swapped = True
                 if swapped:
                     flag, flag2 = flag - 16, flag2 - 16
@@ -830,19 +842,19 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
             else:
                 XS, TI = "", ""                
 
-            print >> read_file, ">{}".format(cur_read_id)
+            print(">{}".format(cur_read_id), file=read_file)
             if swapped:
-                print >> read_file, reverse_complement(read_seq)
+                print(reverse_complement(read_seq), file=read_file)
             else:
-                print >> read_file, read_seq
-            print >> sam_file, "{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag, chr, pos + 1, cigar_str, chr, pos2 + 1, read_seq, XM, NM, MD, Zs, XS, TI)
+                print(read_seq, file=read_file)
+            print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag, chr, pos + 1, cigar_str, chr, pos2 + 1, read_seq, XM, NM, MD, Zs, XS, TI), file=sam_file)
             if paired_end:
-                print >> read2_file, ">{}".format(cur_read_id)
+                print(">{}".format(cur_read_id), file=read2_file)
                 if swapped:
-                    print >> read2_file, read2_seq
+                    print(read2_seq, file=read2_file)
                 else:
-                    print >> read2_file, reverse_complement(read2_seq)
-                print >> sam_file, "{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, XM2, NM2, MD2, Zs2, XS, TI)
+                    print(reverse_complement(read2_seq), file=read2_file)
+                print("{}\t{}\t{}\t{}\t255\t{}\t{}\t{}\t0\t{}\t*\tXM:i:{}\tNM:i:{}\tMD:Z:{}{}{}{}".format(cur_read_id, flag2, chr, pos2 + 1, cigar2_str, chr, pos + 1, read2_seq, XM2, NM2, MD2, Zs2, XS, TI), file=sam_file)
 
             cur_read_id += 1
             
@@ -952,8 +964,8 @@ def simulate_reads(genome_file, gtf_file, snp_file, base_fname, \
         exit(1)
     if not args.rna:
         args.expr_profile = "constant"
-    simulate_reads(args.genome_file, args.gtf_file, args.snp_file, args.base_fname, \
-                   args.rna, args.paired_end, args.read_len, args.frag_len, \
-                   args.num_frag, args.expr_profile, args.repeat_fname, \
-                   args.error_rate, args.max_mismatch, \
+    simulate_reads(args.genome_file, args.gtf_file, args.snp_file, args.base_fname,
+                   args.rna, args.paired_end, args.read_len, args.frag_len,
+                   args.num_frag, args.expr_profile, args.repeat_fname,
+                   args.error_rate, args.max_mismatch,
                    args.random_seed, args.snp_prob, args.sanity_check, args.verbose)
diff --git a/hisatgenotype.py b/hisatgenotype.py
deleted file mode 100755
index cf433b48..00000000
--- a/hisatgenotype.py
+++ /dev/null
@@ -1,490 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT-genotype.
-#
-# HISAT-genotype is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT-genotype is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re, resource
-import inspect, random
-import math
-from datetime import datetime, date, time
-from argparse import ArgumentParser, FileType
-import hisatgenotype_typing_common as typing_common
-
-
-"""
-Align reads, and sort the alignments into a BAM file
-"""
-def align_reads(base_fname,
-                read_fnames,
-                fastq,
-                threads,
-                verbose):
-    aligner_cmd = ["hisat2",
-                   "--no-unal",
-                   "-p", str(threads),
-                   "--no-spliced-alignment",
-                   "--max-altstried", "64"]
-    aligner_cmd += ["-X", "1000"]
-    # aligner_cmd += ["--mm"]
-    aligner_cmd += ["-x", "%s" % base_fname]
-
-    assert len(read_fnames) > 0
-    if not fastq:
-        aligner_cmd += ["-f"]
-    single = len(read_fnames) == 1
-    if single:
-        aligner_cmd += ["-U", read_fnames[0]]
-    else:
-        aligner_cmd += ["-1", read_fnames[0],
-                        "-2", read_fnames[1]]
-
-    out_base_fname = read_fnames[0].split('/')[-1].split('.')[0]
-
-    print >> sys.stderr, "%s Aligning %s to %s ..." % (str(datetime.now()), ' '.join(read_fnames), base_fname)
-    if verbose:
-        print >> sys.stderr, "\t%s" % (' '.join(aligner_cmd))
-
-    align_proc = subprocess.Popen(aligner_cmd,
-                                  stdout=subprocess.PIPE,
-                                  stderr=open("/dev/null", 'w'))
-
-    unsorted_bam_fname = "%s_unsorted.bam" % out_base_fname
-    sambam_cmd = ["samtools",
-                  "view",
-                  "-bS",
-                  "-"]
-    sambam_proc = subprocess.Popen(sambam_cmd,
-                                   stdin=align_proc.stdout,
-                                   stdout=open(unsorted_bam_fname, 'w'))
-    sambam_proc.communicate()
-
-    # Increase the maximum number of files that can be opened
-    resource.setrlimit(resource.RLIMIT_NOFILE, (10000, 10240))
-    
-    print >> sys.stderr, "%s Sorting %s ..." % (str(datetime.now()), unsorted_bam_fname)
-    bam_fname = "%s.bam" % out_base_fname
-    bamsort_cmd = ["samtools",
-                   "sort",
-                   "--threads", str(threads),
-                   "-m", "1536M",
-                   unsorted_bam_fname,
-                   "-o", bam_fname]    
-    if verbose:
-        print >> sys.stderr, "\t%s" % ' '.join(bamsort_cmd)
-    bamsort_proc = subprocess.call(bamsort_cmd)
-    os.remove(unsorted_bam_fname)
-
-    index_bam(bam_fname,
-              verbose)
-    
-    return bam_fname
-
-
-"""
-"""
-def index_bam(bam_fname,
-              verbose):
-    print >> sys.stderr, "%s Indexing %s ..." % (str(datetime.now()), bam_fname)
-    bamindex_cmd = ["samtools",
-                    "index",
-                    bam_fname]
-    if verbose:
-        print >> sys.stderr, "\t%s" % ' '.join(bamindex_cmd)
-    bamindex_proc = subprocess.call(bamindex_cmd)
-
-
-"""
-"""
-def extract_reads(bam_fname,
-                  chr,
-                  left,
-                  right,
-                  read_base_fname, # sample => sample.1.fq.gz and sample.2.fq.gz
-                  paired,
-                  fastq,
-                  verbose):
-    out_read_dname = "hisatgenotype_out"
-    if not os.path.exists(out_read_dname):
-        os.mkdir(out_read_dname)
-        
-    read_fnames = []
-    if paired:
-        read_fnames = [out_read_dname + "/" + read_base_fname + ".1.fq.gz",
-                       out_read_dname + "/" + read_base_fname + ".2.fq.gz"]
-    else:
-        read_fnames = [out_read_dname + "/" + read_base_fname + ".fq.gz"]
-
-    if paired:
-        gzip1_proc = subprocess.Popen(["gzip"],
-                                      stdin=subprocess.PIPE,
-                                      stdout=open(read_fnames[0], 'w'),
-                                      stderr=open("/dev/null", 'w'))
-
-        gzip2_proc = subprocess.Popen(["gzip"],
-                                      stdin=subprocess.PIPE,
-                                      stdout=open(read_fnames[1], 'w'),
-                                      stderr=open("/dev/null", 'w'))
-    else:
-        gzip1_proc = subprocess.Popen(["gzip"],
-                                      stdin=subprocess.PIPE,
-                                      stdout=open(read_fnames[0], 'w'),
-                                      stderr=open("/dev/null", 'w'))
-
-    def write_read(gzip_proc, read_name, seq, qual):
-        if fastq:
-            gzip_proc.stdin.write("@%s\n" % read_name)
-            gzip_proc.stdin.write("%s\n" % seq)
-            gzip_proc.stdin.write("+\n")
-            gzip_proc.stdin.write("%s\n" % qual)
-        else:
-            gzip_proc.stdin.write(">%s\n" % prev_read_name)
-            gzip_proc.stdin.write("%s\n" % seq)                    
-
-    bamview_cmd = ["samtools", "view", bam_fname, "%s:%d-%d" % (chr, left+1, right+1)]
-    if verbose:
-        print >> sys.stderr, "\t%s" % ' '.join(bamview_cmd)
-    bamview_proc = subprocess.Popen(bamview_cmd,
-                                    stdout=subprocess.PIPE,
-                                    stderr=open("/dev/null", 'w'))
-
-    sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
-    alignview_proc = subprocess.Popen(sort_read_cmd,
-                                      stdin=bamview_proc.stdout,
-                                      stdout=subprocess.PIPE,
-                                      stderr=open("/dev/null", 'w'))
-
-    prev_read_name, extract_read, read1, read2 = "", False, [], []
-    for line in alignview_proc.stdout:
-        if line.startswith('@'):
-            continue
-        line = line.strip()
-        cols = line.split()
-        read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11]
-        flag, pos = int(flag), int(pos)
-        strand = '-' if flag & 0x10 else '+'                   
-        AS, NH = "", ""
-        for i in range(11, len(cols)):
-            col = cols[i]
-            if col.startswith("AS"):
-                AS = int(col[5:])
-            elif col.startswith("NH"):
-                NH = int(col[5:])
-
-        # DK - check this out
-        simulation = True
-        if (not simulation and read_name != prev_read_name) or \
-           (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]):
-            if extract_read:
-                if paired:
-                    if len(read1) == 2 and len(read2) == 2:
-                        write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
-                        write_read(gzip2_proc, prev_read_name, read2[0], read2[1])
-                else:                    
-                    write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
-            prev_read_name, extract_read, read1, read2 = read_name, False, [], []
-
-        if NH == 1:
-            extract_read = True
-
-        if flag & 0x40 or not paired: # left read
-            if not read1:
-                if flag & 0x10: # reverse complement
-                    read1 = [typing_common.reverse_complement(read), qual[::-1]]
-                else:
-                    read1 = [read, qual]
-        else:
-            assert flag & 0x80 # right read
-            if flag & 0x10: # reverse complement
-                read2 = [typing_common.reverse_complement(read), qual[::-1]]
-            else:
-                read2 = [read, qual]
-
-    if extract_read:
-        if paired:
-            if len(read1) == 2 and len(read2) == 2:
-                write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
-                write_read(gzip2_proc, prev_read_name, read2[0], read2[1])
-        else:                    
-            write_read(gzip1_proc, prev_read_name, read1[0], read1[1])
-
-    gzip1_proc.stdin.close()
-    if paired:
-        gzip2_proc.stdin.close()
-
-    return read_fnames
-
-
-"""
-"""
-def perform_genotyping(base_fname,
-                       database,
-                       locus_list,
-                       read_fnames,
-                       fastq,
-                       num_editdist,
-                       assembly,
-                       local_database,
-                       threads,
-                       verbose):
-    genotype_cmd = ["hisatgenotype_locus.py"]
-    if not local_database:
-        genotype_cmd += ["--genotype-genome", base_fname]
-    genotype_cmd += ["--base", database]
-    if len(locus_list) > 0:
-        genotype_cmd += ["--locus-list", ','.join(locus_list)]
-    genotype_cmd += ["-p", str(threads),
-                     "--num-editdist", str(num_editdist)]
-    if not fastq:
-        genotype_cmd += ["-f"]
-
-    if len(read_fnames) == 2: # paired
-        genotype_cmd += ["-1", read_fnames[0],
-                         "-2", read_fnames[1]]
-    elif len(read_fnames) == 1:
-        genotype_cmd += ["-U", read_fnames[0]] 
-    else:
-        assert len(read_fnames) == 0
-
-    if assembly:
-        genotype_cmd += ["--assembly"]
-
-    if verbose:
-        print >> sys.stderr, "\t%s" % ' '.join(genotype_cmd)
-    genotype_proc = subprocess.Popen(genotype_cmd)
-    genotype_proc.communicate()
-        
-
-"""
-"""
-def genotype(base_fname,
-             target_region_list,
-             fastq,
-             read_fnames,
-             alignment_fname,
-             threads,
-             num_editdist,
-             assembly,
-             local_database,
-             verbose,
-             debug):
-    # variants, backbone sequence, and other sequeces
-    genotype_fnames = ["%s.fa" % base_fname,
-                       "%s.locus" % base_fname,
-                       "%s.snp" % base_fname,
-                       "%s.index.snp" % base_fname,
-                       "%s.haplotype" % base_fname,
-                       "%s.link" % base_fname,
-                       "%s.coord" % base_fname,
-                       "%s.clnsig" % base_fname]
-    # hisat2 graph index files
-    genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
-    if not typing_common.check_files(genotype_fnames):
-        print >> sys.stderr, "Error: some of the following files are missing!"
-        for fname in genotype_fnames:
-            print >> sys.stderr, "\t%s" % fname
-        sys.exit(1)
-
-    # Read region alleles (names and sequences)
-    regions, region_loci = {}, {}
-    for line in open("%s.locus" % base_fname):
-        family, allele_name, chr, left, right = line.strip().split()[:5]
-        family = family.lower()
-        if len(target_region_list) > 0 and \
-           family not in target_region_list:
-            continue
-        
-        locus_name = allele_name.split('*')[0]
-        if family in target_region_list and \
-           len(target_region_list[family]) > 0 and \
-           locus_name not in target_region_list[family]:
-            continue
-        
-        left, right = int(left), int(right)
-        if family not in region_loci:
-            region_loci[family] = []
-        region_loci[family].append([locus_name, allele_name, chr, left, right])
-
-    if len(region_loci) <= 0:
-        print >> sys.stderr, "Warning: no region exists!"
-        sys.exit(1)
-
-    # Align reads, and sort the alignments into a BAM file
-    if len(read_fnames) > 0:
-        alignment_fname = align_reads(base_fname,
-                                      read_fnames,
-                                      fastq,
-                                      threads,
-                                      verbose)
-    assert alignment_fname != "" and os.path.exists(alignment_fname)
-    if not os.path.exists(alignment_fname + ".bai"):
-        index_bam(alignment_fname,
-                  verbose)
-    assert os.path.exists(alignment_fname + ".bai")
-
-    # Extract reads and perform genotyping
-    for family, loci in region_loci.items():
-        print >> sys.stderr, "Analyzing %s ..." % family.upper()
-        for locus_name, allele_name, chr, left, right in loci:
-            out_read_fname = "%s.%s" % (family, locus_name)
-            if verbose:
-                print >> sys.stderr, "\tExtracting reads beloning to %s-%s ..." % \
-                    (family, locus_name)
-
-            extracted_read_fnames = extract_reads(alignment_fname,
-                                                  chr,
-                                                  left,
-                                                  right,
-                                                  out_read_fname,
-                                                  len(read_fnames) != 1, # paired?
-                                                  fastq,
-                                                  verbose)
-
-            perform_genotyping(base_fname,
-                               family,
-                               [locus_name],
-                               extracted_read_fnames,
-                               fastq,
-                               num_editdist,
-                               assembly,
-                               local_database,
-                               threads,
-                               verbose)
-        print >> sys.stderr
-
-    
-                
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='HISAT-genotype')
-    parser.add_argument("--base", "--base-name",
-                        dest="base_fname",
-                        type=str,
-                        default="genotype_genome",
-                        help="base filename for genotype genome")
-    parser.add_argument("--region-list",
-                        dest="region_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of regions (default: empty)")
-    parser.add_argument("-f", "--fasta",
-                        dest='fastq',
-                        action='store_false',
-                        help='FASTA file')    
-    parser.add_argument("-U",
-                        dest="read_fname_U",
-                        type=str,
-                        default="",
-                        help="filename for single-end reads")
-    parser.add_argument("-1",
-                        dest="read_fname_1",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")
-    parser.add_argument("-2",
-                        dest="read_fname_2",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")
-    parser.add_argument("--alignment-file",
-                        dest="alignment_fname",
-                        type=str,
-                        default="",
-                        help="Sorted BAM alignment file name")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--num-editdist",
-                        dest="num_editdist",
-                        type=int,
-                        default=2,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
-    parser.add_argument('--assembly',
-                        dest='assembly',
-                        action='store_true',
-                        help='Perform assembly')
-    parser.add_argument('--local-database',
-                        dest='local_database',
-                        action='store_true',
-                        help='Use local database')    
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument("--debug",
-                        dest="debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-
-    args = parser.parse_args()
-    region_list = {}
-    if args.region_list != "":
-        for region in args.region_list.split(','):
-            region = region.split('.')
-            if len(region) < 1 or len(region) > 2:
-                print >> sys.stderr, "Error: --region-list is incorrectly formatted."
-                sys.exit(1)
-                
-            family = region[0].lower()
-            if len(region) == 2:
-                locus_name = region[1].upper()
-            if family not in region_list:
-                region_list[family] = set()
-            if len(region) == 2:
-                region_list[family].add(locus_name)
-
-    read_fnames = []
-    if args.alignment_fname != "":
-        if not os.path.exists(args.alignment_fname):
-            print >> sys.stderr, "Error: %s does not exist." % args.alignment_fname
-    elif args.read_fname_U != "":
-        read_fnames = [args.read_fname_U]
-    else:
-        if args.read_fname_1 == "" or args.read_fname_2 == "":
-            print >> sys.stderr, "Error: please specify read file names correctly: -U or -1 and -2"
-            sys.exit(1)
-        read_fnames = [args.read_fname_1, args.read_fname_2]
-
-    debug = {}
-    if args.debug != "":
-        for item in args.debug.split(','):
-            if ':' in item:
-                key, value = item.split(':')
-                debug[key] = value
-            else:
-                debug[item] = 1
-
-    genotype(args.base_fname,
-             region_list,
-             args.fastq,
-             read_fnames,
-             args.alignment_fname,
-             args.threads,
-             args.num_editdist,
-             args.assembly,
-             args.local_database,
-             args.verbose,
-             debug)
-
-
diff --git a/hisatgenotype_build_genome.py b/hisatgenotype_build_genome.py
deleted file mode 100755
index 3d103d92..00000000
--- a/hisatgenotype_build_genome.py
+++ /dev/null
@@ -1,505 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2016, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import os, sys, subprocess, re
-import shutil
-import inspect
-from argparse import ArgumentParser, FileType
-import hisatgenotype_typing_common as typing_common
-
-
-"""
-"""
-def read_clnsig(fname):
-    clnsig_dic = {}
-    for line in open(fname):
-        var_id, gene, clnsig = line.strip().split('\t')
-        clnsig_dic[var_id] = [gene, clnsig]
-    return clnsig_dic
-
-
-"""
-"""
-def build_genotype_genome(base_fname,                          
-                          inter_gap,
-                          intra_gap,
-                          threads,
-                          database_list,
-                          use_clinvar,
-                          use_commonvar,
-                          aligner,
-                          graph_index,
-                          verbose):    
-    # Download HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not typing_common.check_files(HISAT2_fnames):
-        typing_common.download_genome_and_index()
-
-    # Load genomic sequences
-    chr_dic, chr_names, chr_full_names = typing_common.read_genome(open("genome.fa"))
-
-    genotype_vars, genotype_haplotypes, genotype_clnsig = {}, {}, {}
-    if use_clinvar:
-        # Extract variants from the ClinVar database
-        CLINVAR_fnames = ["clinvar.vcf.gz",
-                          "clinvar.snp",
-                          "clinvar.haplotype",
-                          "clinvar.clnsig"]
-
-        if not typing_common.check_files(CLINVAR_fnames):
-            if not os.path.exists("clinvar.vcf.gz"):
-                os.system("wget ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/archive/2017/clinvar_20170404.vcf.gz")
-            assert os.path.exists("clinvar.vcf.gz")
-
-            extract_cmd = ["hisat2_extract_snps_haplotypes_VCF.py"]
-            extract_cmd += ["--inter-gap", str(inter_gap),
-                            "--intra-gap", str(intra_gap),
-                            "--genotype-vcf", "clinvar.vcf.gz",
-                            "genome.fa", "/dev/null", "clinvar"]
-            if verbose:
-                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-            proc.communicate()
-            if not typing_common.check_files(CLINVAR_fnames):
-                print >> sys.stderr, "Error: extract variants from clinvar failed!"
-                sys.exit(1)
-
-        # Read variants to be genotyped
-        genotype_vars = typing_common.read_variants("clinvar.snp")
-
-        # Read haplotypes
-        genotype_haplotypes = typing_common.read_haplotypes("clinvar.haplotype")
-
-        # Read information about clinical significance
-        genotype_clnsig = typing_common.read_clnsig("clinvar.clnsig")
-
-    if use_commonvar:
-        # Extract variants from dbSNP database
-        commonvar_fbase = "snp144Common"
-        commonvar_fnames = ["%s.snp" % commonvar_fbase,
-                            "%s.haplotype" % commonvar_fbase]
-        if not typing_common.check_files(commonvar_fnames):
-            if not os.path.exists("%s.txt.gz" % commonvar_fbase):
-                os.system("wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/%s.txt.gz" % commonvar_fbase)
-            assert os.path.exists("%s.txt.gz" % commonvar_fbase)
-            os.system("gzip -cd %s.txt.gz | awk 'BEGIN{OFS=\"\t\"} {if($2 ~ /^chr/) {$2 = substr($2, 4)}; if($2 == \"M\") {$2 = \"MT\"} print}' > %s.txt" % (commonvar_fbase, commonvar_fbase))
-            extract_cmd = ["hisat2_extract_snps_haplotypes_UCSC.py",
-                           "--inter-gap", str(inter_gap),
-                           "--intra-gap", str(intra_gap),
-                           "genome.fa", "%s.txt" % commonvar_fbase, commonvar_fbase]
-            if verbose:
-                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-            proc.communicate()
-            if not typing_common.check_files(commonvar_fnames):
-                print >> sys.stderr, "Error: extract variants from clinvar failed!"
-                sys.exit(1)
-
-        # Read variants to be genotyped
-        genotype_vars = typing_common.read_variants("%s.snp" % commonvar_fbase)
-
-        # Read haplotypes
-        genotype_haplotypes = typing_common.read_haplotypes("%s.haplotype" % commonvar_fbase)
-
-    # Genes to be genotyped
-    genotype_genes = {}
-
-    # Read genes or genomics regions
-    for database_name in database_list:
-        # Extract HLA variants, backbone sequence, and other sequeces
-        typing_common.extract_database_if_not_exists(database_name,
-                                                     [],            # locus_list
-                                                     inter_gap,
-                                                     intra_gap,
-                                                     True,          # partial?
-                                                     verbose)
-        locus_fname = "%s.locus" % database_name
-        assert os.path.exists(locus_fname)
-        for line in open(locus_fname):
-            locus_name, chr, left, right, length, exon_str, strand = line.strip().split()
-            left, right = int(left), int(right)
-            length = int(length)
-            if chr not in chr_names:
-                continue
-            if chr not in genotype_genes:
-                genotype_genes[chr] = []
-            genotype_genes[chr].append([left, right, length, locus_name, database_name, exon_str, strand])
-
-    # Write genotype genome
-    var_num, haplotype_num = 0, 0
-    genome_out_file = open("%s.fa" % base_fname, 'w')
-    locus_out_file = open("%s.locus" % base_fname, 'w')
-    var_out_file = open("%s.snp" % base_fname, 'w')
-    index_var_out_file = open("%s.index.snp" % base_fname, 'w')
-    haplotype_out_file = open("%s.haplotype" % base_fname, 'w')
-    link_out_file = open("%s.link" % base_fname, 'w')
-    coord_out_file = open("%s.coord" % base_fname, 'w')
-    clnsig_out_file = open("%s.clnsig" % base_fname, 'w')
-    for c in range(len(chr_names)):
-        chr = chr_names[c]
-        chr_full_name = chr_full_names[c]
-        assert chr in chr_dic
-        chr_seq = chr_dic[chr]
-        chr_len = len(chr_seq)
-        if chr in genotype_genes:
-            chr_genes = genotype_genes[chr]
-            def gene_cmp(a, b):
-                a_left, a_right, a_length = a[:3]
-                b_left, b_right, b_length = b[:3]
-                if a_left != b_left:
-                    return a_left - b_left
-                if a_right != b_right:
-                    return a_right - b_right
-                return a_lenght - b_length
-            chr_genes = sorted(chr_genes, cmp=gene_cmp)
-        else:
-            chr_genes = []
-
-        chr_genotype_vars, chr_genotype_vari = [], 0
-        if graph_index:
-            if chr in genotype_vars:
-                chr_genotype_vars = genotype_vars[chr]
-            chr_genotype_haplotypes, chr_genotype_hti = [], 0
-            if chr in genotype_haplotypes:
-                chr_genotype_haplotypes = genotype_haplotypes[chr]
-
-        def add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num):
-            # Output variants with clinical significance
-            while chr_genotype_vari < len(chr_genotype_vars):
-                var_left, var_type, var_data, var_id =  chr_genotype_vars[chr_genotype_vari]
-                var_right = var_left
-                if var_type == "deletion":
-                    var_right += var_data
-                if var_right > right:
-                    break
-                if var_right >= left:
-                    chr_genotype_vari += 1
-                    continue
-
-                out_str = "%s\t%s\t%s\t%d\t%s" % (var_id, var_type, chr, var_left + off, var_data)
-                print >> var_out_file, out_str
-                print >> index_var_out_file, out_str
-
-                if var_id in genotype_clnsig:
-                    var_gene, clnsig = genotype_clnsig[var_id]
-                    print >> clnsig_out_file, "%s\t%s\t%s" % \
-                        (var_id, var_gene, clnsig)
-                
-                chr_genotype_vari += 1
-
-            # Output haplotypes
-            while chr_genotype_hti < len(chr_genotype_haplotypes):
-                ht_left, ht_right, ht_vars =  chr_genotype_haplotypes[chr_genotype_hti]
-                if ht_right > right:
-                    break
-                if ht_right >= left:
-                    chr_genotype_hti += 1
-                    continue
-
-                print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \
-                    (haplotype_num, chr, ht_left + off, ht_right + off, ','.join(ht_vars))
-                chr_genotype_hti += 1
-                haplotype_num += 1
-
-            return chr_genotype_vari, chr_genotype_hti, haplotype_num
-
-        out_chr_seq = ""
-        
-        off = 0
-        prev_right = 0
-        for gene in chr_genes:
-            left, right, length, name, family, exon_str, strand = gene
-
-            if not graph_index:
-                # Output gene (genotype_genome.gene)
-                print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \
-                    (family.upper(), name, chr, left, right, exon_str, strand)
-                continue            
-
-            chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(left, right, chr_genotype_vari, chr_genotype_hti, haplotype_num)
-
-            # Read HLA backbone sequences
-            allele_seqs = typing_common.read_allele_sequences("%s_backbone.fa" % family)
-
-            # Read HLA variants
-            allele_vars = typing_common.read_variants("%s.snp" % family)
-            allele_index_vars = typing_common.read_variants("%s.index.snp" % family)
-                
-            # Read HLA haplotypes
-            allele_haplotypes = typing_common.read_haplotypes("%s.haplotype" % family)
-
-            # Read HLA link information between haplotypes and variants
-            links = typing_common.read_links("%s.link" % family)
-
-            if name not in allele_seqs:
-                continue
-            if name not in allele_vars or name not in allele_index_vars:
-                vars, index_vars = [], []
-            else:
-                vars, index_vars = allele_vars[name], allele_index_vars[name]
-                
-            allele_seq = allele_seqs[name]
-            index_var_ids = set()
-            for _, _, _, var_id in index_vars:
-                index_var_ids.add(var_id)
-
-            if name not in allele_haplotypes:
-                haplotypes = []
-            else:
-                haplotypes = allele_haplotypes[name]
-            assert length == len(allele_seq)
-            assert left < chr_len and right < chr_len
-            # Skipping overlapping genes
-            if left < prev_right:
-                print >> sys.stderr, "Warning: skipping %s ..." % (name)
-                continue
-
-            varID2htID = {}
-
-            assert left < right
-            prev_length = right - left + 1
-            assert prev_length <= length
-
-            if prev_right < left:
-                out_chr_seq += chr_seq[prev_right:left]
-
-            # Output gene (genotype_genome.gene)
-            print >> locus_out_file, "%s\t%s\t%s\t%d\t%d\t%s\t%s" % \
-                (family.upper(), name, chr, len(out_chr_seq), len(out_chr_seq) + length - 1, exon_str, strand)
-
-            # Output coord (genotype_genome.coord)
-            print >> coord_out_file, "%s\t%d\t%d\t%d" % \
-                (chr, len(out_chr_seq), left, right - left + 1)
-            out_chr_seq += allele_seq
-
-            # Output variants (genotype_genome.snp and genotype_genome.index.snp)
-            for var in vars:
-                var_left, var_type, var_data, var_id = var
-                new_var_id = "hv%d" % var_num
-                varID2htID[var_id] = new_var_id
-                new_var_left = var_left + left + off
-                assert var_type in ["single", "deletion", "insertion"]
-                assert new_var_left < len(out_chr_seq)
-                if var_type == "single":                    
-                    assert out_chr_seq[new_var_left] != var_data
-                elif var_type == "deletion":
-                    assert new_var_left + var_data <= len(out_chr_seq)
-                else:
-                    assert var_type == "insertion"
-
-                out_str = "%s\t%s\t%s\t%d\t%s" % (new_var_id, var_type, chr, new_var_left, var_data)
-                print >> var_out_file, out_str
-                if var_id in index_var_ids:
-                    print >> index_var_out_file, out_str
-                var_num += 1
-                
-            # Output haplotypes (genotype_genome.haplotype)
-            for haplotype in haplotypes:
-                ht_left, ht_right, ht_vars = haplotype
-                new_ht_left = ht_left + left + off
-                assert new_ht_left < len(out_chr_seq)
-                new_ht_right = ht_right + left + off
-                assert new_ht_left <= new_ht_right
-                assert new_ht_right <= len(out_chr_seq)
-                new_ht_vars = []
-                for var_id in ht_vars:
-                    assert var_id in varID2htID
-                    new_ht_vars.append(varID2htID[var_id])
-                print >> haplotype_out_file, "ht%d\t%s\t%d\t%d\t%s" % \
-                    (haplotype_num, chr, new_ht_left, new_ht_right, ','.join(new_ht_vars))
-                haplotype_num += 1
-
-            # Output link information between alleles and variants (genotype_genome.link)
-            for link in links:
-                var_id, allele_names = link
-                if var_id not in varID2htID:
-                    continue
-                new_var_id = varID2htID[var_id]
-                print >> link_out_file, "%s\t%s" % (new_var_id, allele_names)
-                
-            off += (length - prev_length)
-
-            prev_right = right + 1
-
-        if not graph_index:
-            continue
-
-        # Write the rest of the Vars
-        chr_genotype_vari, chr_genotype_hti, haplotype_num = add_vars(sys.maxint, sys.maxint, chr_genotype_vari, chr_genotype_hti, haplotype_num)            
-            
-        print >> coord_out_file, "%s\t%d\t%d\t%d" % \
-            (chr, len(out_chr_seq), prev_right, len(chr_seq) - prev_right)
-        out_chr_seq += chr_seq[prev_right:]
-
-        assert len(out_chr_seq) == len(chr_seq) + off
-
-        # Output chromosome sequence
-        print >> genome_out_file, ">%s" % (chr_full_name)
-        line_width = 60
-        for s in range(0, len(out_chr_seq), line_width):
-            print >> genome_out_file, out_chr_seq[s:s+line_width]
-
-    genome_out_file.close()
-    locus_out_file.close()
-    var_out_file.close()
-    index_var_out_file.close()
-    haplotype_out_file.close()
-    link_out_file.close()
-    coord_out_file.close()
-    clnsig_out_file.close()
-
-    allele_out_file = open("%s.allele" % base_fname, 'w')
-    if graph_index:
-        for database in database_list:
-            for line in open("%s.allele" % database):
-                allele_name = line.strip()
-                print >> allele_out_file, "%s\t%s" % (database.upper(), allele_name)
-    allele_out_file.close()
-
-    partial_out_file = open("%s.partial" % base_fname, 'w')
-    if graph_index:
-        for database in database_list:
-            for line in open("%s.partial" % database):
-                allele_name = line.strip()
-                print >> partial_out_file, "%s\t%s" % (database.upper(), allele_name)
-    partial_out_file.close()
-
-    if not graph_index:
-        shutil.copyfile("genome.fa", "%s.fa" % base_fname)
-
-    # Index genotype_genome.fa
-    index_cmd = ["samtools", "faidx", "%s.fa" % base_fname]
-    subprocess.call(index_cmd)
-
-    # Build indexes based on the above information
-    if graph_index:
-        assert aligner == "hisat2"
-        build_cmd = ["hisat2-build",
-                     "-p", str(threads),
-                     "--snp", "%s.index.snp" % base_fname,
-                     "--haplotype", "%s.haplotype" % base_fname,
-                     "%s.fa" % base_fname,
-                     "%s" % base_fname]
-    else:        
-        assert aligner in ["hisat2", "bowtie2"]
-        build_cmd = ["%s-build" % aligner,
-                     "-p" if aligner == "hisat2" else "--threads", str(threads),
-                     "%s.fa" % base_fname,
-                     "%s" % base_fname]
-    if verbose:
-        print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-        
-    subprocess.call(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-
-    if aligner == "hisat2":
-        index_fnames = ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
-    else:
-        index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)]
-        index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)]
-    if not typing_common.check_files(index_fnames):
-        print >> sys.stderr, "Error: indexing failed!  Perhaps, you may have forgotten to build %s executables?" % aligner
-        sys.exit(1)
-
-        
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description="Build genotype genome")
-    parser.add_argument("--base", "--base-fname",
-                        dest="base_fname",
-                        type=str,
-                        default="genotype_genome",
-                        help="base filename for genotype genome (default: genotype_genome)")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--database-list",
-                        dest="database_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of databases (default: hla,codis,cyp)")
-    parser.add_argument("--commonvar",
-                        dest="use_commonvar",
-                        action="store_true",
-                        help="Include common variants from dbSNP")
-    parser.add_argument("--clinvar",
-                        dest="use_clinvar",
-                        action="store_true",
-                        help="Include variants from ClinVar database")
-    parser.add_argument("--inter-gap",
-                        dest="inter_gap",
-                        type=int,
-                        default=30,
-                        help="Maximum distance for variants to be in the same haplotype")
-    parser.add_argument("--intra-gap",
-                        dest="intra_gap",
-                        type=int,
-                        default=50,
-                        help="Break a haplotype into several haplotypes")
-    parser.add_argument("--aligner",
-                        dest="aligner",
-                        type=str,
-                        default="hisat2",
-                        help="Aligner (default: hisat2)")
-    parser.add_argument("--linear-index",
-                        dest="graph_index",
-                        action="store_false",
-                        help="Build linear index")
-    parser.add_argument("-v", "--verbose",
-                        dest="verbose",
-                        action="store_true",
-                        help="also print some statistics to stderr")
-
-    args = parser.parse_args()
-    if args.inter_gap > args.intra_gap:
-        print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap)
-        sys.exit(1)
-        
-    if args.database_list == "":
-        database_list = []
-    else:
-        database_list = args.database_list.split(',')
-
-    if args.use_clinvar and args.use_commonvar:
-        print >> sys.stderr, "Error: both --clinvar and --commonvar cannot be used together."
-        sys.exit(1)
-
-    if args.aligner not in ["hisat2", "bowtie2"]:
-        print >> sys.stderr, "Error: --aligner should be either hisat2 or bowtie2."
-        sys.exit(1)        
-        
-    build_genotype_genome(args.base_fname,
-                          args.inter_gap,
-                          args.intra_gap,
-                          args.threads,
-                          database_list,
-                          args.use_clinvar,
-                          args.use_commonvar,
-                          args.aligner,
-                          args.graph_index,
-                          args.verbose)
-    
diff --git a/hisatgenotype_extract_reads.py b/hisatgenotype_extract_reads.py
deleted file mode 100755
index 98215655..00000000
--- a/hisatgenotype_extract_reads.py
+++ /dev/null
@@ -1,541 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT-genotype.
-#
-# HISAT-genotype is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT-genotype is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re, resource
-import inspect
-import random
-import glob
-from argparse import ArgumentParser, FileType
-import hisatgenotype_typing_common as typing_common
-
-
-"""
-"""
-def parallel_work(pids, 
-                  work, 
-                  fq_fname_base, 
-                  fq_fname, 
-                  fq_fname2, 
-                  ranges,
-                  simulation,
-                  verbose):
-    child = -1
-    for i in range(len(pids)):
-        if pids[i] == 0:
-            child = i
-            break
-
-    while child == -1:
-        status = os.waitpid(0, 0)
-        for i in range(len(pids)):
-            if status[0] == pids[i]:
-                child = i
-                pids[i] = 0
-                break
-
-    child_id = os.fork()
-    if child_id == 0:
-        work(fq_fname_base, 
-             fq_fname, 
-             fq_fname2, 
-             ranges,
-             simulation,
-             verbose)
-        os._exit(os.EX_OK)
-    else:
-        # print >> sys.stderr, '\t\t>> thread %d: %d' % (child, child_id)
-        pids[child] = child_id
-
-        
-"""
-"""
-def wait_pids(pids):
-    for pid in pids:
-        if pid > 0:
-            os.waitpid(pid, 0)
-            
-
-"""
-"""
-def extract_reads(base_fname,
-                  database_list,
-                  read_dir,
-                  out_dir,
-                  suffix,
-                  read_fname,
-                  fastq,
-                  paired,
-                  simulation,
-                  threads,
-                  threads_aprocess,
-                  max_sample,
-                  job_range,
-                  aligner,
-                  block_size,
-                  verbose):
-    if block_size > 0:
-        resource.setrlimit(resource.RLIMIT_NOFILE, (1000, 1000))
-        resource.setrlimit(resource.RLIMIT_NPROC, (1000, 1000))
-        
-    genotype_fnames = ["%s.fa" % base_fname,
-                       "%s.locus" % base_fname,
-                       "%s.snp" % base_fname,
-                       "%s.haplotype" % base_fname,
-                       "%s.link" % base_fname,
-                       "%s.coord" % base_fname,
-                       "%s.clnsig" % base_fname]
-    # graph index files
-    if aligner == "hisat2":
-        genotype_fnames += ["%s.%d.ht2" % (base_fname, i+1) for i in range(8)]
-    else:
-        assert aligner == "bowtie2"
-        genotype_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)]
-        genotype_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)]
-        
-    if not typing_common.check_files(genotype_fnames):        
-        print >> sys.stderr, "Error: %s related files do not exist as follows:" % base_fname
-        for fname in genotype_fnames:
-            print >> sys.stderr, "\t%s" % fname
-        sys.exit(1)
-
-    filter_region = len(database_list) > 0
-    ranges = []
-    regions, region_loci = {}, {}
-    for line in open("%s.locus" % base_fname):
-        family, allele_name, chr, left, right = line.strip().split()[:5]
-        if filter_region and family.lower() not in database_list:
-            continue
-        region_name = "%s-%s" % (family, allele_name.split('*')[0])
-        assert region_name not in regions
-        regions[region_name] = allele_name
-        left, right = int(left), int(right)
-        """
-        exons = []
-        for exon in exon_str.split(','):
-            exon_left, exon_right = exon.split('-')
-            exons.append([int(exon_left), int(exon_right)])
-        """
-        if chr not in region_loci:
-            region_loci[chr] = {}
-        region_loci[chr][region_name] = [allele_name, chr, left, right]
-        database_list.add(family.lower())
-
-    if out_dir != "" and not os.path.exists(out_dir):
-        os.mkdir(out_dir)
-
-    # Extract reads
-    if len(read_fname) > 0:
-        if paired:
-            fq_fnames = [read_fname[0]]
-            fq_fnames2 = [read_fname[1]]
-        else:
-            fq_fnames = read_fname
-    else:
-        if paired:
-            fq_fnames = glob.glob("%s/*.1.%s" % (read_dir, suffix))
-        else:
-            fq_fnames = glob.glob("%s/*.%s" % (read_dir, suffix))
-    count = 0
-    pids = [0 for i in range(threads)]
-    for file_i in range(len(fq_fnames)):
-        if file_i >= max_sample:
-            break
-        fq_fname = fq_fnames[file_i]
-        if job_range[1] > 1:
-            if job_range[0] != (file_i % job_range[1]):
-                continue
-
-        fq_fname_base = fq_fname.split('/')[-1]
-        one_suffix = ".1." + suffix
-        if fq_fname_base.find(one_suffix) != -1:
-            fq_fname_base = fq_fname_base[:fq_fname_base.find(one_suffix)]
-        else:
-            fq_fname_base = fq_fname_base.split('.')[0]
-            
-        if paired:
-            if read_dir == "":
-                fq_fname2 = fq_fnames2[file_i]
-            else:
-                fq_fname2 = "%s/%s.2.%s" % (read_dir, fq_fname_base, suffix)
-            if not os.path.exists(fq_fname2):
-                print >> sys.stderr, "%s does not exist." % fq_fname2
-                continue
-        else:
-            fq_fname2 = ""
-
-        if paired:
-            if out_dir != "":
-                if os.path.exists("%s/%s.extracted.1.fq.gz" % (out_dir, fq_fname_base)):
-                    continue
-        else:
-            if out_dir != "":
-                if os.path.exists("%s/%s.extracted.fq.gz" % (out_dir, fq_fname_base)):
-                    continue
-        count += 1
-
-        print >> sys.stderr, "\t%d: Extracting reads from %s" % (count, fq_fname_base)
-        def work(fq_fname_base,
-                 fq_fname, 
-                 fq_fname2, 
-                 ranges,
-                 simulation,
-                 verbose):
-            aligner_cmd = [aligner]
-            if threads_aprocess > 1:
-                aligner_cmd += ["-p", "%d" % threads_aprocess]
-            if not fastq:
-                aligner_cmd += ["-f"]
-            aligner_cmd += ["-x", base_fname]
-            if aligner == "hisat2":
-                aligner_cmd += ["--no-spliced-alignment"]
-                # aligner_cmd += ["--max-altstried", "64"]
-            aligner_cmd += ["-X", "1000"]
-            if paired:
-                aligner_cmd += ["-1", fq_fname,
-                                "-2", fq_fname2]
-            else:
-                aligner_cmd += ["-U", fq_fname]
-            if verbose:
-                print >> sys.stderr, "\t\trunning", ' '.join(aligner_cmd)
-            align_proc = subprocess.Popen(aligner_cmd,
-                                          stdout=subprocess.PIPE,
-                                          stderr=open("/dev/null", 'w'))
-
-            gzip_dic = {}
-            out_dir_slash = out_dir
-            if out_dir != "":
-                out_dir_slash += "/"
-            for database in database_list:
-                if paired:
-                    # LP6005041-DNA_A01.extracted.1.fq.gz
-                    gzip1_proc = subprocess.Popen(["gzip"],
-                                                  stdin=subprocess.PIPE,
-                                                  stdout=open("%s%s.%s.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'),
-                                                  stderr=open("/dev/null", 'w'))
-
-                    # LP6005041-DNA_A01.extracted.2.fq.gz
-                    gzip2_proc = subprocess.Popen(["gzip"],
-                                                  stdin=subprocess.PIPE,
-                                                  stdout=open("%s%s.%s.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'),
-                                                  stderr=open("/dev/null", 'w'))
-                else:
-                    # LP6005041-DNA_A01.extracted.fq.gz
-                    gzip1_proc = subprocess.Popen(["gzip"],
-                                                  stdin=subprocess.PIPE,
-                                                  stdout=open("%s%s.%s.extracted.fq.gz" % (out_dir_slash, fq_fname_base, database), 'w'),
-                                                  stderr=open("/dev/null", 'w'))
-                gzip_dic[database] = [gzip1_proc, gzip2_proc if paired else None]
-
-            whole_gzip_dic = {}
-            if block_size > 0:
-                mult = block_size / 1000000
-                for chr_line in open("%s.fa.fai" % base_fname):
-                    chr, length = chr_line.strip().split('\t')[:2]
-                    length = int(length)
-                    if chr not in [str(i+1) for i in range(22)] + ['X', 'Y', 'MT']:
-                        continue
-                    length = (length + block_size - 1) / block_size
-                    assert chr not in whole_gzip_dic
-                    whole_gzip_dic[chr] = []
-                    for region_i in range(length):
-                        if paired:
-                            # LP6005041-DNA_A01.extracted.1.fq.gz
-                            gzip1_proc = subprocess.Popen(["gzip"],
-                                                          stdin=subprocess.PIPE,
-                                                          stdout=open("%s%s.%s.%d_%dM.extracted.1.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'),
-                                                          stderr=open("/dev/null", 'w'))
-
-                            # LP6005041-DNA_A01.extracted.2.fq.gz
-                            gzip2_proc = subprocess.Popen(["gzip"],
-                                                          stdin=subprocess.PIPE,
-                                                          stdout=open("%s%s.%s.%d_%dM.extracted.2.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'),
-                                                          stderr=open("/dev/null", 'w'))
-                        else:
-                            # LP6005041-DNA_A01.extracted.fq.gz
-                            gzip1_proc = subprocess.Popen(["gzip"],
-                                                          stdin=subprocess.PIPE,
-                                                          stdout=open("%s%s.%s.%d_%dM.extracted.fq.gz" % (out_dir_slash, fq_fname_base, chr, region_i * mult, (region_i + 1) * mult), 'w'),
-                                                          stderr=open("/dev/null", 'w'))
-                        whole_gzip_dic[chr].append([gzip1_proc, gzip2_proc if paired else None])
-
-
-            def write_read(gzip_proc, read_name, seq, qual):
-                if fastq:
-                    gzip_proc.stdin.write("@%s\n" % read_name)
-                    gzip_proc.stdin.write("%s\n" % seq)
-                    gzip_proc.stdin.write("+\n")
-                    gzip_proc.stdin.write("%s\n" % qual)
-                else:
-                    gzip_proc.stdin.write(">%s\n" % prev_read_name)
-                    gzip_proc.stdin.write("%s\n" % seq)                    
-
-            prev_read_name, extract_read, whole_extract_read, read1, read2, read1_first, read2_first = "", set(), set(), [], [], True, True
-            for line in align_proc.stdout:
-                if line.startswith('@'):
-                    continue
-                line = line.strip()
-                cols = line.split()
-                read_name, flag, chr, pos, mapQ, cigar, _, _, _, read, qual = cols[:11]
-                flag, pos = int(flag), int(pos) - 1
-                strand = '-' if flag & 0x10 else '+'                   
-                AS, XS, NH = "", "", ""
-                for i in range(11, len(cols)):
-                    col = cols[i]
-                    if col.startswith("AS"):
-                        AS = int(col[5:])
-                    elif col.startswith("XS"):
-                        XS = int(col[5:])
-                    elif col.startswith("NH"):
-                        NH = int(col[5:])
-
-                if (not simulation and read_name != prev_read_name) or \
-                   (simulation and read_name.split('|')[0] != prev_read_name.split('|')[0]):
-                    for region in extract_read:
-                        write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1])
-                        if paired:
-                            write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1])
-                            
-                    for chr_region_num in whole_extract_read:
-                        region_chr, region_num = chr_region_num.split('-')
-                        region_num = int(region_num)
-                        if region_chr not in whole_gzip_dic:
-                            continue
-
-                        assert region_num < len(whole_gzip_dic[region_chr])
-                        write_read(whole_gzip_dic[region_chr][region_num][0], prev_read_name, read1[0], read1[1])
-                        if paired:
-                            write_read(whole_gzip_dic[region_chr][region_num][1], prev_read_name, read2[0], read2[1])
-
-                    prev_read_name, extract_read, whole_extract_read, read1, read2, read1_first, read2_first = read_name, set(), set(), [], [], True, True
-
-                if flag & 0x4 == 0 and \
-                   ((aligner == "hisat2" and NH == 1) or (aligner == "bowtie2" and AS > XS and read1_first if flag & 0x40 or not paired else read2_first)):
-                    if chr in region_loci:
-                        for region, loci in region_loci[chr].items():
-                            region = region.split('-')[0].lower()
-                            _, _, loci_left, loci_right = loci
-                            # there might be a different candidate region for each of left and right reads
-                            if pos >= loci_left and pos < loci_right:
-                                extract_read.add(region)
-                                break
-                    if block_size > 0:
-                        chr_region_num = "%s-%d" % (chr, pos / block_size)
-                        whole_extract_read.add(chr_region_num)
-
-                if flag & 0x40 or not paired: # left read
-                    read1_first = False
-                    if not read1:
-                        if flag & 0x10: # reverse complement
-                            read1 = [typing_common.reverse_complement(read), qual[::-1]]
-                        else:
-                            read1 = [read, qual]
-                else:
-                    assert flag & 0x80 # right read
-                    read2_first = False
-                    if flag & 0x10: # reverse complement
-                        read2 = [typing_common.reverse_complement(read), qual[::-1]]
-                    else:
-                        read2 = [read, qual]
-
-            for region in extract_read:
-                write_read(gzip_dic[region][0], prev_read_name, read1[0], read1[1])
-                if paired:
-                    write_read(gzip_dic[region][1], prev_read_name, read2[0], read2[1])
-
-            for chr_region_num in whole_extract_read:
-                region_chr, region_num = chr_region_num.split('-')
-                region_num = int(region_num)
-                if region_chr not in whole_gzip_dic:
-                    continue
-                assert region_num < len(whole_gzip_dic[region_chr])
-                write_read(whole_gzip_dic[region_chr][region_num][0], prev_read_name, read1[0], read1[1])
-                if paired:
-                    write_read(whole_gzip_dic[region_chr][region_num][1], prev_read_name, read2[0], read2[1])
-
-            for gzip1_proc, gzip2_proc in gzip_dic.values():
-                gzip1_proc.stdin.close()
-                if paired:
-                    gzip2_proc.stdin.close()
-
-            for gzip_list in whole_gzip_dic.values():
-                for gzip1_proc, gzip2_proc in gzip_list:
-                    gzip1_proc.stdin.close()
-                    if paired:
-                        gzip2_proc.stdin.close()         
-
-
-        if threads <= 1:
-            work(fq_fname_base, 
-                 fq_fname, 
-                 fq_fname2,
-                 ranges,
-                 simulation,
-                 verbose)
-        else:
-            parallel_work(pids, 
-                          work, 
-                          fq_fname_base, 
-                          fq_fname, 
-                          fq_fname2, 
-                          ranges,
-                          simulation,
-                          verbose)
-
-    if threads > 1:
-        wait_pids(pids)
-
-
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='Extract reads')
-    parser.add_argument("--base", "--base-fname",
-                        dest="base_fname",
-                        type=str,
-                        default="genotype_genome",
-                        help="base filename for genotype genome")
-    parser.add_argument("--read-dir",
-                        dest="read_dir",
-                        type=str,
-                        default="",
-                        help="Directory name for read files")
-    parser.add_argument("--out-dir",
-                        dest="out_dir",
-                        type=str,
-                        default="",
-                        help="Directory name for extracted read files")
-    parser.add_argument("--suffix",
-                        dest="suffix",
-                        type=str,
-                        default="fq.gz",
-                        help="Read file suffix (Default: fq.gz)")
-    parser.add_argument('-f', '--fasta',
-                        dest='fastq',
-                        action='store_false',
-                        help='FASTA format')
-    parser.add_argument("-U",
-                        dest="read_fname_U",
-                        type=str,
-                        default="",
-                        help="filename for single-end reads")
-    parser.add_argument("-1",
-                        dest="read_fname_1",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")
-    parser.add_argument("-2",
-                        dest="read_fname_2",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")    
-    parser.add_argument("--database-list",
-                        dest="database_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of database (default: empty)")
-    parser.add_argument('--simulation',
-                        dest='simulation',
-                        action='store_true',
-                        help='Simulated reads (Default: False)')    
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--pp", "--threads-aprocess",
-                        dest="threads_aprocess",
-                        type=int,
-                        default=1,
-                        help="Number of threads a process")
-    parser.add_argument("--max-sample",
-                        dest="max_sample",
-                        type=int,
-                        default=sys.maxint,
-                        help="Number of samples to be extracted (default: sys.maxint)")
-    parser.add_argument("--job-range",
-                        dest="job_range",
-                        type=str,
-                        default="0,1",
-                        help="two numbers (e.g. 1,3)")
-    parser.add_argument("--aligner",
-                        dest="aligner",
-                        type=str,
-                        default="hisat2",
-                        help="Aligner (default: hisat2)")
-    parser.add_argument("--extract-whole",
-                        dest="extract_whole",
-                        action='store_true',
-                        help="Extract all reads")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-
-    args = parser.parse_args()
-
-    database_list = set()
-    if args.database_list != "":
-        for region in args.database_list.split(','):
-            database_list.add(region)
-    if args.read_fname_U != "":
-        args.read_fname = [args.read_fname_U]
-    elif args.read_fname_1 != "" or args.read_fname_2 != "":
-        if args.read_fname_1 == "" or args.read_fname_2 == "":
-            print >> sys.stderr, "Error: please specify both -1 and -2."
-            sys.exit(1)
-        args.read_fname = [args.read_fname_1, args.read_fname_2]
-    else:
-        args.read_fname = []
-    if len(args.read_fname) == 0:
-        if args.read_dir == "" or not os.path.exists(args.read_dir):
-            print >> sys.stderr, "Error: please specify --read-dir with an existing directory."
-            sys.exit(1)
-        if args.out_dir == "":
-            print >> sys.stderr, "Error: please specify --out-dir with a directory name."
-            sys.exit(1)
-    job_range = []
-    for num in args.job_range.split(','):
-        job_range.append(int(num))
-
-    if args.aligner not in ["hisat2", "bowtie2"]:
-        print >> sys.stderr, "Error: --aligner should be either hisat2 or bowtie2."
-        sys.exit(1)        
-    block_size = 20000000 if args.extract_whole else 0
-        
-    extract_reads(args.base_fname,
-                  database_list,
-                  args.read_dir,
-                  args.out_dir,
-                  args.suffix,
-                  args.read_fname,
-                  args.fastq,
-                  False if args.read_fname_U != "" else True,
-                  args.simulation,
-                  args.threads,
-                  args.threads_aprocess,
-                  args.max_sample,
-                  job_range,
-                  args.aligner,
-                  block_size,
-                  args.verbose)
-
diff --git a/hisatgenotype_extract_vars.py b/hisatgenotype_extract_vars.py
deleted file mode 100755
index 4c673177..00000000
--- a/hisatgenotype_extract_vars.py
+++ /dev/null
@@ -1,1299 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import os, sys, subprocess, re
-import inspect
-import glob
-from argparse import ArgumentParser, FileType
-import hisatgenotype_typing_common as typing_common
-
-
-"""
-Mapping from base pair to a location in MSF format
-"""
-def create_map(seq):
-    seq_map = {}
-    count = 0
-    for i in range(len(seq)):
-        bp = seq[i]
-        if bp == '.':
-            continue
-        assert bp in "ACGT"
-        seq_map[count] = i
-        count += 1
-    return seq_map
-
-
-"""
-"""
-def create_consensus_seq(seqs,
-                         seq_len,
-                         min_var_freq,
-                         remove_empty = True):
-    consensus_freq = [[0, 0, 0, 0, 0] for i in range(seq_len)]
-    for i in range(len(seqs)):                
-        seq = seqs[i]
-        if len(seq) != seq_len:
-            continue                    
-        for j in range(seq_len):
-            nt = seq[j]
-            assert nt in "ACGT.E"
-            if nt == 'A':
-                consensus_freq[j][0] += 1
-            elif nt == 'C':
-                consensus_freq[j][1] += 1
-            elif nt == 'G':
-                consensus_freq[j][2] += 1
-            elif nt == 'T':
-                consensus_freq[j][3] += 1
-            else:
-                assert nt in ".E"
-                consensus_freq[j][4] += 1
-
-    for j in range(len(consensus_freq)):
-        for k in range(len(consensus_freq[j])):
-            consensus_freq[j][k] /= float(len(seqs))
-            consensus_freq[j][k] *= 100.0
-
-    consensus_seq = ""
-    has_empty = False
-    for c in range(len(consensus_freq)):
-        freq = consensus_freq[c]
-        A, C, G, T, E = freq
-        # No alleles have bases at this particular location
-        if E >= 100.0:
-            has_empty = True
-            consensus_seq += 'E'
-            continue
-        if E >= 100.0 - min_var_freq:
-            idx = 4
-        else:
-            idx = freq.index(max(freq[:4]))
-        assert idx < 5
-        consensus_seq += "ACGT."[idx]
-    consensus_seq = ''.join(consensus_seq)
-
-    # Remove dots (deletions)
-    skip_pos = set()
-    if has_empty and remove_empty:
-        for seq_i in range(len(seqs)):
-            seqs[seq_i] = list(seqs[seq_i])
-        for i in range(len(consensus_seq)):
-            if consensus_seq[i] != 'E':
-                continue
-            skip_pos.add(i)
-            for seq_i in range(len(seqs)):
-                if i >= len(seqs[seq_i]):
-                    continue
-                seqs[seq_i][i] = 'E'
-        for seq_i in range(len(seqs)):
-            seqs[seq_i] = ''.join(seqs[seq_i])
-            seqs[seq_i] = seqs[seq_i].replace('E', '')
-        consensus_seq = consensus_seq.replace('E', '')
-
-    # Convert a list form of consensus_freq to a dictionary form
-    temp_freq = []
-    for j in range(len(consensus_freq)):
-        if j in skip_pos:
-            continue
-        freq_dic = {}
-        for k in range(len(consensus_freq[j])):
-            freq = consensus_freq[j][k]
-            if freq <= 0.0:
-                continue
-            nt = "ACGT."[k]                    
-            freq_dic[nt] = freq
-        temp_freq.append(freq_dic)
-    consensus_freq = temp_freq
-
-    assert len(consensus_seq) == len(consensus_freq)                
-    return consensus_seq, consensus_freq
-
-
-
-"""
-Left-shift deletions if poissble
-"""
-def leftshift_deletions(backbone_seq, seq, debug = False):
-    if len(seq) != len(backbone_seq):
-        return seq
-    seq = list(seq)
-    seq_len = len(seq)
-    bp_i = 0
-    # Skip the first deletion
-    while bp_i < seq_len:
-        if seq[bp_i] in "ACGT":
-            break
-        bp_i += 1
-
-    while bp_i < seq_len:
-        bp = seq[bp_i]
-        if bp != '.':
-            bp_i += 1
-            continue
-        bp_j = bp_i + 1
-        while bp_j < seq_len:
-            bp2 = seq[bp_j]
-            if bp2 != '.':
-                break
-            else:
-                bp_j += 1
-
-        if bp_j >= seq_len:
-            bp_i = bp_j
-            break
-
-        if debug:
-            print >> sys.stderr, bp_i, bp_j, backbone_seq[bp_i-10:bp_i], backbone_seq[bp_i:bp_j], backbone_seq[bp_j:bp_j+10]
-            print >> sys.stderr, bp_i, bp_j, ''.join(seq[bp_i-10:bp_i]), ''.join(seq[bp_i:bp_j]), ''.join(seq[bp_j:bp_j+10])
-        prev_i, prev_j = bp_i, bp_j
-
-        while bp_i > 0 and seq[bp_i-1] in "ACGT" and backbone_seq[bp_j-1] in "ACGT":
-            if seq[bp_i-1] != backbone_seq[bp_j-1]:
-                break
-            seq[bp_j-1] = seq[bp_i-1]
-            seq[bp_i-1] = '.'
-            bp_i -= 1
-            bp_j -= 1
-        bp_i = bp_j
-        while bp_i < seq_len:
-            if seq[bp_i] in "ACGT":
-                break
-            bp_i += 1
-
-        # DK - debugging purposes
-        if debug:
-            print prev_i, prev_j, ''.join(seq[prev_i-10:prev_i]), ''.join(seq[prev_i:prev_j]), ''.join(seq[prev_j:prev_j+10])
-
-    return ''.join(seq)
-
-
-"""
-"""
-def extract_vars(base_fname,
-                 base_dname,
-                 locus_list,
-                 inter_gap,
-                 intra_gap,
-                 whole_haplotype,
-                 min_var_freq,
-                 ext_seq_len,
-                 leftshift,
-                 partial,
-                 verbose):
-    base_fullpath_name = base_fname
-    if base_dname != "" and not os.path.exists(base_dname):
-        os.mkdir(base_dname)
-        base_fullpath_name = "%s/%s" % (base_dname, base_fname)
-
-    # Download human genome and HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-
-    if not typing_common.check_files(HISAT2_fnames):
-        typing_common.download_genome_and_index()
-    
-    # Corresponding genomic loci found by HISAT2 (reference is GRCh38)
-    #   e.g. hisat2 --no-unal --score-min C,0 -x grch38/genome -f hisatgenotype_db/HLA/fasta/A_gen.fasta
-    locus_file = open(base_fullpath_name + ".locus", 'w')
-    left_ext_seq_dic, right_ext_seq_dic = {}, {}
-    genes, gene_strand = {}, {}
-
-    # Clone a git repository, hisatgenotype_db
-    if not os.path.exists("hisatgenotype_db"):
-        typing_common.clone_hisatgenotype_database()
-    fasta_dname = "hisatgenotype_db/%s/fasta" % base_fname.upper()
-
-    # Check HLA genes
-    gene_names = []
-    if base_fname == "hla":
-        fasta_fnames = glob.glob("%s/*_gen.fasta" % fasta_dname)
-    else:
-        assert base_fname in ["codis", "cyp"]
-        fasta_fnames = glob.glob("%s/*.fasta" % fasta_dname)
-    for gen_fname in fasta_fnames:
-        gene_name = gen_fname.split('/')[-1].split('_')[0]
-        if gene_name == "hla":
-            continue
-        gene_names.append(gene_name)
-
-    if locus_list == []:
-        locus_list = gene_names
-
-    cigar_re = re.compile('\d+\w')
-    remove_locus_list = []
-    for gene in locus_list:
-        aligner_cmd = ["hisat2"]
-        if base_fname in ["hla", "coids"]:
-            aligner_cmd += ["--score-min", "C,0"]
-        aligner_cmd += ["--no-unal",
-                        "-x", "grch38/genome",
-                        "-f", "%s/%s_gen.fasta" % (fasta_dname, gene)]
-        align_proc = subprocess.Popen(aligner_cmd,
-                                      stdout=subprocess.PIPE,
-                                      stderr=open("/dev/null", 'w'))
-        allele_id = ""
-        best_chr, best_left, best_right, best_AS, best_strand = "", -1, -1, -sys.maxint, ''
-        for line in align_proc.stdout:
-            if line.startswith('@'):
-                continue
-            line = line.strip()
-            cols = line.split()
-            temp_allele_id, flag, chr, left, _, cigar_str = cols[:6]
-            left = int(left) - 1
-            right = left
-            cigars = cigar_re.findall(cigar_str)
-            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-            if len(cigars) > 1 or cigars[0][0] != 'M':
-                continue
-            for i in range(len(cigars)):
-                cigar_op, length = cigars[i]
-                if cigar_op in "MND":
-                    right += length
-
-            flag = int(flag)
-            strand = '-' if flag & 0x10 else '+'
-            AS = ""
-            for i in range(11, len(cols)):
-                col = cols[i]
-                if col.startswith("AS"):
-                    AS = col[5:]
-            assert AS != ""
-            AS = int(AS)
-            if AS > best_AS:
-                allele_id = temp_allele_id
-                best_chr, best_left, best_right, best_AS, best_strand = chr, left, right, AS, strand
-
-        chr, left, right, strand = best_chr, best_left, best_right, best_strand
-        align_proc.communicate()
-        if allele_id == "":
-            remove_locus_list.append(gene)
-            continue
-        if base_fname == "hla":
-            allele_name = ""
-            for line in open("%s/%s_gen.fasta" % (fasta_dname, gene)):
-                line = line.strip()
-                if not line.startswith('>'):
-                    continue
-                tmp_allele_id, tmp_allele_name = line[1:].split()[:2]
-                if allele_id == tmp_allele_id:
-                    allele_name = tmp_allele_name
-                    break
-        else:
-            allele_name = allele_id
-        assert allele_name != "" and strand != ''
-        genes[gene] = allele_name
-        gene_strand[gene] = strand
-        print >> sys.stderr, "%s-%s's reference allele is %s on '%s' strand of chromosome %s" % \
-            (base_fname.upper(), gene, allele_name, strand, chr)
-
-        assert chr != "" and left >= 0 and right > left
-        if ext_seq_len > 0:
-            left_ext_seq, right_ext_seq = "", ""
-            left1, left2 = max(1, left - ext_seq_len), max(1, left - 1)
-            if left2 > 0:
-                extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, left1, left2)]
-                extract_seq_proc = subprocess.Popen(extract_seq_cmd,
-                                                    stdout=subprocess.PIPE,
-                                                    stderr=open("/dev/null", 'w'))
-                for line in extract_seq_proc.stdout:
-                    if line.startswith('>'):
-                        continue
-                    line = line.strip()
-                    left_ext_seq += line
-            extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + ext_seq_len - 1)]
-            extract_seq_proc = subprocess.Popen(extract_seq_cmd,
-                                                stdout=subprocess.PIPE,
-                                                stderr=open("/dev/null", 'w'))
-            for line in extract_seq_proc.stdout:
-                if line.startswith('>'):
-                    continue
-                line = line.strip()
-                right_ext_seq += line
-
-            if strand == '-':
-                left_ext_seq, right_ext_seq = typing_common.reverse_complement(right_ext_seq), typing_common.reverse_complement(left_ext_seq)
-            left_ext_seq_dic[gene], right_ext_seq_dic[gene] = left_ext_seq, right_ext_seq
-            
-
-    # Extract exon information from hla.data
-    gene_exons, gene_exon_counts = {}, {}
-    if base_fname == "hla":        
-        skip, look_exon_num = False, False
-        for line in open("hisatgenotype_db/%s/hla.dat" % base_fname.upper()):
-            if line.startswith("DE"):
-                allele_name = line.split()[1][:-1]
-                if allele_name.startswith("HLA-"):
-                    allele_name = allele_name[4:]
-                gene = allele_name.split('*')[0]
-                if not gene in genes:
-                    skip = True
-                else:
-                    skip = False
-            if skip:
-                continue
-            if not line.startswith("FT"):
-                continue
-            
-            if line.find("exon") != -1:
-                look_exon_num = True
-                if allele_name == genes[gene]:
-                    exon_range = line.split()[2].split("..")
-                    exon_left, exon_right = int(exon_range[0]) - 1, int(exon_range[1]) - 1
-                    assert exon_left >= 0
-                    assert exon_left < exon_right
-                    if not gene in gene_exons:
-                        gene_exons[gene] = []
-                    if gene in left_ext_seq_dic:
-                        left_ext_seq_len = len(left_ext_seq_dic[gene])
-                    else:
-                        left_ext_seq_len = 0
-                    gene_exons[gene].append([exon_left + left_ext_seq_len, exon_right + left_ext_seq_len])
-            elif look_exon_num:
-                assert line.find("number")
-                look_exon_num = False
-                num = line.strip().split("number=")[1]
-                num = int(num[1:-1]) - 1
-                if gene not in gene_exon_counts:
-                    gene_exon_counts[gene] = {}
-                if num not in gene_exon_counts[gene]:
-                    gene_exon_counts[gene][num] = 1
-                else:
-                    gene_exon_counts[gene][num] += 1
-                
-        for gene, exon_counts in gene_exon_counts.items():
-            print >> sys.stderr, "%s exon counts:" % gene, exon_counts
-
-    tmp_locus_list = []
-    for gene in locus_list:
-        if gene in remove_locus_list:
-            continue
-        if base_fname == "hla" and gene not in gene_exons:
-            continue
-        tmp_locus_list.append(gene)
-    locus_list = tmp_locus_list
-    for key in genes.keys():
-        if key in locus_list:
-            continue
-        del genes[key]
-        del gene_strand[key]
-
-    # Write the backbone sequences into a fasta file
-    backbone_file = open(base_fullpath_name + "_backbone.fa", 'w')        
-    # variants w.r.t the backbone sequences into a SNP file
-    var_file = open(base_fullpath_name + ".snp", 'w')
-    var_index_file = open(base_fullpath_name + ".index.snp", 'w')
-    # variant frequence
-    var_freq_file = open(base_fullpath_name + ".snp.freq", 'w')
-    # haplotypes
-    haplotype_file = open(base_fullpath_name + ".haplotype", 'w')
-    # pairs of a variant and the corresponding HLA allels into a LINK file    
-    link_file = open(base_fullpath_name + ".link", 'w')
-    # Write all the sequences with dots removed into a file
-    input_file = open(base_fullpath_name + "_sequences.fa", 'w')
-    # Write allele names into a file
-    allele_file = open("%s.allele" % base_fullpath_name, 'w')
-    # Read partial alleles from hla.data, and write them into a file
-    partial_file = open("%s.partial" % base_fullpath_name, 'w')
-    
-    num_vars, num_haplotypes = 0, 0
-    full_alleles = {}
-    for gene, ref_gene in genes.items():
-        strand = gene_strand[gene]
-        left_ext_seq, right_ext_seq = "", ""
-        if gene in left_ext_seq_dic:
-            left_ext_seq, right_ext_seq = left_ext_seq_dic[gene], right_ext_seq_dic[gene]
-
-        def read_MSF_file(fname, left_ext_seq = "", right_ext_seq = ""):
-            names = {} # HLA allele names to numeric IDs
-            seqs = []  # HLA multiple alignment sequences
-            for line in open(fname):
-                line = line.strip()
-                if not line or \
-                        not line[0].isalnum():
-                    continue
-
-                if line.startswith("MSF"):
-                    continue
-
-                if line.startswith("Name"):
-                    try:
-                        name = line.split('\t')[0]
-                        name = name.split()[1]
-                    except ValueError:
-                        continue
-
-                    if name in names:
-                        print >> sys.stderr, "Warning: %s is found more than once in Names" % (name)
-                        continue
-
-                    names[name] = len(names)
-                else:
-                    if len(seqs) == 0:
-                        seqs = [left_ext_seq for i in range(len(names))]
-                    try:
-                        cols = line.split()
-                        name = cols[0]
-                        fives = cols[1:]
-                        assert len(fives) > 0
-                    except ValueError:
-                        continue
-
-                    if name not in names:
-                        names[name] = len(names)
-
-                    id = names[name]
-                    if id >= len(seqs):
-                        assert id == len(seqs)
-                        seqs.append(left_ext_seq)
-                        
-                    seqs[id] += ''.join(fives)
-
-                    # Add sub-names of the allele
-                    sub_name = ""
-                    for group in name.split(':')[:-1]:
-                        if sub_name != "":
-                            sub_name += ":"
-                        sub_name += group
-                        if sub_name not in full_alleles:
-                            full_alleles[sub_name] = [name]
-                        else:
-                            full_alleles[sub_name].append(name)
-
-            if len(right_ext_seq) > 0:
-                for i_ in range(len(seqs)):
-                    seqs[i_] += right_ext_seq
-
-            return names, seqs
-
-        if base_fname == "hla":
-            MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene)
-        else:
-            MSA_fname = "hisatgenotype_db/%s/msf/%s_gen.msf" % (base_fname.upper(), gene)
-            
-        if not os.path.exists(MSA_fname):
-            print >> sys.stderr, "Warning: %s does not exist" % MSA_fname
-            continue
-
-        names, seqs = read_MSF_file(MSA_fname, left_ext_seq, right_ext_seq)
-        full_allele_names = set(names.keys())
-
-        # Identify a consensus sequence
-        assert len(seqs) > 0
-
-        # Check sequences are of equal length
-        def find_seq_len(seqs):
-            seq_lens = {}
-            for s in range(len(seqs)):
-                seq_len = len(seqs[s])
-                if seq_len not in seq_lens:
-                    seq_lens[seq_len] = 1
-                else:
-                    seq_lens[seq_len] += 1
-
-            max_seq_count = 0
-            for tmp_seq_len, tmp_seq_count in seq_lens.items():
-                if tmp_seq_count > max_seq_count:
-                    seq_len = tmp_seq_len
-                    max_seq_count = tmp_seq_count
-            return seq_len
-
-        seq_len = find_seq_len(seqs)        
-        backbone_name = "%s*BACKBONE" % gene
-        backbone_seq, backbone_freq = create_consensus_seq(seqs,
-                                                           seq_len,
-                                                           min_var_freq,
-                                                           not partial) # Remove empty sequences?
-        # Allele sequences can shrink, so readjust the sequence length
-        if not partial:
-            seq_len = find_seq_len(seqs)
-
-        if partial and base_fname == "hla":
-            partial_MSA_fname = "hisatgenotype_db/HLA/msf/%s_nuc.msf" % gene
-            if not os.path.exists(partial_MSA_fname):
-                print >> sys.stderr, "Warning: %s does not exist" % partial_MSA_fname
-                continue
-            partial_names, partial_seqs = read_MSF_file(partial_MSA_fname)
-
-            # DK - debugging purposes
-            # Partial alleles vs. Full alleles
-            """
-            counts = [0, 0, 0, 0]
-            for partial_name in partial_names.keys():
-                if partial_name in names:
-                    continue
-                name_group = partial_name.split(':')
-                for group_i in [3, 2, 1, 0]:
-                    if group_i == 0:
-                        counts[group_i] += 1
-                    if group_i > len(name_group):
-                        continue
-                    sub_name = ':'.join(name_group[:group_i])
-                    if sub_name in full_alleles:
-                        print partial_name, sub_name, full_alleles[sub_name][:5]
-                        counts[group_i] += 1
-                        break
-            print "DK: counts:", counts
-            sys.exit(1)
-            """
-                
-            ref_seq = seqs[names[ref_gene]]
-            ref_seq_map = create_map(ref_seq)
-            ref_partial_seq = partial_seqs[partial_names[ref_gene]]
-            ref_partial_seq_map = create_map(ref_partial_seq)
-            exons = gene_exons[gene]
-            exon_len = 0
-            ref_exons = [] # converted exons to MSF file (e.g. A_gen.msf)
-            ref_partial_exons = [] # converted exons to MSF file (e.g. A_nuc.msf)
-
-            complete = True
-            for exon in exons:
-                left, right = exon
-                ref_exons.append([ref_seq_map[left], ref_seq_map[right]])
-                next_exon_len = right - left + exon_len
-                if next_exon_len >= len(ref_partial_seq_map):
-                    print >> sys.stderr, "Warning: partial sequences (%s) seem to be incomplete" % gene
-                    complete = False
-                    break
-                ref_partial_exons.append([ref_partial_seq_map[exon_len], ref_partial_seq_map[next_exon_len]])
-                exon_len += (right - left + 1)
-                # Make sure two MSF files (e.g. A_gen.msf and A_nuc.msf) share the same MSF lengths in the exonic sequences
-                ref_exon_len = ref_exons[-1][1] - ref_exons[-1][0] + 1
-                ref_partial_exon_len = ref_partial_exons[-1][1] - ref_partial_exons[-1][0] + 1
-                assert ref_exon_len == ref_partial_exon_len
-
-            if complete:
-                partial_seq_len = find_seq_len(partial_seqs)
-                partial_backbone_seq, partial_backbone_freq = create_consensus_seq(partial_seqs,
-                                                                                   partial_seq_len,
-                                                                                   min_var_freq,
-                                                                                   False) # Remove empty sequences?
-                for name, seq_id in partial_names.items():
-                    if name in names:
-                        continue
-                    seq = partial_seqs[seq_id]
-                    new_seq = ""
-                    right = 0
-                    for e in range(len(exons)):
-                        ref_exon = ref_exons[e]
-                        ref_partial_exon = ref_partial_exons[e]
-                        new_seq += backbone_seq[right:ref_exon[0]]
-                        exon_seq = seq[ref_partial_exon[0]:ref_partial_exon[1] + 1]
-                        nt_exon_seq = exon_seq.replace('.', '')
-                        if len(nt_exon_seq) == 0:
-                            exon_seq = partial_backbone_seq[ref_partial_exon[0]:ref_partial_exon[1] + 1]
-                        new_seq += exon_seq
-                        right = ref_exon[1] + 1
-                    new_seq += backbone_seq[right:]
-                    names[name] = len(seqs)
-                    seqs.append(new_seq)
-
-                backbone_seq, backbone_freq = create_consensus_seq(seqs,
-                                                                   seq_len,
-                                                                   min_var_freq,
-                                                                   True) # Remove empty sequences?
-                seq_len = find_seq_len(seqs)
-                
-        if min_var_freq <= 0.0:
-            assert '.' not in backbone_seq and 'E' not in backbone_seq
-        
-        # Reverse complement MSF if this gene is on '-' strand
-        if strand == '-':
-            # Reverse exons
-            ref_seq = seqs[names[ref_gene]]
-            ref_seq = ref_seq.replace('.', '')
-            ref_seq_len = len(ref_seq)
-            if base_fname == "hla":
-                exons = []
-                for left, right in reversed(gene_exons[gene]):
-                    left, right = ref_seq_len - right - 1, ref_seq_len - left - 1
-                    exons.append([left, right])
-                gene_exons[gene] = exons
-                exon_counts = {}
-                for exon_i, count in gene_exon_counts[gene].items():
-                    exon_counts[len(gene_exons[gene]) - exon_i - 1] = count
-                gene_exon_counts[gene] = exon_counts
-
-            for i in range(len(seqs)):
-                seqs[i] = typing_common.reverse_complement(seqs[i])
-            backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True)
-
-        if leftshift:
-            for seq_i in range(len(seqs)):
-                seqs[seq_i] = leftshift_deletions(backbone_seq, seqs[seq_i])
-            backbone_seq, backbone_freq = create_consensus_seq(seqs, seq_len, min_var_freq, True)
-            seq_len = find_seq_len(seqs)
-
-        print >> sys.stderr, "%s: number of HLA alleles is %d." % (gene, len(names))
-
-        Vars = {}
-        for cmp_name, id in names.items():
-            if cmp_name == backbone_name:
-                continue
-            assert id < len(seqs)
-            cmp_seq = seqs[id]
-            if len(cmp_seq) != seq_len:
-                print >> sys.stderr, "Warning: the length of %s (%d) is different from %d" % \
-                    (cmp_name, len(cmp_seq), seq_len)
-                continue
-
-            # DK - debugging purposes
-            """
-            if cmp_name == "A*03:01:07":
-                print cmp_name
-                cmp_seq2 = seqs[names["A*32:29"]]
-                for s in range(0, seq_len, 100):
-                    print s, backbone_seq[s:s+100]
-                    print s, cmp_seq2[s:s+100]
-                    print s, cmp_seq[s:s+100]
-                # sys.exit(1)
-            """
-            def insertVar(type, info):
-                pos, backbone_pos, data = info
-                if type in "MI":
-                    varKey = "%d-%s-%s" % (pos, type, data)
-                else:
-                    varKey = "%d-%s-%d" % (pos, type, data)
-
-                if varKey not in Vars:
-                    if type == 'M':
-                        assert backbone_pos < backbone_freq
-                        assert data in backbone_freq[backbone_pos]
-                        freq = backbone_freq[backbone_pos][data]
-                    elif type == 'D':
-                        del_len = int(data)
-                        freq = 100.0
-                        assert backbone_pos + del_len <= backbone_freq
-                        for d in range(del_len):
-                            assert '.' in backbone_freq[backbone_pos + d]
-                            freq2 = backbone_freq[backbone_pos + d]['.']
-                            if freq2 < freq:
-                                freq = freq2
-                    else:
-                        assert type == 'I'
-                        ins_len = len(data)
-                        freq = 100.0
-                        assert backbone_pos + ins_len <= backbone_freq
-                        for i in range(ins_len):
-                            nt = data[i]
-                            assert nt in backbone_freq[backbone_pos + i]
-                            freq2 = backbone_freq[backbone_pos + i][nt]
-                            if freq2 < freq:
-                                freq = freq2
-                        assert freq <= min_var_freq
-                    
-                    Vars[varKey] = [freq, [cmp_name]]
-                else:
-                    Vars[varKey][1].append(cmp_name)
-
-            insertion, deletion = [], []
-            ndots = 0
-            for s in range(seq_len):
-                assert not (insertion and deletion)
-                bc = backbone_seq[s]
-                cc = cmp_seq[s]
-                if bc != '.' and cc != '.':
-                    if insertion:
-                        insertVar('I', insertion)
-                        insertion = []
-                    elif deletion:
-                        insertVar('D', deletion)
-                        deletion = []
-                    if bc != cc:
-                        mismatch = [s - ndots, s, cc]
-                        insertVar('M', mismatch)
-                elif bc == '.' and cc != '.':
-                    if deletion:
-                        insertVar('D', deletion)
-                        deletion = []
-                    if insertion:
-                        insertion[2] += cc
-                    else:
-                        insertion = [s - ndots, s, cc]
-                elif bc != '.' and cc == '.':
-                    if insertion:
-                        insertVar('I', insertion)
-                        insertion = []
-                    if deletion:
-                        deletion[2] += 1
-                    else:
-                        deletion = [s - ndots, s, 1]
-
-                if bc == '.':
-                    ndots += 1
-
-                """
-                if backbone_seq[s] != cmp_seq[s]:
-                    print "%s is different %s at %d: %s vs. %s" % \
-                        (backbone_name, cmp_name, s+1, backbone_seq[s], cmp_seq[s])
-                """
-
-            if insertion:
-                insertVar('I', insertion)
-            elif deletion:
-                insertVar('D', deletion)
-
-
-        print >> sys.stderr, "Number of variants is %d." % (len(Vars.keys()))
-
-        # Compare variants
-        def cmp_varKey(a, b):
-            a_locus, a_type, a_data = a.split('-')
-            b_locus, b_type, b_data = b.split('-')
-            a_locus, b_locus = int(a_locus), int(b_locus)
-            if a_locus != b_locus:
-                return a_locus - b_locus
-            if a_type != b_type:
-                if a_type == 'I':
-                    return -1
-                elif b_type == 'I':
-                    return 1
-                elif a_type == 'M':
-                    return -1
-                else:
-                    assert b_type == 'M'
-                    return 1
-            assert a_data != b_data
-            if a_type in "MI":
-                if a_data < b_data:
-                    return -1
-                else:
-                    return 1
-            else:
-                assert a_type == 'D'
-                return int(a_data) - int(b_data)            
-
-        Vars_ = {}
-        for key, values in Vars.items():
-            freq, names_ = values
-            for name in names_:
-                if not name in Vars_:
-                    Vars_[name] = [key]
-                else:
-                    Vars_[name].append(key)
-        for name, vars in Vars_.items():
-            Vars_[name] = sorted(vars, cmp=cmp_varKey)
-
-        # Sanity check -
-        #    (1) Reconstruct the other sequences from the backbone sequence and variants and
-        #    (2) Confirm these constructed sequences are the same as those input sequences.
-        for cmp_name, id in names.items():
-            if cmp_name == backbone_name:
-                continue
-
-            constr_seq = backbone_seq.replace('.', '')
-            constr_seq = list(constr_seq)
-            locus_diff = 0
-
-            if cmp_name not in Vars_:
-                continue
-            
-            for var in Vars_[cmp_name]:
-                try:
-                    locus, type, data = var.split('-')
-                    locus = int(locus)
-                except ValueError:
-                    continue
-
-                if type == 'M':
-                    assert len(data) == 1
-                    constr_seq[locus + locus_diff] = data[0]
-                elif type == 'I':
-                    assert locus + locus_diff >= 0
-                    assert locus + locus_diff <= len(constr_seq)
-                    constr_seq = constr_seq[:locus + locus_diff] + list(data) + constr_seq[locus + locus_diff:]
-                    locus_diff += len(data)
-                else:
-                    assert type == 'D'
-                    assert locus + locus_diff + len(data) <= len(constr_seq)
-                    assert locus + locus_diff >= 0
-                    del_len = int(data)
-                    constr_seq = constr_seq[:locus + locus_diff] + constr_seq[locus + locus_diff + del_len:]
-                    locus_diff -= del_len
-
-            constr_seq = "".join(constr_seq)
-            assert id < len(seqs)
-            cmp_seq = seqs[id].replace('.', '')
-            if len(constr_seq) != len(cmp_seq):
-                print >> sys.stderr, "Error: reconstruction fails (%s)! Lengths different: %d vs. %d" % \
-                    (cmp_name, len(constr_seq), len(cmp_seq))
-                assert False
-
-            # Sanity check
-            for s in range(len(constr_seq)):
-                if constr_seq[s] != cmp_seq[s]:
-                    print >> sys.stderr, "Differ at %d: %s vs. %s (reconstruction vs. original)" % \
-                        (s, constr_seq[s], cmp_seq[s])
-                    print "%s:%s vs. %s:%s" % \
-                        (constr_seq[s-10:s], constr_seq[s:s+10], cmp_seq[s-10:s], cmp_seq[s:s+10])
-
-            if constr_seq != cmp_seq.replace('.', ''):
-                print >> sys.stderr, "Error: reconstruction fails for %s" % (cmp_name)
-                assert False
-
-        # Write the backbone sequences into a fasta file
-        print >> backbone_file, ">%s" % (backbone_name)
-        backbone_seq_ = backbone_seq.replace('.', '')
-        for s in range(0, len(backbone_seq_), 60):
-            print >> backbone_file, backbone_seq_[s:s+60]
-
-        # Remap the backbone allele, which is sometimes slighly different from
-        #   fasta version
-        ref_backbone_id = names[ref_gene]
-        ref_backbone_seq = seqs[ref_backbone_id]
-        aligner_cmd = ["hisat2"]
-        if base_fname == "hla":
-            aligner_cmd += ["--score-min", "C,0"]
-        aligner_cmd += ["--no-unal",
-                        "-x", "grch38/genome",
-                        "-f", 
-                        "-c", "%s" % ref_backbone_seq.replace('.', '')]
-        align_proc = subprocess.Popen(aligner_cmd,
-                                      stdout=subprocess.PIPE,
-                                      stderr=open("/dev/null", 'w'))
-        best_chr, best_left, best_right, best_AS = "", 0, 0, -sys.maxint
-        for line in align_proc.stdout:
-            if line.startswith('@'):
-                continue
-            line = line.strip()
-            cols = line.split()
-            allele_id, flag, chr, left, mapQ, cigar_str = cols[:6]
-            flag = int(flag)
-            assert flag & 0x10 == 0
-            left = int(left) - 1
-            right = left
-            AS = ""
-            for i in range(11, len(cols)):
-                col = cols[i]
-                if col.startswith("AS"):
-                    AS = col[5:]
-            AS = int(AS)
-            cigars = cigar_re.findall(cigar_str)
-            cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-            for i in range(len(cigars)):
-                cigar_op, length = cigars[i]
-                if cigar_op in "MND":
-                    right += length
-            if AS > best_AS:
-                best_chr, best_left, best_right, best_AS = chr, left, right, AS
-
-        chr, left, right = best_chr, best_left, best_right
-        align_proc.communicate()
-        if left == right:
-            print >> sys.stderr, "Warning: %s (%s) is not remapped" % (gene, ref_gene)
-            continue
-        assert left < right
-
-        base_locus = 0                
-        ref_seq = seqs[names[ref_gene]]
-        ref_seq_map = create_map(ref_seq)
-
-        del_count = []
-        for nt in backbone_seq:
-            assert nt in "ACGT."
-            add = 1 if nt == '.' else 0
-            if len(del_count) == 0:
-                del_count.append(add)
-            else:
-                del_count.append(del_count[-1] + add)
-        
-        if base_fname == "hla":
-            exon_str = ""
-            for exon_i in range(len(gene_exons[gene])):
-                exon_left, exon_right = gene_exons[gene][exon_i]
-                exon_left, exon_right = ref_seq_map[exon_left], ref_seq_map[exon_right]
-                exon_left -= del_count[exon_left]
-                exon_right -= del_count[exon_right]
-                if exon_str != "":
-                    exon_str += ','
-                primary = gene_exon_counts[gene][exon_i] == max(gene_exon_counts[gene].values())
-                exon_str += ("%d-%d%s" % (exon_left, exon_right, 'p' if primary else ''))
-
-            # Sanity check for exonic sequence
-            sanity_check = True
-            if sanity_check and \
-               os.path.exists("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene):
-                exons_ = []
-                for exon in exon_str.split(','):
-                    if exon.endswith('p'):
-                        exon = exon[:-1]
-                    exon_left, exon_right = exon.split('-')
-                    exon_left, exon_right = int(exon_left), int(exon_right)
-                    exons_.append([exon_left, exon_right])
-
-                backbone_seq_ = backbone_seq.replace('.', '')
-                if ref_gene in Vars_:
-                    vars_ = Vars_[ref_gene]
-                else:
-                    vars_ = []
-                seq_ = list(backbone_seq_)
-                has_insertion = False
-                for var_ in vars_:
-                    var_pos, var_type, var_data = var_.split('-')
-                    var_pos = int(var_pos)
-                    assert var_pos >= 0 and var_pos < len(backbone_seq_)
-                    if var_type == 'M':
-                        seq_[var_pos] = var_data
-                    elif var_type == 'D':
-                        del_len = int(var_data)
-                        assert var_pos + del_len <= len(ref_seq)
-                        seq_[var_pos:var_pos + del_len] = ['.'] * del_len
-                    else:
-                        assert var_type == 'I'
-                        has_insertion = True
-
-                seq_ = ''.join(seq_)
-                exon_seq_ = ""
-                for exon_left, exon_right in exons_:
-                    exon_seq_ += seq_[exon_left:exon_right+1]
-                exon_seq_ = exon_seq_.replace('.', '')
-                if gene_strand[gene] == '-':
-                    exon_seq_ = typing_common.reverse_complement(exon_seq_)
-
-                cmp_exon_seq_, allele_name_ = "", ""
-                for line in open("hisatgenotype_db/HLA/fasta/%s_nuc.fasta" % gene):
-                    if line.startswith(">"):
-                        if allele_name_ == ref_gene:
-                            break
-                        allele_name_ = line.strip().split()[1]
-                        cmp_exon_seq_ = ""
-                    else:
-                        cmp_exon_seq_ += line.strip()
-                """
-                print "Has insertions:", has_insertion
-                print "constructed:", len(exon_seq_)
-                for p in range(0, len(exon_seq_), 60):
-                    print exon_seq_[p:p+60]
-                print "true:", len(cmp_exon_seq_)
-                for p in range(0, len(cmp_exon_seq_), 60):
-                    print cmp_exon_seq_[p:p+60]
-                """
-                if exon_seq_ != cmp_exon_seq_:
-                    print >> sys.stderr, "Waring: exonic sequences do not match (%s)" % gene
-        else:
-            exon_str = "%d-%d" % (left, right - 1)
-
-        print >> locus_file, "%s\t%s\t%d\t%d\t%d\t%s\t%s" % \
-            (backbone_name, chr, left, right - 1, len(backbone_seq.replace('.', '')), exon_str, gene_strand[gene])
-
-        # Write
-        #       (1) variants w.r.t the backbone sequences into a SNP file
-        #       (2) pairs of a variant and the corresponding HLA allels into a LINK file    
-        keys = sorted(Vars.keys(), cmp=cmp_varKey)
-        var2ID = {}
-        for k in range(len(keys)):
-            locus, type, data = keys[k].split('-')
-            locus = int(locus)
-            if type == 'M':
-                type_str = "single"
-            elif type == 'I':
-                type_str = "insertion"
-            else:
-                assert type == 'D'
-                type_str = "deletion"
-
-            freq, names_ = Vars[keys[k]]
-            names_ = sorted(names_)            
-            varID = "hv%d" % (num_vars)
-            tmp_backbone_name = backbone_name
-            print >> var_file, "%s\t%s\t%s\t%d\t%s" % \
-                (varID, type_str, tmp_backbone_name, base_locus + locus, data)
-            if freq >= min_var_freq:
-                print >> var_index_file, "%s\t%s\t%s\t%d\t%s" % \
-                    (varID, type_str, tmp_backbone_name, base_locus + locus, data)
-            print >> var_freq_file, "%s\t%.2f" % (varID, freq)
-            print >> link_file, "%s\t%s" % (varID, ' '.join(names_))
-            var2ID[keys[k]] = num_vars
-            num_vars += 1
-
-        add_seq_len = 0
-        # Write haplotypes
-        excluded_vars = set()
-        var_leftmost, var_rightmost = sys.maxint, -1
-        for k in range(len(keys)):
-            key = keys[k]
-            if Vars[key][0] < min_var_freq:
-                excluded_vars.add(key)
-
-            # Update leftmost and rightmost of Vars
-            locus, type, data = key.split('-')
-            left = right = int(locus)
-            if type == 'D':
-                right = left + int(data) - 1
-            if k == 0:
-                var_leftmost = left
-            if var_rightmost < right:
-                var_rightmost = right
-
-        i = 0
-        while i < len(keys):
-            key_i = keys[i]
-            locus, type, data = key_i.split('-')
-            locus = int(locus)
-            if type == 'D':
-                locus += (int(data) - 1)
-            prev_locus = locus
-            if whole_haplotype:
-                j = len(keys)
-            else:
-                j = i + 1
-                while j < len(keys):
-                    key_j = keys[j]
-                    locus2, type2, data2 = key_j.split('-')
-                    locus2 = int(locus2)
-                    if prev_locus + inter_gap < locus2:
-                        break
-                    prev_locus = locus2
-                    if type == 'D':
-                        prev_locus += (int(data) - 1)
-                    j += 1
-
-            alleles = set()
-            for k in range(i, j):
-                key_k = keys[k]
-                freq, names_ = Vars[key_k]
-                if freq < min_var_freq:
-                    continue
-                add_alleles = set(names_)
-                alleles |= add_alleles
-
-            haplotypes = set()
-            cur_vars = set(keys[i:j]) - excluded_vars
-            for allele in alleles:
-                allele_vars = set(Vars_[allele]) - excluded_vars
-                allele_cur_vars = '#'.join(sorted(list(cur_vars & allele_vars), cmp=cmp_varKey))
-                haplotypes.add(allele_cur_vars)
-
-            # Split some haplotypes that include large gaps inside
-            def split_haplotypes(haplotypes):
-                split_haplotypes = set()
-                for haplotype in haplotypes:
-                    haplotype = haplotype.split('#')
-                    assert len(haplotype) > 0
-                    if len(haplotype) == 1:
-                        split_haplotypes.add(haplotype[0])
-                        continue
-                    prev_s, s = 0, 1
-                    while s < len(haplotype):
-                        prev_locus, prev_type, prev_data = haplotype[s-1].split('-')
-                        locus, type, data = haplotype[s].split('-')
-                        prev_locus, locus = int(prev_locus), int(locus)
-                        if prev_type == 'D':
-                            prev_locus += (int(prev_data) - 1)
-                        if prev_locus + intra_gap < locus:
-                            split_haplotypes.add('#'.join(haplotype[prev_s:s]))
-                            prev_s = s
-                        s += 1
-                        if s == len(haplotype):
-                            split_haplotypes.add('#'.join(haplotype[prev_s:s]))
-                return split_haplotypes
-
-            if not whole_haplotype:
-                haplotypes = split_haplotypes(haplotypes)
-
-            def cmp_haplotype(a, b):
-                a = a.split('#')
-                a1_locus, _, _ = a[0].split('-')
-                a2_locus, a2_type, a2_data = a[-1].split('-')
-                a_begin, a_end = int(a1_locus), int(a2_locus)
-                if a2_type == 'D':
-                    a_end += (int(a2_data) - 1)
-                b = b.split('#')
-                b1_locus, _, _ = b[0].split('-')
-                b2_locus, b2_type, b2_data = b[-1].split('-')
-                b_begin, b_end = int(b1_locus), int(b2_locus)
-                if b2_type == 'D':
-                    b_end += (int(b2_data) - 1)
-                if a_begin != b_begin:
-                    return a_begin - b_begin
-                return a_end - b_end
-
-            haplotypes = sorted(list(haplotypes), cmp=cmp_haplotype)
-            
-            # DK - for debugging purposes
-            """
-            dis = prev_locus - locus
-            print "\n[%d, %d]: %d haplotypes" % (i, j, len(haplotypes)), dis
-            if len(cur_vars) in range(0, 1000):
-                # print "vars:", sorted(list(cur_vars), cmp=cmp_varKey
-                print "num:", len(haplotypes)
-                for haplotype in haplotypes:
-                    print haplotype.split('#')
-                print "\nnum:", len(haplotypes2)
-                for haplotype in haplotypes2:
-                    print haplotype.split('#')
-            """
-
-            # Write haplotypes
-            sanity_vars = set()
-            for h_i in range(len(haplotypes)):
-                h = haplotypes[h_i].split('#')
-                varIDs = []
-                for var in h:
-                    varIDs.append("hv%s" % var2ID[var])
-                    # DK - for debugging purposes
-                    # varIDs.append(var)
-                    sanity_vars.add(var2ID[var])
-                if whole_haplotype:
-                    h_begin, h_end = var_leftmost, var_rightmost
-                else:
-                    h1_locus, _, _ = h[0].split('-')
-                    h2_locus, h2_type, h2_data = h[-1].split('-')
-                    h_begin, h_end = int(h1_locus), int(h2_locus)
-                    if h2_type == 'D':
-                        h_end += (int(h2_data) - 1)
-                    assert h_begin <= h_end
-                    h_new_begin = h_begin
-                    for h_j in reversed(range(0, h_i)):
-                        hc = haplotypes[h_j].split('#')
-                        hc_begin, hc_type, hc_data = hc[-1].split('-')
-                        hc_begin = int(hc_begin)
-                        hc_end = hc_begin
-                        if hc_type == 'D':
-                            hc_end += (int(hc_data) - 1)
-                        if hc_end + inter_gap < h_begin:
-                            break
-                        if h_new_begin > hc_end:
-                            h_new_begin = hc_end
-                    assert h_new_begin <= h_begin
-                    h_begin = h_new_begin
-                tmp_backbone_name = backbone_name
-                print >> haplotype_file, "ht%d\t%s\t%d\t%d\t%s" % \
-                    (num_haplotypes, tmp_backbone_name, base_locus + h_begin, base_locus + h_end, ','.join(varIDs))
-                num_haplotypes += 1
-                add_seq_len += (h_end - h_begin + 1)
-            assert len(sanity_vars) == len(cur_vars)
-                    
-            i = j
-
-        print >> sys.stderr, "Length of additional sequences for haplotypes:", add_seq_len
-                    
-        # Write all the sequences with dots removed into a file
-        for name, ID in names.items():
-            print >> input_file, ">%s" % (name)
-            assert ID < len(seqs)
-            seq = seqs[ID].replace('.', '')
-            for s in range(0, len(seq), 60):
-                print >> input_file, seq[s:s+60]
-            print >> allele_file, name
-
-                    
-        # Write partial allele names
-        for name in names:
-            if name not in full_allele_names:
-                print >> partial_file, name
-
-    backbone_file.close()
-    locus_file.close()
-    var_file.close()
-    var_index_file.close()
-    var_freq_file.close()
-    haplotype_file.close()
-    link_file.close()
-    input_file.close()
-    allele_file.close()
-    partial_file.close()
-   
-    
-        
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description="Extract variants from multiple sequence alignments")
-    parser.add_argument("-b", "--base",
-                        dest="base_fname",
-                        type=str,
-                        default="hla",
-                        help="base filename for backbone sequence, variants, and linking info (Default: hla)")
-    parser.add_argument("--locus-list",
-                        dest="locus_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of gene names (default: empty, all genes)")
-    parser.add_argument("--inter-gap",
-                        dest="inter_gap",
-                        type=int,
-                        default=30,
-                        help="Maximum distance for variants to be in the same haplotype (default: 30)")
-    parser.add_argument("--intra-gap",
-                        dest="intra_gap",
-                        type=int,
-                        default=50,
-                        help="Break a haplotype into several haplotypes (default: 50)")
-    parser.add_argument("--whole-haplotype",
-                        dest="whole_haplotype",
-                        action="store_true",
-                        help="Include partial alleles (e.g. A_nuc.fasta)")
-    parser.add_argument("--min-var-freq",
-                        dest="min_var_freq",
-                        type=float,
-                        default=0.0,
-                        help="Exclude variants whose freq is below than this value in percentage (Default: 0.0)")    
-    parser.add_argument("--ext-seq",
-                        dest="ext_seq_len",
-                        type=int,
-                        default=0,
-                        help="Length of extra sequences flanking backbone sequences (Default: 0)")
-    parser.add_argument("--leftshift",
-                        dest="leftshift",
-                        action="store_true",
-                        help="Shift deletions to the leftmost")
-    parser.add_argument("--no-partial",
-                        dest="partial",
-                        action="store_false",
-                        help="Exclude partial alleles, exon-only sequences in HLA")
-    parser.add_argument("-v", "--verbose",
-                        dest="verbose",
-                        action="store_true",
-                        help="also print some statistics to stderr")
-
-    args = parser.parse_args()
-    if args.locus_list == "":
-        locus_list = []
-    else:
-        locus_list = args.locus_list.split(',')
-    if args.inter_gap > args.intra_gap:
-        print >> sys.stderr, "Error: --inter-gap (%d) must be smaller than --intra-gap (%d)" % (args.inter_gap, args.intra_gap)
-        sys.exit(1)
-             
-    if args.base_fname.find('/') != -1:
-        elems = args.base_fname.split('/')
-        base_fname = elems[-1]
-        base_dname = '/'.join(elems[:-1])
-    else:
-        base_fname = args.base_fname
-        base_dname = ""
-        
-    extract_vars(base_fname,
-                 base_dname,
-                 locus_list,
-                 args.inter_gap,
-                 args.intra_gap,
-                 args.whole_haplotype,
-                 args.min_var_freq,
-                 args.ext_seq_len,
-                 args.leftshift,
-                 args.partial,
-                 args.verbose)
-
diff --git a/hisatgenotype_hla_cyp.py b/hisatgenotype_hla_cyp.py
deleted file mode 100755
index cd97eea9..00000000
--- a/hisatgenotype_hla_cyp.py
+++ /dev/null
@@ -1,1671 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect, random
-import math
-from argparse import ArgumentParser, FileType
-
-"""
-"""
-def simulate_reads(HLAs,
-                   test_HLA_list,
-                   simulate_interval):
-    HLA_reads_1, HLA_reads_2 = [], []
-    for test_HLA_names in test_HLA_list:
-        gene = test_HLA_names[0].split('*')[0]
-        # ref_allele = refHLAs[gene]
-        # ref_seq = HLAs[gene][ref_allele]
-
-        # Simulate reads from two HLA alleles
-        def simulate_reads_impl(seq, simulate_interval = 1, frag_len = 250, read_len = 100):
-            comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-            reads_1, reads_2 = [], []
-            for i in range(0, len(seq) - frag_len + 1, simulate_interval):
-                reads_1.append(seq[i:i+read_len])
-                tmp_read_2 = reversed(seq[i+frag_len-read_len:i+frag_len])
-                read_2 = ""
-                for s in tmp_read_2:
-                    if s in comp_table:
-                        read_2 += comp_table[s]
-                    else:
-                        read_2 += s
-                reads_2.append(read_2)
-            return reads_1, reads_2
-
-        for test_HLA_name in test_HLA_names:
-            HLA_seq = HLAs[gene][test_HLA_name]
-            tmp_reads_1, tmp_reads_2 = simulate_reads_impl(HLA_seq, simulate_interval)
-            HLA_reads_1 += tmp_reads_1
-            HLA_reads_2 += tmp_reads_2
-
-    # Write reads into a fasta read file
-    def write_reads(reads, idx):
-        read_file = open('hla_input_%d.fa' % idx, 'w')
-        for read_i in range(len(reads)):
-            print >> read_file, ">%d" % (read_i + 1)
-            print >> read_file, reads[read_i]
-        read_file.close()
-    write_reads(HLA_reads_1, 1)
-    write_reads(HLA_reads_2, 2)
-
-
-"""
-Align reads, and sort the alignments into a BAM file
-"""
-def align_reads(ex_path,
-                base_fname,
-                aligner,
-                index_type,
-                read_fname,
-                fastq,
-                threads,
-                verbose):
-    if aligner == "hisat2":
-        hisat2 = os.path.join(ex_path, "hisat2")
-        aligner_cmd = [hisat2,
-                       "--no-unal",
-                       "--mm"]
-        if index_type == "linear":
-            aligner_cmd += ["-k", "10"]
-        aligner_cmd += ["-x", "%s.%s" % (base_fname, index_type)]
-    elif aligner == "bowtie2":
-        aligner_cmd = [aligner,
-                       "--no-unal",
-                       "-k", "10",
-                       "-x", base_fname]
-    else:
-        assert False
-    assert len(read_fname) in [1,2]
-    aligner_cmd += ["-p", str(threads)]
-    if not fastq:
-        aligner_cmd += ["-f"]
-    if len(read_fname) == 1:
-        aligner_cmd += ["-U", read_fname[0]]
-    else:
-        aligner_cmd += ["-1", "%s" % read_fname[0],
-                        "-2", "%s" % read_fname[1]]
-
-    if verbose:
-        print >> sys.stderr, ' '.join(aligner_cmd)
-    align_proc = subprocess.Popen(aligner_cmd,
-                                  stdout=subprocess.PIPE,
-                                  stderr=open("/dev/null", 'w'))
-
-    sambam_cmd = ["samtools",
-                  "view",
-                  "-bS",
-                  "-"]
-    sambam_proc = subprocess.Popen(sambam_cmd,
-                                   stdin=align_proc.stdout,
-                                   stdout=open("hla_input_unsorted.bam", 'w'),
-                                   stderr=open("/dev/null", 'w'))
-    sambam_proc.communicate()
-    if index_type == "graph":
-        bamsort_cmd = ["samtools",
-                       "sort",
-                       "hla_input_unsorted.bam",
-                       "-o", "hla_input.bam"]
-        bamsort_proc = subprocess.Popen(bamsort_cmd,
-                                        stderr=open("/dev/null", 'w'))
-        bamsort_proc.communicate()
-
-        bamindex_cmd = ["samtools",
-                        "index",
-                        "hla_input.bam"]
-        bamindex_proc = subprocess.Popen(bamindex_cmd,
-                                         stderr=open("/dev/null", 'w'))
-        bamindex_proc.communicate()
-
-        os.system("rm hla_input_unsorted.bam")            
-    else:
-        os.system("mv hla_input_unsorted.bam hla_input.bam")
-
-
-"""
-""" 
-def normalize(prob):
-    total = sum(prob.values())
-    for allele, mass in prob.items():
-        prob[allele] = mass / total
-
-        
-"""
-"""
-def prob_diff(prob1, prob2):
-    diff = 0.0
-    for allele in prob1.keys():
-        if allele in prob2:
-            diff += abs(prob1[allele] - prob2[allele])
-        else:
-            diff += prob1[allele]
-    return diff
-
-
-"""
-"""
-def HLA_prob_cmp(a, b):
-    if a[1] != b[1]:
-        if a[1] < b[1]:
-            return 1
-        else:
-            return -1
-    assert a[0] != b[0]
-    if a[0] < b[0]:
-        return -1
-    else:
-        return 1
-
-
-"""
-"""
-def single_abundance(HLA_cmpt,
-                     HLA_length):
-    def normalize2(prob, length):
-        total = 0
-        for allele, mass in prob.items():
-            assert allele in length
-            total += (mass / length[allele])
-        for allele, mass in prob.items():
-            assert allele in length
-            prob[allele] = mass / length[allele] / total
-
-    HLA_prob, HLA_prob_next = {}, {}
-    for cmpt, count in HLA_cmpt.items():
-        alleles = cmpt.split('-')
-        for allele in alleles:
-            if allele not in HLA_prob:
-                HLA_prob[allele] = 0.0
-            HLA_prob[allele] += (float(count) / len(alleles))
-
-    # normalize2(HLA_prob, HLA_length)
-    normalize(HLA_prob)
-    def next_prob(HLA_cmpt, HLA_prob, HLA_length):
-        HLA_prob_next = {}
-        for cmpt, count in HLA_cmpt.items():
-            alleles = cmpt.split('-')
-            alleles_prob = 0.0
-            for allele in alleles:
-                assert allele in HLA_prob
-                alleles_prob += HLA_prob[allele]
-            for allele in alleles:
-                if allele not in HLA_prob_next:
-                    HLA_prob_next[allele] = 0.0
-                HLA_prob_next[allele] += (float(count) * HLA_prob[allele] / alleles_prob)
-        # normalize2(HLA_prob_next, HLA_length)
-        normalize(HLA_prob_next)
-        return HLA_prob_next
-
-    diff, iter = 1.0, 0
-    while diff > 0.0001 and iter < 1000:
-        HLA_prob_next = next_prob(HLA_cmpt, HLA_prob, HLA_length)
-        diff = prob_diff(HLA_prob, HLA_prob_next)
-        HLA_prob = HLA_prob_next
-        iter += 1
-    for allele, prob in HLA_prob.items():
-        allele_len = HLA_length[allele]
-        HLA_prob[allele] /= float(allele_len)
-    normalize(HLA_prob)
-    HLA_prob = [[allele, prob] for allele, prob in HLA_prob.items()]
-    HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-    return HLA_prob
-
-    
-"""
-"""
-def joint_abundance(HLA_cmpt,
-                    HLA_length):
-    allele_names = set()
-    for cmpt in HLA_cmpt.keys():
-        allele_names |= set(cmpt.split('-'))
-    
-    HLA_prob, HLA_prob_next = {}, {}
-    for cmpt, count in HLA_cmpt.items():
-        alleles = cmpt.split('-')
-        for allele1 in alleles:
-            for allele2 in allele_names:
-                if allele1 < allele2:
-                    allele_pair = "%s-%s" % (allele1, allele2)
-                else:
-                    allele_pair = "%s-%s" % (allele2, allele1)
-                if not allele_pair in HLA_prob:
-                    HLA_prob[allele_pair] = 0.0
-                HLA_prob[allele_pair] += (float(count) / len(alleles))
-
-    if len(HLA_prob) <= 0:
-        return HLA_prob
-
-    # Choose top allele pairs
-    def choose_top_alleles(HLA_prob):
-        HLA_prob_list = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-        HLA_prob_list = sorted(HLA_prob_list, cmp=HLA_prob_cmp)
-        HLA_prob = {}
-        best_prob = HLA_prob_list[0][1]
-        for i in range(len(HLA_prob_list)):
-            allele_pair, prob = HLA_prob_list[i]
-            if prob * 2 <= best_prob:
-                break                        
-            HLA_prob[allele_pair] = prob
-        normalize(HLA_prob)
-        return HLA_prob
-    HLA_prob = choose_top_alleles(HLA_prob)
-
-    def next_prob(HLA_cmpt, HLA_prob):
-        HLA_prob_next = {}
-        for cmpt, count in HLA_cmpt.items():
-            alleles = cmpt.split('-')
-            prob = 0.0
-            for allele in alleles:
-                for allele_pair in HLA_prob.keys():
-                    if allele in allele_pair:
-                        prob += HLA_prob[allele_pair]
-            for allele in alleles:
-                for allele_pair in HLA_prob.keys():
-                    if not allele in allele_pair:
-                        continue
-                    if allele_pair not in HLA_prob_next:
-                        HLA_prob_next[allele_pair] = 0.0
-                    HLA_prob_next[allele_pair] += (float(count) * HLA_prob[allele_pair] / prob)
-        normalize(HLA_prob_next)
-        return HLA_prob_next
-
-    diff, iter = 1.0, 0
-    while diff > 0.0001 and iter < 1000:
-        HLA_prob_next = next_prob(HLA_cmpt, HLA_prob)
-        diff = prob_diff(HLA_prob, HLA_prob_next)
-        HLA_prob = HLA_prob_next
-        HLA_prob = choose_top_alleles(HLA_prob)
-        iter += 1
-
-    HLA_prob = [[allele_pair, prob] for allele_pair, prob in HLA_prob.items()]
-    HLA_prob = sorted(HLA_prob, cmp=HLA_prob_cmp)
-    return HLA_prob
-
-
-"""
-"""
-def HLA_typing(ex_path,
-               base_fname,
-               simulation,
-               reference_type,
-               hla_list,
-               partial,
-               refHLAs,
-               HLAs,
-               HLA_names,
-               HLA_lengths,
-               refHLA_loci,
-               Vars,
-               Var_list,
-               Links,
-               exclude_allele_list,
-               aligners,
-               num_mismatch,
-               fastq,
-               read_fname,
-               alignment_fname,
-               threads,
-               enable_coverage,
-               best_alleles,
-               verbose):
-
-    def lower_bound(Var_list, pos):
-        low, high = 0, len(Var_list)
-        while low < high:
-            m = (low + high) / 2
-            m_pos = Var_list[m][0]
-            if m_pos < pos:
-                low = m + 1
-            elif m_pos > pos:
-                high = m
-            else:
-                assert m_pos == pos
-                while m > 0:
-                    if Var_list[m-1][0] < pos:
-                        break
-                    m -= 1
-                return m
-        return low        
-            
-    if simulation:
-        test_passed = {}
-    for aligner, index_type in aligners:
-        if index_type == "graph":
-            print >> sys.stderr, "\n\t\t%s %s on %s" % (aligner, index_type, reference_type)
-        else:
-            print >> sys.stderr, "\n\t\t%s %s" % (aligner, index_type)
-
-        if alignment_fname == "":
-            # Align reads, and sort the alignments into a BAM file
-            align_reads(ex_path,
-                        base_fname,
-                        aligner,
-                        index_type,
-                        read_fname,
-                        fastq,
-                        threads,
-                        verbose)
-            
-        for test_HLA_names in hla_list:
-            if simulation:
-                gene = test_HLA_names[0].split('*')[0]
-            else:
-                gene = test_HLA_names
-            
-            ref_allele = refHLAs[gene]
-            ref_seq = HLAs[gene][ref_allele]
-            ref_exons = refHLA_loci[gene][-1]
-
-            # Read alignments
-            alignview_cmd = ["samtools",
-                             "view"]
-            if alignment_fname == "":
-                alignview_cmd += ["hla_input.bam"]
-            else:
-                if not os.path.exists(alignment_fname + ".bai"):
-                    os.system("samtools index %s" % alignment_fname)
-                alignview_cmd += [alignment_fname]
-            base_locus = 0
-            if index_type == "graph":
-                if reference_type == "gene":
-                    alignview_cmd += ["%s" % ref_allele]
-                else:
-                    assert reference_type in ["chromosome", "genome"]
-                    _, chr, left, right, _ = refHLA_loci[gene]
-                    base_locus = left
-                    alignview_cmd += ["%s:%d-%d" % (chr, left + 1, right + 1)]
-
-                bamview_proc = subprocess.Popen(alignview_cmd,
-                                                stdout=subprocess.PIPE,
-                                                stderr=open("/dev/null", 'w'))
-
-                sort_read_cmd = ["sort", "-k", "1", "-n"]
-                alignview_proc = subprocess.Popen(sort_read_cmd,
-                                                  stdin=bamview_proc.stdout,
-                                                  stdout=subprocess.PIPE,
-                                                  stderr=open("/dev/null", 'w'))
-            else:
-                alignview_proc = subprocess.Popen(alignview_cmd,
-                                             stdout=subprocess.PIPE,
-                                             stderr=open("/dev/null", 'w'))
-
-            # Count alleles
-            HLA_counts, HLA_cmpt = {}, {}
-            coverage = [0 for i in range(len(ref_seq) + 1)]
-            num_reads, total_read_len = 0, 0
-            prev_read_id = None
-            prev_exon = False
-            if index_type == "graph":
-                # Cigar regular expression
-                cigar_re = re.compile('\d+\w')
-                for line in alignview_proc.stdout:
-                    cols = line.strip().split()
-                    read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-                    read_seq, qual = cols[9], cols[10]
-                    num_reads += 1
-                    total_read_len += len(read_seq)
-                    flag, pos = int(flag), int(pos)
-                    pos -= (base_locus + 1)
-                    if pos < 0:
-                        continue
-
-                    if flag & 0x4 != 0:
-                        continue
-
-                    NM, Zs, MD = "", "", ""
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("Zs"):
-                            Zs = col[5:]
-                        elif col.startswith("MD"):
-                            MD = col[5:]
-                        elif col.startswith("NM"):
-                            NM = int(col[5:])
-
-                    if NM > num_mismatch:
-                        continue
-
-                    # daehwan - for debugging purposes
-                    debug = False
-                    if read_id in ["2339"] and False:
-                        debug = True
-                        print "read_id: %s)" % read_id, pos, cigar_str, "NM:", NM, MD, Zs
-                        print "            ", read_seq
-
-                    vars = []
-                    if Zs:
-                        vars = Zs.split(',')
-
-                    assert MD != ""
-                    MD_str_pos, MD_len = 0, 0
-                    read_pos, left_pos = 0, pos
-                    right_pos = left_pos
-                    cigars = cigar_re.findall(cigar_str)
-                    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                    cmp_list = []
-                    for i in range(len(cigars)):
-                        cigar_op, length = cigars[i]
-                        if cigar_op == 'M':
-                            # Update coverage
-                            if enable_coverage:
-                                if right_pos + length < len(coverage):
-                                    coverage[right_pos] += 1
-                                    coverage[right_pos + length] -= 1
-                                elif right_pos < len(coverage):
-                                    coverage[right_pos] += 1
-                                    coverage[-1] -= 1
-
-                            first = True
-                            MD_len_used = 0
-                            while True:
-                                if not first or MD_len == 0:
-                                    if MD[MD_str_pos].isdigit():
-                                        num = int(MD[MD_str_pos])
-                                        MD_str_pos += 1
-                                        while MD_str_pos < len(MD):
-                                            if MD[MD_str_pos].isdigit():
-                                                num = num * 10 + int(MD[MD_str_pos])
-                                                MD_str_pos += 1
-                                            else:
-                                                break
-                                        MD_len += num
-                                # Insertion or full match followed
-                                if MD_len >= length:
-                                    MD_len -= length
-                                    cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
-                                    break
-                                first = False
-                                read_base = read_seq[read_pos + MD_len]
-                                MD_ref_base = MD[MD_str_pos]
-                                MD_str_pos += 1
-                                assert MD_ref_base in "ACGT"
-                                cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
-                                cmp_list.append(["mismatch", right_pos + MD_len, 1])
-                                MD_len_used = MD_len + 1
-                                MD_len += 1
-                                # Full match
-                                if MD_len == length:
-                                    MD_len = 0
-                                    break
-                        elif cigar_op == 'I':
-                            cmp_list.append(["insertion", right_pos, length])
-                        elif cigar_op == 'D':
-                            if MD[MD_str_pos] == '0':
-                                MD_str_pos += 1
-                            assert MD[MD_str_pos] == '^'
-                            MD_str_pos += 1
-                            while MD_str_pos < len(MD):
-                                if not MD[MD_str_pos] in "ACGT":
-                                    break
-                                MD_str_pos += 1
-                            cmp_list.append(["deletion", right_pos, length])
-                        elif cigar_op == 'S':
-                            cmp_list.append(["soft", right_pos, length])
-                        else:                    
-                            assert cigar_op == 'N'
-                            cmp_list.append(["intron", right_pos, length])
-
-                        if cigar_op in "MND":
-                            right_pos += length
-
-                        if cigar_op in "MIS":
-                            read_pos += length
-
-                    exon = False
-                    for exon in ref_exons:
-                        exon_left, exon_right = exon
-                        if right_pos <= exon_left or pos > exon_right:
-                            continue
-                        else:
-                            exon = True
-                            break
-
-                    if right_pos > len(ref_seq):
-                        continue
-
-                    def add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, exon = True):
-                        max_count = max(HLA_count_per_read.values())
-                        cur_cmpt = set()
-                        for allele, count in HLA_count_per_read.items():
-                            if count < max_count:
-                                continue
-                            if allele in exclude_allele_list:
-                                continue                                
-                            cur_cmpt.add(allele)                    
-                            if not allele in HLA_counts:
-                                HLA_counts[allele] = 1
-                            else:
-                                HLA_counts[allele] += 1
-
-                        if len(cur_cmpt) == 0:
-                            return
-
-                        # daehwan - for debugging purposes                            
-                        alleles = ["", ""]
-                        # alleles = ["B*40:304", "B*40:02:01"]
-                        allele1_found, allele2_found = False, False
-                        for allele, count in HLA_count_per_read.items():
-                            if count < max_count:
-                                continue
-                            if allele == alleles[0]:
-                                allele1_found = True
-                            elif allele == alleles[1]:
-                                allele2_found = True
-                        if allele1_found != allele2_found:
-                            print alleles[0], HLA_count_per_read[alleles[0]]
-                            print alleles[1], HLA_count_per_read[alleles[1]]
-                            if allele1_found:
-                                print ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, HLA_count_per_read[alleles[1]]))
-                            else:
-                                print ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, HLA_count_per_read[alleles[0]]))
-                            print read_seq
-
-                        cur_cmpt = sorted(list(cur_cmpt))
-                        cur_cmpt = '-'.join(cur_cmpt)
-                        add = 1
-                        if partial and not exon:
-                            add *= 0.2
-                        if not cur_cmpt in HLA_cmpt:
-                            HLA_cmpt[cur_cmpt] = add
-                        else:
-                            HLA_cmpt[cur_cmpt] += add
-
-                    if read_id != prev_read_id:
-                        if prev_read_id != None:
-                            add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read, prev_exon)
-
-                        HLA_count_per_read = {}
-                        for HLA_name in HLA_names[gene]:
-                            if HLA_name.find("BACKBONE") != -1:
-                                continue
-                            HLA_count_per_read[HLA_name] = 0
-
-                    def add_count(var_id, add):
-                        assert var_id in Links
-                        alleles = Links[var_id]
-                        for allele in alleles:
-                            if allele.find("BACKBONE") != -1:
-                                continue
-                            HLA_count_per_read[allele] += add
-                            # daehwan - for debugging purposes
-                            if debug:
-                                if allele in ["DQA1*05:05:01:01", "DQA1*05:05:01:02"]:
-                                    print allele, add, var_id
-
-                    # Decide which allele(s) a read most likely came from
-                    # also sanity check - read length, cigar string, and MD string
-                    for var_id, data in Vars[gene].items():
-                        var_type, var_pos, var_data = data
-                        if var_type != "deletion":
-                            continue
-                        if left_pos >= var_pos and right_pos <= var_pos + int(var_data):
-                            add_count(var_id, -1)                            
-                    ref_pos, read_pos, cmp_cigar_str, cmp_MD = left_pos, 0, "", ""
-                    cigar_match_len, MD_match_len = 0, 0            
-                    for cmp in cmp_list:
-                        type = cmp[0]
-                        length = cmp[2]
-                        if type == "match":
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos + length <= var_pos:
-                                    break
-                                if ref_pos <= var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "insertion":
-                                        if ref_pos < var_pos and ref_pos + length > var_pos + len(var_data):
-                                            add_count(var_id, -1)
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, Links[var_id]
-                                    elif var_type == "deletion":
-                                        del_len = int(var_data)
-                                        if ref_pos < var_pos and ref_pos + length > var_pos + del_len:
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, Links[var_id], -1, Vars[gene][var_id]
-                                            # Check if this might be one of the two tandem repeats (the same left coordinate)
-                                            cmp_left, cmp_right = cmp[1], cmp[1] + cmp[2]
-                                            test1_seq1 = ref_seq[cmp_left:cmp_right]
-                                            test1_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos + del_len:cmp_right + del_len]
-                                            # Check if this happens due to small repeats (the same right coordinate - e.g. 19 times of TTTC in DQA1*05:05:01:02)
-                                            cmp_left -= read_pos
-                                            cmp_right += (len(read_seq) - read_pos - cmp[2])
-                                            test2_seq1 = ref_seq[cmp_left+int(var_data):cmp_right]
-                                            test2_seq2 = ref_seq[cmp_left:var_pos] + ref_seq[var_pos+int(var_data):cmp_right]
-                                            if test1_seq1 != test1_seq2 and test2_seq1 != test2_seq2:
-                                                add_count(var_id, -1)
-                                    else:
-                                        if debug:
-                                            print cmp, var_id, Links[var_id], -1
-                                        add_count(var_id, -1)
-                                var_idx += 1
-
-                            read_pos += length
-                            ref_pos += length
-                            cigar_match_len += length
-                            MD_match_len += length
-                        elif type == "mismatch":
-                            read_base = read_seq[read_pos]
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos < var_pos:
-                                    break
-                                if ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "single":
-                                        if var_data == read_base:
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, 1, var_data, read_base, Links[var_id]
-
-                                            # daehwan - for debugging purposes
-                                            if False:
-                                                read_qual = ord(qual[read_pos])
-                                                add_count(var_id, (read_qual - 60) / 60.0)
-                                            else:
-                                                add_count(var_id, 1)
-                                        # daehwan - check out if this routine is appropriate
-                                        # else:
-                                        #    add_count(var_id, -1)
-                                var_idx += 1
-
-                            cmp_MD += ("%d%s" % (MD_match_len, ref_seq[ref_pos]))
-                            MD_match_len = 0
-                            cigar_match_len += 1
-                            read_pos += 1
-                            ref_pos += 1
-                        elif type == "insertion":
-                            ins_seq = read_seq[read_pos:read_pos+length]
-                            var_idx = lower_bound(Var_list[gene], ref_pos)
-                            # daehwan - for debugging purposes
-                            if debug:
-                                print left_pos, cigar_str, MD, vars
-                                print ref_pos, ins_seq, Var_list[gene][var_idx], Vars[gene][Var_list[gene][var_idx][1]]
-                                # sys.exit(1)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if ref_pos < var_pos:
-                                    break
-                                if ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "insertion":                                
-                                        if var_data == ins_seq:
-                                            # daehwan - for debugging purposes
-                                            if debug:
-                                                print cmp, var_id, 1, Links[var_id]
-                                            add_count(var_id, 1)
-                                var_idx += 1
-
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            read_pos += length
-                            cmp_cigar_str += ("%dI" % length)
-                        elif type == "deletion":
-                            del_len = length
-                            # Deletions can be shifted bidirectionally
-                            temp_ref_pos = ref_pos
-                            while temp_ref_pos > 0:
-                                last_bp = ref_seq[temp_ref_pos + del_len - 1]
-                                prev_bp = ref_seq[temp_ref_pos - 1]
-                                if last_bp != prev_bp:
-                                    break
-                                temp_ref_pos -= 1
-                            var_idx = lower_bound(Var_list[gene], temp_ref_pos)
-                            while var_idx < len(Var_list[gene]):
-                                var_pos, var_id = Var_list[gene][var_idx]
-                                if temp_ref_pos < var_pos:
-                                    first_bp = ref_seq[temp_ref_pos]
-                                    next_bp = ref_seq[temp_ref_pos + del_len]
-                                    if first_bp == next_bp:
-                                        temp_ref_pos += 1
-                                        continue
-                                    else:
-                                        break
-                                if temp_ref_pos == var_pos:
-                                    var_type, _, var_data = Vars[gene][var_id]
-                                    if var_type == "deletion":
-                                        var_len = int(var_data)
-                                        if var_len == length:
-                                            if debug:
-                                                print cmp, var_id, 1, Links[var_id]
-                                                print ref_seq[var_pos - 10:var_pos], ref_seq[var_pos:var_pos+int(var_data)], ref_seq[var_pos+int(var_data):var_pos+int(var_data)+10]
-                                            add_count(var_id, 1)
-                                var_idx += 1
-
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            cmp_MD += ("%d" % MD_match_len)
-                            MD_match_len = 0
-                            cmp_cigar_str += ("%dD" % length)
-                            cmp_MD += ("^%s" % ref_seq[ref_pos:ref_pos+length])
-                            ref_pos += length
-                        elif type == "soft":
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            read_pos += length
-                            cmp_cigar_str += ("%dS" % length)
-                        else:
-                            assert type == "intron"
-                            if cigar_match_len > 0:
-                                cmp_cigar_str += ("%dM" % cigar_match_len)
-                                cigar_match_len = 0
-                            cmp_cigar_str += ("%dN" % length)
-                            ref_pos += length                    
-                    if cigar_match_len > 0:
-                        cmp_cigar_str += ("%dM" % cigar_match_len)
-                    cmp_MD += ("%d" % MD_match_len)
-                    if read_pos != len(read_seq) or \
-                            cmp_cigar_str != cigar_str or \
-                            cmp_MD != MD:
-                        print >> sys.stderr, "Error:", cigar_str, MD
-                        print >> sys.stderr, "\tcomputed:", cmp_cigar_str, cmp_MD
-                        print >> sys.stderr, "\tcmp list:", cmp_list
-                        assert False            
-
-                    prev_read_id = read_id
-                    prev_exon = exon
-
-                if num_reads <= 0:
-                    continue
-
-                if prev_read_id != None:
-                    add_stat(HLA_cmpt, HLA_counts, HLA_count_per_read)
-
-                # Coverage
-                # it is not used by the default
-                if enable_coverage:
-                    assert num_reads > 0
-                    read_len = int(total_read_len / float(num_reads))
-                    coverage_sum = 0
-                    for i in range(len(coverage)):
-                        if i > 0:
-                            coverage[i] += coverage[i-1]
-                        coverage_sum += coverage[i]
-                    coverage_avg = coverage_sum / float(len(coverage))
-                    assert len(ref_seq) < len(coverage)
-                    for i in range(len(ref_seq)):
-                        coverage_threshold = 1.0 * coverage_avg
-                        if i < read_len:
-                            coverage_threshold *= ((i+1) / float(read_len))
-                        elif i + read_len > len(ref_seq):
-                            coverage_threshold *= ((len(ref_seq) - i) / float(read_len))
-                        if coverage[i] >= coverage_threshold:
-                            continue
-                        pseudo_num_reads = (coverage_threshold - coverage[i]) / read_len
-                        var_idx = lower_bound(Var_list[gene], i + 1)
-                        if var_idx >= len(Var_list[gene]):
-                            var_idx = len(Var_list[gene]) - 1
-                        cur_cmpt = set()
-                        while var_idx >= 0:
-                            var_pos, var_id = Var_list[gene][var_idx]
-                            var_type, _, var_data = Vars[gene][var_id]
-                            if var_type == "deletion":
-                                del_len = int(var_data)
-                                if i < var_pos:
-                                    break
-                                if i + read_len < var_pos + int(var_data):
-                                    assert var_id in Links
-                                    cur_cmpt = cur_cmpt.union(set(Links[var_id]))
-                            var_idx -= 1
-                        if cur_cmpt:
-                            cur_cmpt = '-'.join(list(cur_cmpt))
-                            if not cur_cmpt in HLA_cmpt:
-                                HLA_cmpt[cur_cmpt] = 0
-                            HLA_cmpt[cur_cmpt] += pseudo_num_reads
-            else:
-                assert index_type == "linear"
-                def add_alleles(alleles):
-                    if not allele in HLA_counts:
-                        HLA_counts[allele] = 1
-                    else:
-                        HLA_counts[allele] += 1
-
-                    cur_cmpt = sorted(list(alleles))
-                    cur_cmpt = '-'.join(cur_cmpt)
-                    if not cur_cmpt in HLA_cmpt:
-                        HLA_cmpt[cur_cmpt] = 1
-                    else:
-                        HLA_cmpt[cur_cmpt] += 1
-
-                prev_read_id, prev_AS = None, None
-                alleles = set()
-                for line in alignview_proc.stdout:
-                    cols = line[:-1].split()
-                    read_id, flag, allele = cols[:3]
-                    flag = int(flag)
-                    if flag & 0x4 != 0:
-                        continue
-                    if not allele.startswith(gene):
-                        continue
-                    if allele.find("BACKBONE") != -1:
-                        continue
-
-                    AS = None
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("AS"):
-                            AS = int(col[5:])
-                    assert AS != None
-                    if read_id != prev_read_id:
-                        if alleles:
-                            if aligner == "hisat2" or \
-                                    (aligner == "bowtie2" and len(alleles) < 10):
-                                add_alleles(alleles)
-                            alleles = set()
-                        prev_AS = None
-                    if prev_AS != None and AS < prev_AS:
-                        continue
-                    prev_read_id = read_id
-                    prev_AS = AS
-                    alleles.add(allele)
-
-                if alleles:
-                    add_alleles(alleles)
-
-            HLA_counts = [[allele, count] for allele, count in HLA_counts.items()]
-            def HLA_count_cmp(a, b):
-                if a[1] != b[1]:
-                    return b[1] - a[1]
-                assert a[0] != b[0]
-                if a[0] < b[0]:
-                    return -1
-                else:
-                    return 1
-            HLA_counts = sorted(HLA_counts, cmp=HLA_count_cmp)
-            for count_i in range(len(HLA_counts)):
-                count = HLA_counts[count_i]
-                if simulation:
-                    found = False
-                    for test_HLA_name in test_HLA_names:
-                        if count[0] == test_HLA_name:
-                            print >> sys.stderr, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_HLA_name, count[1])
-                            found = True
-                            """
-                            if count_i > 0 and HLA_counts[0][1] > count[1]:
-                                print >> sys.stderr, "Warning: %s ranked first (count: %d)" % (HLA_counts[0][0], HLA_counts[0][1])
-                                assert False
-                            else:
-                                test_passed += 1
-                            """
-                    if count_i < 5 and not found:
-                        print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                else:
-                    print >> sys.stderr, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                    if count_i >= 9:
-                        break
-            print >> sys.stderr
-
-            HLA_prob = single_abundance(HLA_cmpt, HLA_lengths[gene])
-
-            success = [False for i in range(len(test_HLA_names))]
-            found_list = [False for i in range(len(test_HLA_names))]
-            for prob_i in range(len(HLA_prob)):
-                prob = HLA_prob[prob_i]
-                found = False
-                if simulation:
-                    for name_i in range(len(test_HLA_names)):
-                        test_HLA_name = test_HLA_names[name_i]
-                        if prob[0] == test_HLA_name:
-                            rank_i = prob_i
-                            while rank_i > 0:
-                                if prob == HLA_prob[rank_i - 1][1]:
-                                    rank_i -= 1
-                                else:
-                                    break
-                            print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_HLA_name, prob[1] * 100.0)
-                            if rank_i < len(success):
-                                success[rank_i] = True
-                            found_list[name_i] = True
-                            found = True                        
-                    if not False in found_list:
-                        break
-                if not found:
-                    print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, prob[0], prob[1] * 100.0)
-                    if best_alleles and prob_i < 2:
-                        print >> sys.stdout, "SingleModel %s (abundance: %.2f%%)" % (prob[0], prob[1] * 100.0)
-                if not simulation and prob_i >= 9:
-                    break
-            print >> sys.stderr
-
-            if len(test_HLA_names) == 2 or not simulation:
-                HLA_prob = joint_abundance(HLA_cmpt, HLA_lengths[gene])
-                if len(HLA_prob) <= 0:
-                    continue
-                success = [False]
-                for prob_i in range(len(HLA_prob)):
-                    allele_pair, prob = HLA_prob[prob_i]
-                    allele1, allele2 = allele_pair.split('-')
-                    if best_alleles and prob_i < 1:
-                        print >> sys.stdout, "PairModel %s (abundance: %.2f%%)" % (allele_pair, prob * 100.0)
-                    if simulation:
-                        if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                            rank_i = prob_i
-                            while rank_i > 0:
-                                if HLA_prob[rank_i-1][1] == prob:                                        
-                                    rank_i -= 1
-                                else:
-                                    break
-                            print >> sys.stderr, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, allele_pair, prob * 100.0)
-                            if rank_i == 0:
-                                success[0] = True
-                            break
-                    print >> sys.stderr, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, allele_pair, prob * 100.0)
-                    if not simulation and prob_i >= 9:
-                        break
-                print >> sys.stderr
-
-                # Li's method
-                """
-                li_hla = os.path.join(ex_path, "li_hla/hla")
-                if os.path.exists(li_hla):
-                    li_hla_cmd = [li_hla,
-                                  "hla",
-                                  "hla_input.bam",
-                                  "-b", "%s*BACKBONE" % gene]
-                    li_hla_proc = subprocess.Popen(li_hla_cmd,
-                                                   stdout=subprocess.PIPE,
-                                                   stderr=open("/dev/null", 'w'))
-
-                    # read in the result of Li's hla
-                    for line in li_hla_proc.stdout:
-                        allele1, allele2, score = line.strip().split()
-                        score = float(score)
-                        if simulation:
-                            if allele1 in test_HLA_names and allele2 in test_HLA_names:
-                                print >> sys.stderr, "\t\t\t*** 1 ranked %s-%s (score: %.2f)" % (allele1, allele2, score)
-                                success[0] = True
-                            else:
-                                print >> sys.stderr, "\t\t\tLiModel fails"
-                        if best_alleles:
-                            print >> sys.stdout, "LiModel %s-%s (score: %.2f)" % (allele1, allele2, score)
-                    li_hla_proc.communicate()
-                """
-
-            if simulation and not False in success:
-                aligner_type = "%s %s" % (aligner, index_type)
-                if not aligner_type in test_passed:
-                    test_passed[aligner_type] = 1
-                else:
-                    test_passed[aligner_type] += 1
-
-    if simulation:
-        return test_passed
-
-
-def read_HLA_alleles(fname, HLAs):
-    for line in open(fname):
-        if line.startswith(">"):
-            HLA_name = line.strip().split()[0][1:]
-            HLA_gene = HLA_name.split('*')[0]
-            if not HLA_gene in HLAs:
-                HLAs[HLA_gene] = {}
-            if not HLA_name in HLAs[HLA_gene]:
-                HLAs[HLA_gene][HLA_name] = ""
-        else:
-            HLAs[HLA_gene][HLA_name] += line.strip()
-    return HLAs
-
-"""
-"""
-def genotyping(base_fname,
-               reference_type,
-               hla_list,
-               partial,
-               aligners,
-               read_fname,
-               alignment_fname,
-               threads,
-               simulate_interval,
-               enable_coverage,
-               best_alleles,
-               exclude_allele_list,
-               default_allele_list,
-               num_mismatch,
-               verbose,
-               daehwan_debug):
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(genotyping))
-    ex_path = os.path.dirname(curr_script)
-
-    # Clone a git repository, IMGTHLA
-    if not os.path.exists("IMGTHLA"):
-        os.system("git clone https://github.com/jrob119/IMGTHLA.git")
-
-    # Clone hisat2 genotype database, hisat_genotype_db
-    """
-    if not os.path.exists("hisat_genotype_db"):
-        os.system("git clone https://github.com/infphilo/hisat_genotype_db.git")
-    """
-
-    simulation = (read_fname == [] and alignment_fname == "")
-
-    def check_files(fnames):
-        for fname in fnames:
-            if not os.path.exists(fname):
-                return False
-        return True
-
-    # Download HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not check_files(HISAT2_fnames):
-        os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
-        hisat2_inspect = os.path.join(ex_path, "hisat2-inspect")
-        os.system("%s grch38/genome > genome.fa" % hisat2_inspect)
-        os.system("samtools faidx genome.fa")
-
-    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
-    if os.path.exists("%s.ref" % base_fname):
-        left = 0
-        HLA_genes = set()
-        BACKBONE = False
-        for line in open("%s.ref" % base_fname):
-            HLA_name = line.strip().split()[0]
-            if HLA_name.find("BACKBONE") != -1:
-                BACKBONE = True
-            HLA_gene = HLA_name.split('*')[0]
-            HLA_genes.add(HLA_gene)
-        delete_hla_files = False
-        if reference_type == "gene":
-            if not BACKBONE:
-                delete_hla_files = True
-        elif reference_type in ["chromosome", "genome"]:
-            if BACKBONE:
-                delete_hla_files = True
-        else:
-            assert False
-        if not set(hla_list).issubset(HLA_genes):
-            delete_hla_files = True
-        if base_fname == "hla":
-            if delete_hla_files:
-                os.system("rm %s*" % base_fname)
-    
-    # Extract HLA variants, backbone sequence, and other sequeces  
-    HLA_fnames = [base_fname+"_backbone.fa",
-                  base_fname+"_sequences.fa",
-                  base_fname+".ref",
-                  base_fname+".snp",
-                  base_fname+".haplotype",
-                  base_fname+".link",
-                  base_fname+"_alleles_excluded.txt"]
-    
-    # Check if excluded alleles in current files match
-    excluded_alleles_match = False
-    if(os.path.exists(HLA_fnames[6])):
-        afile = open(HLA_fnames[6],'r')
-        afile.readline()
-        lines = afile.read().split()
-        excluded_alleles_match = (set(exclude_allele_list) == set(lines))
-        afile.close()
-    elif len(exclude_allele_list) == 0:
-        excluded_alleles_match = True
-        try:
-            temp_name = HLA_fnames[6]
-            HLA_fnames.remove(HLA_fnames[6])
-            os.remove(temp_name)
-        except OSError:
-            pass
-        
-    if not excluded_alleles_match:
-        print("Creating Allele Exclusion File.\n")
-        afile = open(HLA_fnames[6],'w')
-        afile.write("Alleles excluded:\n")
-        afile.write("\n".join(exclude_allele_list))
-        afile.close()
-        
-    if (not check_files(HLA_fnames)) or (not excluded_alleles_match) :
-        extract_hla_script = os.path.join(ex_path, "hisatgenotype_extract_vars.py")
-        extract_cmd = [extract_hla_script,
-                       "--base", base_fname,
-                       "--reference-type", reference_type]
-
-        if base_fname == "hla":
-            extract_cmd += ["--hla-list", ','.join(hla_list)]
-
-        if len(exclude_allele_list) > 0:
-            print exclude_allele_list
-            extract_cmd += ["--exclude-allele-list", ",".join(exclude_allele_list)]
-
-        if len(base_fname) > 3:
-            extract_cmd += ["--base", base_fname]
-
-        if partial:
-            extract_cmd += ["--partial"]
-        extract_cmd += ["--inter-gap", "30",
-                        "--intra-gap", "50"]
-        if verbose:
-            print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-        proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-        proc.communicate()
-        
-        if not check_files(HLA_fnames):
-            print >> sys.stderr, "Error: extract_HLA_vars failed!"
-            sys.exit(1)
-            
-    for aligner, index_type in aligners:
-        # Build HISAT2 graph indexes based on the above information
-        if aligner == "hisat2" and index_type == "graph":
-            HLA_hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base_fname, i+1) for i in range(8)]
-            if not check_files(HLA_hisat2_graph_index_fnames) or (not excluded_alleles_match):
-                hisat2_build = os.path.join(ex_path, "hisat2-build")
-                build_cmd = [hisat2_build,
-                             "-p", str(threads),
-                             "--snp", HLA_fnames[3],
-                             "--haplotype", HLA_fnames[4] ,
-                             HLA_fnames[0],
-                             "%s.graph" % base_fname]
-                if verbose:
-                    print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                proc.communicate()        
-                if not check_files(HLA_hisat2_graph_index_fnames):
-                    print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
-                    sys.exit(1)
-
-        # Build HISAT2 linear indexes based on the above information
-        elif aligner == "hisat2" and index_type == "linear":
-            HLA_hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base_fname, i+1) for i in range(8)]
-            if reference_type == "gene" and (not check_files(HLA_hisat2_linear_index_fnames) or (not excluded_alleles_match)):
-                hisat2_build = os.path.join(ex_path, "hisat2-build")
-                build_cmd = [hisat2_build,
-                             "%s,%s"%(HLA_fnames[0],HLA_fnames[1]),
-                             "%s.linear" % base_fname]
-                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                proc.communicate()        
-                if not check_files(HLA_hisat2_linear_index_fnames):
-                    print >> sys.stderr, "Error: indexing HLA failed!"
-                    sys.exit(1)
-
-        # Build Bowtie2 indexes based on the above information
-        else:
-            assert aligner == "bowtie2" and index_type == "linear"
-            HLA_bowtie2_index_fnames = ["%s.%d.bt2" % (base_fname, i+1) for i in range(4)]
-            HLA_bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base_fname, i+1) for i in range(2)]
-            if reference_type == "gene" and (not check_files(HLA_bowtie2_index_fnames) or (not excluded_alleles_match)):
-                build_cmd = ["bowtie2-build",
-                             "%s,%s"%(HLA_fnames[0],HLA_fnames[1]),
-                             base_fname]
-                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'))
-                proc.communicate()        
-                if not check_files(HLA_bowtie2_index_fnames):
-                    print >> sys.stderr, "Error: indexing HLA failed!"
-                    sys.exit(1)
-        
-    # Read partial alleles from hla.data (temporary)
-    partial_alleles = set()
-    if base_fname == "hla":
-        for line in open("IMGTHLA/hla.dat"):
-            if not line.startswith("DE"):
-                continue
-            allele_name = line.split()[1][4:-1]
-            gene = allele_name.split('*')[0]
-            if line.find("partial") != -1:
-                partial_alleles.add(allele_name)
-
-    if len(default_allele_list) != 0:
-        #print os.getcwd()
-        if not os.path.exists("./Default-HLA/hla_backbone.fa"):
-            #current_path = os.getcwd()
-            try:
-                os.mkdir("./Default-HLA")
-            except:
-                pass
-            #os.chdir(current_path + "/Default-HLA")
-            
-            extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py")
-            extract_cmd = [extract_hla_script,
-                           "--reference-type", reference_type,
-                           "--hla-list", ','.join(hla_list),
-                           "--base", "./Default-HLA/hla"]
-
-            if partial:
-                extract_cmd += ["--partial"]
-            extract_cmd += ["--inter-gap", "30",
-                            "--intra-gap", "50"]
-            if verbose:
-                print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-            proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-            proc.communicate()
-            
-            if not os.path.exists("./Default-HLA/hla_backbone.fa"):
-                print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!"
-                sys.exit(1)
-    
-    # Read HLA alleles (names and sequences)
-    refHLAs, refHLA_loci = {}, {}
-    for line in open("%s.ref" % base_fname):
-        HLA_name, chr, left, right, length, exon_str = line.strip().split()
-        HLA_gene = HLA_name.split('*')[0]
-        assert not HLA_gene in refHLAs
-        refHLAs[HLA_gene] = HLA_name
-        left, right = int(left), int(right)
-        exons = []
-        for exon in exon_str.split(','):
-            exon_left, exon_right = exon.split('-')
-            exons.append([int(exon_left), int(exon_right)])
-        refHLA_loci[HLA_gene] = [HLA_name, chr, left, right, exons]
-        
-    HLAs = {}
-    if reference_type == "gene":
-        read_HLA_alleles(HLA_fnames[0], HLAs)
-    read_HLA_alleles(HLA_fnames[1], HLAs)
-    
-    # HLA gene alleles
-    HLA_names = {}
-    for HLA_gene, data in HLAs.items():
-        HLA_names[HLA_gene] = list(data.keys())
-
-    # HLA gene allele lengths
-    HLA_lengths = {}
-    for HLA_gene, HLA_alleles in HLAs.items():
-        HLA_lengths[HLA_gene] = {}
-        for allele_name, seq in HLA_alleles.items():
-            HLA_lengths[HLA_gene][allele_name] = len(seq)
-
-    # Construct excluded alleles (Via default backbone data)
-    custom_allele_check = False
-    if len(default_allele_list) > 0:
-        custom_allele_check = True
-        HLAs_default = {}
-        read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default)
-        read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default)
-        
-        for HLA_gene, HLA_alleles in HLAs_default.items():
-            for allele_name, seq in HLA_alleles.items():
-                if allele_name in default_allele_list:
-                    HLA_lengths[HLA_gene][allele_name] = len(seq)
-
-    # Read HLA variants, and link information
-    Vars, Var_list = {}, {}
-    for line in open("%s.snp" % base_fname):
-        var_id, var_type, allele, pos, data = line.strip().split('\t')
-        pos = int(pos)
-        if reference_type != "gene":
-            allele, dist = None, 0
-            for tmp_gene, values in refHLA_loci.items():
-                allele_name, chr, left, right, exons = values
-                if allele == None:
-                    allele = allele_name
-                    dist = abs(pos - left)
-                else:
-                    if dist > abs(pos - left):
-                        allele = allele_name
-                        dist = abs(pos - left)
-            
-        gene = allele.split('*')[0]
-        if not gene in Vars:
-            Vars[gene] = {}
-            assert not gene in Var_list
-            Var_list[gene] = []
-            
-        assert not var_id in Vars[gene]
-        left = 0
-        if reference_type != "gene":
-            _, _, left, _, _ = refHLA_loci[gene]
-        Vars[gene][var_id] = [var_type, pos - left, data]
-        Var_list[gene].append([pos - left, var_id])
-        
-    for gene, in_var_list in Var_list.items():
-        Var_list[gene] = sorted(in_var_list)
-        
-    Links = {}
-    for line in open("%s.link" % base_fname):
-        var_id, alleles = line.strip().split('\t')
-        alleles = alleles.split()
-        assert not var_id in Links
-        Links[var_id] = alleles
-
-    # Test HLA typing
-    test_list = []
-    if simulation:
-        basic_test, pair_test = True, False
-        if daehwan_debug:
-            if "basic_test" in daehwan_debug:
-                basic_test, pair_test = True, False
-            else:
-                basic_test, pair_test = False, True
-
-        test_passed = {}
-        test_list = []
-        if base_fname == "hla":
-            genes = list(set(hla_list) & set(HLA_names.keys()))
-        else:
-            genes = HLA_names.keys()
-            
-        if basic_test:
-            for gene in genes:
-                HLA_gene_alleles = HLA_names[gene]
-                for HLA_name in HLA_gene_alleles:
-                    if HLA_name.find("BACKBONE") != -1:
-                        continue
-                    test_list.append([[HLA_name]])
-        if pair_test:
-            test_size = 500
-            allele_count = 2
-            for test_i in range(test_size):
-                test_pairs = []
-                for gene in genes:
-                    HLA_gene_alleles = []                    
-                    for allele in HLA_names[gene]:
-                        if allele.find("BACKBONE") != -1:
-                            continue
-                        HLA_gene_alleles.append(allele)
-
-                    # DK - temporary
-                    if len(HLA_gene_alleles) < 2:
-                        continue
-                        
-                    nums = [i for i in range(len(HLA_gene_alleles))]
-                    random.shuffle(nums)
-                    test_pairs.append(sorted([HLA_gene_alleles[nums[i]] for i in range(allele_count)]))
-                test_list.append(test_pairs)
-
-        for test_i in range(len(test_list)):
-            if "test_id" in daehwan_debug:
-                daehwan_test_ids = daehwan_debug["test_id"].split('-')
-                if str(test_i + 1) not in daehwan_test_ids:
-                    continue
-
-            print >> sys.stderr, "Test %d" % (test_i + 1)
-            test_HLA_list = test_list[test_i]
-           
-            # daehwan - for debugging purposes
-            # test_HLA_list = [["A*11:50Q", "A*11:01:01:01", "A*01:01:01:01"]]
-            for test_HLA_names in test_HLA_list:
-                for test_HLA_name in test_HLA_names:
-                    if custom_allele_check:
-                        gene = test_HLA_name.split('*')[0]
-                        test_HLA_seq = HLAs_default[gene][test_HLA_name]
-                        seq_type = "partial" if test_HLA_name in partial_alleles else "full"
-                        print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type)
-                        continue
-                    gene = test_HLA_name.split('*')[0]
-                    test_HLA_seq = HLAs[gene][test_HLA_name]
-                    seq_type = "partial" if test_HLA_name in partial_alleles else "full"
-                    print >> sys.stderr, "\t%s - %d bp (%s sequence)" % (test_HLA_name, len(test_HLA_seq), seq_type)
-            if custom_allele_check:
-                simulate_reads(HLAs_default, test_HLA_list, simulate_interval)
-            else:
-                simulate_reads(HLAs, test_HLA_list, simulate_interval)
-
-            if "test_id" in daehwan_debug:
-                read_fname = ["hla_input_1.fa"]
-            else:
-                read_fname = ["hla_input_1.fa", "hla_input_2.fa"]
-
-            fastq = False
-            
-            tmp_test_passed = HLA_typing(ex_path,
-                                         base_fname,
-                                         simulation,
-                                         reference_type,
-                                         test_HLA_list,
-                                         partial,
-                                         refHLAs,
-                                         HLAs,                       
-                                         HLA_names,
-                                         HLA_lengths,
-                                         refHLA_loci,
-                                         Vars,
-                                         Var_list,
-                                         Links,
-                                         exclude_allele_list,
-                                         aligners,
-                                         num_mismatch,
-                                         fastq,
-                                         read_fname,
-                                         alignment_fname,
-                                         threads,
-                                         enable_coverage,
-                                         best_alleles,
-                                         verbose)
-
-            for aligner_type, passed in tmp_test_passed.items():
-                if aligner_type in test_passed:
-                    test_passed[aligner_type] += passed
-                else:
-                    test_passed[aligner_type] = passed
-
-                print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1)))
-
-
-        for aligner_type, passed in test_passed.items():
-            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
-    
-    else: # With real reads or BAMs
-        if base_fname == "hla":
-            gene_list = hla_list
-        else:
-            gene_list = Vars.keys()
-        print >> sys.stderr, "\t", ' '.join(gene_list)
-
-        fastq = True
-        HLA_typing(ex_path,
-                   base_fname,
-                   simulation,
-                   reference_type,
-                   gene_list,
-                   partial,
-                   refHLAs,
-                   HLAs,                       
-                   HLA_names,
-                   HLA_lengths,
-                   refHLA_loci,
-                   Vars,
-                   Var_list,
-                   Links,
-                   exclude_allele_list,
-                   aligners,
-                   num_mismatch,
-                   fastq,
-                   read_fname,
-                   alignment_fname,
-                   threads,
-                   enable_coverage,
-                   best_alleles,
-                   verbose)
-
-        
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='genotyping')
-    parser.add_argument("--base",
-                        dest="base_fname",
-                        type=str,
-                        default="hla",
-                        help="base filename for backbone HLA sequence, HLA variants, and HLA linking info")
-    parser.add_argument("--default-list",
-                        dest = "default_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of HLA alleles to be tested. Alleles are retrieved from default backbone data (all alleles included in backbone).")
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--hla-list",
-                        dest="hla_list",
-                        type=str,
-                        default="A,B,C,DQA1,DQB1,DRB1",
-                        help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)")
-    parser.add_argument('--partial',
-                        dest='partial',
-                        action='store_true',
-                        help='Include partial alleles (e.g. A_nuc.fasta)')
-    parser.add_argument("--aligner-list",
-                        dest="aligners",
-                        type=str,
-                        default="hisat2.graph,hisat2.linear,bowtie2.linear",
-                        help="A comma-separated list of aligners (default: hisat2.graph,hisat2.linear,bowtie2.linear)")
-    parser.add_argument("--reads",
-                        dest="read_fname",
-                        type=str,
-                        default="",
-                        help="Fastq read file name")
-    parser.add_argument("--alignment",
-                        dest="alignment_fname",
-                        type=str,
-                        default="",
-                        help="BAM file name")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument("--simulate-interval",
-                        dest="simulate_interval",
-                        type=int,
-                        default=1,
-                        help="Reads simulated at every these base pairs (default: 1)")
-    parser.add_argument("--coverage",
-                        dest="coverage",
-                        action='store_true',
-                        help="Experimental purpose (assign reads based on coverage)")
-    parser.add_argument("--best-alleles",
-                        dest="best_alleles",
-                        action='store_true',
-                        help="")
-    parser.add_argument("--exclude-allele-list",
-                        dest="exclude_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of alleles to be excluded. Enter a number N to randomly select N alleles for exclusion and N non-excluded alleles for testing (2N tested in total).")
-    parser.add_argument("--num-mismatch",
-                        dest="num_mismatch",
-                        type=int,
-                        default=0,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument("--debug",
-                        dest="debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-    parser.add_argument("--novel_allele_detection",
-                        dest="novel_allele_detection",
-                        action='store_true',
-                        help="Change test to detection of new alleles. Report sensitivity and specificity rate at the end.")
-
-
-    args = parser.parse_args()
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-    args.hla_list = args.hla_list.split(',')
-    if args.aligners == "":
-        print >> sys.stderr, "Error: --aligners must be non-empty."
-        sys.exit(1)    
-    args.aligners = args.aligners.split(',')
-    for i in range(len(args.aligners)):
-        args.aligners[i] = args.aligners[i].split('.')
-    if args.read_fname:
-        args.read_fname = args.read_fname.split(',')
-    else:
-        args.read_fname = []
-    if args.alignment_fname != "" and \
-            not os.path.exists(args.alignment_fname):
-        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
-        sys.exit(1)
-    
-    if len(args.default_allele_list) > 0:
-        args.default_allele_list = args.default_allele_list.split(',')
-        
-    if len(args.exclude_allele_list) > 0:
-        if args.exclude_allele_list.strip().isdigit():
-            num_alleles = int(args.exclude_allele_list)
-            if not os.path.exists("./Default-HLA/hla_backbone.fa"):
-                try:
-                    os.mkdir("./Default-HLA")
-                except:
-                    pass
-                
-                extract_hla_script = os.path.join(ex_path, "hisat2_extract_HLA_vars.py")
-                extract_cmd = [extract_hla_script,
-                               "--reference-type", reference_type,
-                               "--hla-list", ','.join(hla_list),
-                               "--base", "./Default-HLA/hla"]
-                if partial:
-                    extract_cmd += ["--partial"]
-                extract_cmd += ["--inter-gap", "30",
-                                "--intra-gap", "50"]
-                if verbose:
-                    print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-                proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                proc.communicate()
-                if not os.path.exists("./Default-HLA/hla_backbone.fa"):
-                    print >> sys.stderr, "Error: extract_HLA_vars (Default) failed!"
-                    sys.exit(1)
-       
-            HLAs_default = {}
-            #read_HLA_alleles("./Default-HLA/hla_backbone.fa",HLAs_default)
-            read_HLA_alleles("./Default-HLA/hla_sequences.fa",HLAs_default)
-    
-            allele_names = list(HLAs_default['A'].keys())
-            random.shuffle(allele_names)
-            args.exclude_allele_list = allele_names[0:num_alleles]
-            args.default_allele_list = allele_names[num_alleles:2*num_alleles]
-            
-            args.default_allele_list = args.default_allele_list + args.exclude_allele_list
-        else:
-            args.exclude_allele_list = args.exclude_allele_list.split(',')
-        
-    debug = {}
-    if args.debug != "":
-        for item in args.debug.split(','):
-            if ':' in item:
-                key, value = item.split(':')
-                debug[key] = value
-            else:
-                debug[item] = 1
-
-    random.seed(1)
-    genotyping(args.base_fname,
-               args.reference_type,
-               args.hla_list,
-               args.partial,
-               args.aligners,
-               args.read_fname,
-               args.alignment_fname,
-               args.threads,
-               args.simulate_interval,
-               args.coverage,
-               args.best_alleles,
-               args.exclude_allele_list,
-               args.default_allele_list,
-               args.num_mismatch,
-               args.verbose,
-               debug)
-
-    
diff --git a/hisatgenotype_locus.py b/hisatgenotype_locus.py
deleted file mode 100755
index 4d958058..00000000
--- a/hisatgenotype_locus.py
+++ /dev/null
@@ -1,2631 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT-genotype.
-#
-# HISAT-genotype is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT-genotype is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect, random
-import math
-from datetime import datetime, date, time
-from argparse import ArgumentParser, FileType
-from copy import deepcopy
-import hisatgenotype_typing_common as typing_common, hisatgenotype_assembly_graph as assembly_graph
-
-
-"""
-   var: ['single', 3300, 'G']
-   exons: [[301, 373], [504, 822], [1084, 1417], [2019, 2301], [2404, 2520], [2965, 2997], [3140, 3187], [3357, 3361]]
-"""
-def var_in_exon(var, exons):
-    exonic = False
-    var_type, var_left, var_data = var
-    var_right = var_left
-    if var_type == "deletion":
-        var_right = var_left + int(var_data) - 1
-    for exon_left, exon_right in exons:
-        if var_left >= exon_left and var_right <= exon_right:
-            return True
-    return False
-
-
-"""
-Report variant IDs whose var is within exonic regions
-"""
-def get_exonic_vars(Vars, exons):
-    vars = set()
-    for var_id, var in Vars.items():
-        var_type, var_left, var_data = var
-        var_right = var_left
-        if var_type == "deletion":
-            var_right = var_left + int(var_data) - 1
-        for exon_left, exon_right in exons:
-            if var_left >= exon_left and var_right <= exon_right:
-                vars.add(var_id)
-                
-    return vars
-
-
-"""
-Get representative alleles among those that share the same exonic sequences
-"""
-def get_rep_alleles(Links, exon_vars, in_alleles = None):
-    allele_vars = {}
-    for var, alleles in Links.items():
-        if var not in exon_vars:
-            continue
-        for allele in alleles:
-            if in_alleles != None and allele not in in_alleles:
-                continue
-            if allele not in allele_vars:
-                allele_vars[allele] = set()
-            allele_vars[allele].add(var)
-
-    allele_groups = {}
-    for allele, vars in allele_vars.items():
-        vars = '-'.join(vars)
-        if vars not in allele_groups:
-            allele_groups[vars] = []
-        allele_groups[vars].append(allele)
-
-    allele_reps = {} # allele representatives
-    allele_rep_groups = {} # allele groups by allele representatives
-    for allele_members in allele_groups.values():
-        assert len(allele_members) > 0
-        allele_rep = allele_members[0]
-        allele_rep_groups[allele_rep] = allele_members
-        for allele_member in allele_members:
-            assert allele_member not in allele_reps
-            allele_reps[allele_member] = allele_rep
-
-    return allele_reps, allele_rep_groups
-    
-
-"""
-"""
-def error_correct(ref_seq,
-                  read_seq,
-                  read_pos,
-                  mpileup,
-                  Vars,
-                  Var_list,
-                  cmp_list,
-                  debug = False):
-    if debug:
-        print >> sys.stderr, cmp_list
-        print >> sys.stderr, read_seq
-
-    num_correction = 0
-    i = 0
-    while i < len(cmp_list):
-        type, left, length = cmp_list[i][:3]
-        assert length > 0
-        if left >= len(ref_seq):
-            break
-        if type == "match":
-            middle_cmp_list = []
-            last_j = 0
-            for j in range(length):
-                if read_pos + j >= len(read_seq) or \
-                   left + j >= len(ref_seq):
-                    continue
-                
-                read_bp, ref_bp = read_seq[read_pos + j], ref_seq[left + j]
-                assert left + j < len(mpileup)
-                nt_set = mpileup[left + j][0]
-                if len(nt_set) > 0 and read_bp not in nt_set:
-                    read_bp = 'N' if len(nt_set) > 1 else nt_set[0]                    
-                    read_seq = read_seq[:read_pos + j] + read_bp + read_seq[read_pos + j + 1:]
-                    assert read_bp != ref_bp
-                    new_cmp = ["mismatch", left + j, 1, "unknown"]
-                    num_correction += 1
-                    if read_bp != 'N':
-                        var_idx = typing_common.lower_bound(Var_list, left + j)
-                        while var_idx < len(Var_list):
-                            var_pos, var_id = Var_list[var_idx]
-                            if var_pos > left + j:
-                                break
-                            if var_pos == left + j:
-                                var_type, _, var_data = Vars[var_id]
-                                if var_type == "single" and read_bp == var_data:
-                                    new_cmp[3] = var_id
-                                    break                                                        
-                            var_idx += 1
-                    if j > last_j:
-                        middle_cmp_list.append(["match", left + last_j, j- last_j])
-                    middle_cmp_list.append(new_cmp)
-                    last_j = j + 1
-            if last_j < length:
-                middle_cmp_list.append(["match", left + last_j, length - last_j])
-
-            assert len(middle_cmp_list) > 0
-            cmp_list = cmp_list[:i] + middle_cmp_list + cmp_list[i+1:]
-            i += (len(middle_cmp_list) - 1)
-        else:
-            assert type == "mismatch"
-            read_bp, ref_bp = read_seq[read_pos], ref_seq[left]
-            assert left < len(mpileup)
-            nt_set = mpileup[left][0]
-
-            if debug:
-                print >> sys.stderr, left, read_bp, ref_bp, mpileup[left]
-
-            if len(nt_set) > 0 and read_bp not in nt_set:
-                read_bp = 'N' if len(nt_set) > 1 else nt_set[0]
-                read_seq = read_seq[:read_pos] + read_bp + read_seq[read_pos+1:]
-                if read_bp == 'N':
-                    cmp_list[i][3] = "unknown"
-                elif read_bp == ref_bp:
-                    cmp_list[i] = ["match", left, 1]
-                    num_correction += 1
-                else:
-                    cmp_list[i][3] = "unknown"
-                    var_idx = typing_common.lower_bound(Var_list, left)
-                    while var_idx < len(Var_list):
-                        var_pos, var_id = Var_list[var_idx]
-                        if var_pos > left:
-                            break
-                        if var_pos == left:
-                            var_type, _, var_data = Vars[var_id]
-                            if var_type == "single" and read_bp == var_data:
-                                cmp_list[i][3] = var_id
-                                break                                                        
-                        var_idx += 1
-
-                if debug:
-                    print >> sys.stderr, left, read_bp, ref_bp, mpileup[left]
-                    print >> sys.stderr, cmp_list[i]
-
-        read_pos += length
-        i += 1
-
-    # Combine matches
-    i = 0
-    while i < len(cmp_list):
-        type, left, length = cmp_list[i][:3]
-        if type == "match" and i + 1 < len(cmp_list):
-            type2, left2, length2 = cmp_list[i+1][:3]
-            if type2 == "match":
-                cmp_list[i] = [type, left, length + length2]
-                cmp_list = cmp_list[:i+1] + cmp_list[i+2:]
-                continue
-        i += 1
-
-    if debug:
-        print >> sys.stderr, cmp_list
-        print >> sys.stderr, read_seq
-                            
-    return cmp_list, read_seq, num_correction
-
-
-"""
-"""
-def typing(simulation,
-           base_fname,
-           locus_list,
-           genotype_genome,
-           partial,
-           partial_alleles,
-           refGenes,
-           Genes,
-           Gene_names,
-           Gene_lengths,
-           refGene_loci,
-           Vars,
-           Var_list,
-           Links,
-           aligners,
-           num_editdist,
-           assembly,
-           output_base,
-           error_correction,
-           keep_alignment,
-           allow_discordant,
-           type_primary_exons,
-           remove_low_abundance_alleles,
-           display_alleles,
-           fastq,
-           read_fname,
-           alignment_fname,
-           num_frag_list,
-           read_len,
-           fragment_len,
-           threads,
-           best_alleles,
-           verbose,
-           assembly_verbose):
-    if simulation:
-        test_passed = {}
-    report_file = open(output_base + ".report", 'w')
-    for aligner, index_type in aligners:
-        for f_ in [sys.stderr, report_file]:
-            if index_type == "graph":
-                print >> f_, "\n\t\t%s %s" % (aligner, index_type)
-            else:
-                print >> f_, "\n\t\t%s %s" % (aligner, index_type)
-
-        remove_alignment_file = False
-        if alignment_fname == "":
-            # Align reads, and sort the alignments into a BAM file
-            remove_alignment_file = True
-            if simulation:
-                alignment_fname = "%s_output.bam" % base_fname
-            else:
-                alignment_fname = read_fname[0].split('/')[-1]
-                alignment_fname = "%s.bam" % '.'.join(alignment_fname.split('.')[:2])
-                    
-            typing_common.align_reads(aligner,
-                                      simulation,
-                                      genotype_genome if genotype_genome != "" else (base_fname + "." + index_type),
-                                      index_type,
-                                      base_fname,
-                                      read_fname,
-                                      fastq,
-                                      threads,
-                                      alignment_fname,
-                                      verbose)
-            
-        for test_Gene_names in locus_list:
-            if base_fname == "genome":
-                if simulation:
-                    region_chr, region_left, region_right = test_Gene_names[0]
-                else:
-                    region_chr, region_left, region_right = test_Gene_names
-                gene = "%s:%d-%d" % (region_chr, region_left, region_right)
-            else:
-                if simulation:
-                    gene = test_Gene_names[0].split('*')[0]
-                else:
-                    gene = test_Gene_names
-                
-            ref_allele = refGenes[gene]
-            ref_seq = Genes[gene][ref_allele]
-            ref_locus = refGene_loci[gene]
-            ref_exons, ref_primary_exons = ref_locus[-2], ref_locus[-1]
-            novel_var_count = 0        
-            gene_vars, gene_var_list = deepcopy(Vars[gene]), deepcopy(Var_list[gene])
-            cur_maxright = -1
-            gene_var_maxrights = {}
-            for var_pos, var_id in gene_var_list:
-                var_type, var_pos, var_data = gene_vars[var_id]
-                if var_type == "deletion":
-                    var_pos = var_pos + int(var_data) - 1
-                cur_maxright = max(cur_maxright, var_pos)
-                gene_var_maxrights[var_id] = cur_maxright
-                    
-            var_count = {}
-            def add_novel_var(gene_vars,
-                              gene_var_list,
-                              novel_var_count,
-                              var_type,
-                              var_pos,
-                              var_data):
-                var_idx = typing_common.lower_bound(gene_var_list, var_pos)
-                while var_idx < len(gene_var_list):
-                    pos_, id_ = gene_var_list[var_idx]
-                    if pos_ > var_pos:
-                        break
-                    if pos_ == var_pos:
-                        type_, _, data_ = gene_vars[id_]
-                        assert type_ != var_type or data_ != var_data
-                        if type_ != var_type:
-                            if var_type == "insertion":
-                                break
-                            elif var_type == "single" and type_ == "deletion":
-                                break
-                        else:
-                            if var_data < data_:
-                                break
-                    var_idx += 1
-                var_id = "nv%d" % novel_var_count
-                assert var_id not in gene_vars
-                gene_vars[var_id] = [var_type, var_pos, var_data]
-                gene_var_list.insert(var_idx, [var_pos, var_id])                
-                return var_id, novel_var_count + 1
-
-            if not os.path.exists(alignment_fname + ".bai"):
-                os.system("samtools index %s" % alignment_fname)
-            # Read alignments
-            alignview_cmd = ["samtools",
-                             "view",
-                             alignment_fname]
-            base_locus = 0
-            if genotype_genome != "":
-                _, chr, left, right = ref_locus[:4]
-                alignview_cmd += ["%s:%d-%d" % (chr, left+1, right+1)]
-                base_locus = left
-
-            if index_type == "graph":
-                alignview_cmd += [ref_allele]
-                mpileup = typing_common.get_mpileup(alignview_cmd,
-                                                    ref_seq,
-                                                    base_locus,
-                                                    gene_vars,
-                                                    allow_discordant)
-
-                if base_fname == "codis":
-                    pair_interdist = typing_common.get_pair_interdist(alignview_cmd,
-                                                                      simulation,
-                                                                      verbose)
-                else:
-                    pair_interdist = None
-
-                bamview_proc = subprocess.Popen(alignview_cmd,
-                                                stdout=subprocess.PIPE,
-                                                stderr=open("/dev/null", 'w'))
-
-                sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
-                alignview_proc = subprocess.Popen(sort_read_cmd,
-                                                  stdin=bamview_proc.stdout,
-                                                  stdout=subprocess.PIPE,
-                                                  stderr=open("/dev/null", 'w'))
-            else:
-                alignview_proc = subprocess.Popen(alignview_cmd,
-                                             stdout=subprocess.PIPE,
-                                             stderr=open("/dev/null", 'w'))
-
-            # List of nodes that represent alleles
-            allele_vars = {}
-            for _, var_id in gene_var_list:
-                if var_id not in Links:
-                    continue
-                allele_list = Links[var_id]
-                for allele_id in allele_list:
-                    if allele_id not in Genes[gene]:
-                        continue
-                    if allele_id not in allele_vars:
-                        allele_vars[allele_id] = [var_id]
-                    else:
-                        allele_vars[allele_id].append(var_id)
-
-            # Extract variants that are within exons
-            exon_vars = get_exonic_vars(gene_vars, ref_exons)
-            primary_exon_vars = get_exonic_vars(gene_vars, ref_primary_exons)
-
-            # Store nodes that represent alleles
-            allele_nodes = {}
-            def create_allele_node(allele_name):
-                if allele_name in allele_nodes:
-                    return allele_nodes[allele_name]
-                if allele_name in allele_vars:
-                    var_ids = allele_vars[allele_name]
-                else:
-                    var_ids = []
-                seq = list(ref_seq)  # sequence that node represents
-                var = ["" for i in range(len(ref_seq))]  # how sequence is related to backbone
-                for var_id in var_ids:
-                    assert var_id in gene_vars
-                    var_type, var_pos, var_data = gene_vars[var_id]
-                    assert var_pos >= 0 and var_pos < len(ref_seq)
-                    if var_type == "single":
-                        seq[var_pos] = var_data
-                        var[var_pos] = var_id
-                    elif var_type == "deletion":
-                        del_len = int(var_data)
-                        assert var_pos + del_len <= len(ref_seq)
-                        seq[var_pos:var_pos + del_len] = ['D'] * del_len
-                        var[var_pos:var_pos + del_len] = [var_id] * del_len
-                    else:
-                        # DK - to be implemented for insertions
-                        assert var_type == "insertion"
-
-                qual = ' ' * len(seq)
-                allele_node = assembly_graph.Node(allele_name,
-                                                  0,
-                                                  seq,
-                                                  qual,
-                                                  var,
-                                                  ref_seq,
-                                                  gene_vars,
-                                                  mpileup,
-                                                  simulation)
-                allele_nodes[allele_name] = allele_node
-                return allele_node
-
-            true_allele_nodes = {}
-            if simulation:
-                for allele_name in test_Gene_names:
-                    true_allele_nodes[allele_name] = create_allele_node(allele_name)
-
-            display_allele_nodes = {}
-            for display_allele in display_alleles:
-                display_allele_nodes[display_allele] = create_allele_node(display_allele)
-
-            # Assembly graph
-            asm_graph = assembly_graph.Graph(ref_seq,
-                                             gene_vars,
-                                             ref_exons,
-                                             ref_primary_exons,
-                                             partial_alleles,
-                                             true_allele_nodes,
-                                             {}, # predicted_allele_nodes, which is empty for now
-                                             display_allele_nodes,
-                                             simulation)
-
-            # Choose allele representives from those that share the same exonic sequences
-            allele_reps, allele_rep_groups = get_rep_alleles(Links, exon_vars)
-            allele_rep_set = set(allele_reps.values())
-
-            # Choose allele representives from those that share the primary exonic sequences
-            primary_exon_allele_reps, primary_exon_allele_rep_groups = get_rep_alleles(Links, primary_exon_vars, allele_rep_set)
-            primary_exon_allele_rep_set = set(primary_exon_allele_reps.values())
-
-            # Sanity check
-            for exon_allele in primary_exon_allele_reps.keys():
-                # DK - debugging purposes
-                if exon_allele not in allele_rep_set:
-                    print exon_allele, allele_reps[exon_allele], exon_allele in primary_exon_allele_reps.keys()
-                    
-                assert exon_allele in allele_rep_set
-                                    
-            # For checking alternative alignments near the ends of alignments
-            Alts_left, Alts_right = typing_common.get_alternatives(ref_seq,
-                                                                   allele_vars,
-                                                                   gene_vars,
-                                                                   gene_var_list,
-                                                                   verbose >= 2)
-
-            def haplotype_alts_list(haplotype_alts, left = True):
-                haplotype_list = []
-                for haplotype in haplotype_alts.keys():
-                    if left:
-                        pos = int(haplotype.split('-')[-1])
-                    else:
-                        pos = int(haplotype.split('-')[0])
-                    haplotype_list.append([pos, haplotype])
-                return sorted(haplotype_list, cmp = lambda a, b: a[0] - b[0])
-
-            Alts_left_list, Alts_right_list = haplotype_alts_list(Alts_left, True), haplotype_alts_list(Alts_right, False)
-
-            # Count alleles
-            Gene_primary_exons_counts, Gene_primary_exons_cmpt = {}, {}
-            Gene_exons_counts, Gene_exons_cmpt = {}, {}
-            Gene_counts, Gene_cmpt = {}, {}
-            num_reads, num_pairs = 0, 0
-
-            # For debugging purposes
-            debug_allele_names = set(test_Gene_names) if simulation and verbose >= 2 else set()
-
-            # Read information
-            prev_read_id = None
-            prev_right_pos = 0
-            prev_lines = []
-            left_read_ids, right_read_ids = set(), set()
-            if index_type == "graph":
-                # nodes for reads
-                read_nodes = []
-                read_vars_list = []
-
-                # 
-                def add_count(count_per_read, ht, add):
-                    if base_fname == "genome" and len(count_per_read) == 1:
-                        for allele in count_per_read.keys():
-                            count_per_read[allele] = add
-                        return
-                    
-                    orig_ht = ht
-                    ht = ht.split('-')
-
-                    assert len(ht) >= 2
-                    left, right = int(ht[0]), int(ht[-1])
-                    assert left <= right
-
-                    ht = ht[1:-1]
-                    alleles = set(Genes[gene].keys()) - set([ref_allele])
-                    for i in range(len(ht)):
-                        var_id = ht[i]
-                        if var_id.startswith("nv") or \
-                           var_id not in Links:
-                            continue
-                        alleles &= set(Links[var_id])
-                    ht = set(ht)
-
-                    tmp_alleles = set()
-                    var_idx = typing_common.lower_bound(gene_var_list, right + 1)
-                    var_idx = min(var_idx, len(gene_var_list) - 1)
-                    while var_idx >= 0:
-                        _, var_id = gene_var_list[var_idx]
-                        if var_id.startswith("nv") or \
-                           var_id in ht or \
-                           var_id not in Links:
-                            var_idx -= 1
-                            continue
-                        if var_id in gene_var_maxrights and gene_var_maxrights[var_id] < left:
-                            break
-                        var_type, var_left, var_data = gene_vars[var_id]
-                        var_right = var_left
-                        if var_type == "deletion":
-                            var_right = var_left + int(var_data) - 1
-                        if (var_left >= left and var_left <= right) or \
-                           (var_right >= left and var_right <= right):
-                            tmp_alleles |= set(Links[var_id])
-                        var_idx -= 1                        
-                    alleles -= tmp_alleles
-                    alleles &= set(count_per_read.keys())
-                    
-                    for allele in alleles:
-                        count_per_read[allele] += add
-
-                    return len(alleles)
-
-                # Identify best pairs
-                def choose_pairs(left_positive_hts, right_positive_hts):
-                    if len(left_positive_hts) > 0 and \
-                       len(right_positive_hts) > 0 and \
-                       max(len(left_positive_hts), len(right_positive_hts)) >= 2:
-                        expected_inter_dist = pair_interdist
-                        """
-                        if simulation:
-                            expected_inter_dist = fragment_len - read_len * 2
-                        """
-                            
-                        best_diff = sys.maxint
-                        picked = []                                
-                        for left_ht_str in left_positive_hts:
-                            left_ht = left_ht_str.split('-')
-                            l_left, l_right = int(left_ht[0]), int(left_ht[-1])
-                            for right_ht_str in right_positive_hts:
-                                right_ht = right_ht_str.split('-')
-                                r_left, r_right = int(right_ht[0]), int(right_ht[-1])
-                                if l_right < r_right:
-                                    inter_dist = r_left - l_right - 1
-                                else:
-                                    inter_dist = l_left - r_right - 1
-
-                                cur_diff = abs(expected_inter_dist - inter_dist)
-                                if best_diff > cur_diff:
-                                    best_diff = cur_diff
-                                    picked = [[left_ht_str, right_ht_str]]
-                                elif best_diff == cur_diff:
-                                    picked.append([left_ht_str, right_ht_str])
-
-                        assert len(picked) > 0
-
-                        left_positive_hts, right_positive_hts = set(), set()
-                        for left_ht_str, right_ht_str in picked:
-                            left_positive_hts.add(left_ht_str)
-                            right_positive_hts.add(right_ht_str)
-
-                    return left_positive_hts, right_positive_hts
-
-                def get_exon_haplotypes(ht, exons):
-                    if len(exons) <= 0:
-                        return []
-                    
-                    debug_ht = deepcopy(ht)
-                    ht = ht.split('-')
-                    assert len(ht) >= 2
-                    ht[0], ht[-1] = int(ht[0]), int(ht[-1])
-                    exon_hts = []
-                    for e_left, e_right in exons:
-                        assert len(ht) >= 2
-                        ht_left, ht_right = ht[0], ht[-1]
-                        if e_left > ht_right or e_right < ht_left:
-                            continue
-
-                        new_ht = deepcopy(ht)
-                        if ht_left < e_left:
-                            split = False
-                            for i in range(1, len(new_ht) - 1):
-                                var_id = new_ht[i]
-                                type, left, data = gene_vars[var_id]
-                                if (type != "deletion" and left >= e_left) or \
-                                   (type == "deletion" and left - 1 >= e_left):
-                                    ht_left = e_left
-                                    new_ht = [ht_left] + new_ht[i:]
-                                    split = True
-                                    break
-                                if type == "deletion":
-                                    right = left + int(data)
-                                    if right >= e_left:
-                                        ht_left = right
-                                        new_ht = [right] + new_ht[i+1:]
-                                        split = True
-                                        break
-                            if not split:
-                                ht_left = e_left
-                                new_ht = [ht_left, ht_right]
-                        assert ht_left >= e_left
-                        if ht_right > e_right:
-                            split = False
-                            for i in reversed(range(1, len(new_ht) - 1)):
-                                var_id = new_ht[i]
-                                type, right, data = gene_vars[var_id]
-                                if type == "deletion":
-                                    right = right + int(data) - 1
-                                if (type != "deletion" and right <= e_right) or \
-                                   (type == "deletion" and right + 1 <= e_right):
-                                    ht_right = e_right
-                                    new_ht = new_ht[:i+1] + [ht_right]
-                                    split = True
-                                    break
-                                if type == "deletion":
-                                    left = right - int(data)
-                                    if left <= e_right:
-                                        ht_right = left
-                                        new_ht = new_ht[:i] + [ht_right]
-                                        split = True
-                                        break
-                            if not split:
-                                ht_right = e_right
-                                new_ht = [ht_left, ht_right]
-
-                        if len(new_ht) == 2:
-                            new_ht = "%d-%d" % (new_ht[0], new_ht[-1])
-                        else:
-                            assert len(new_ht) > 2
-                            new_ht = "%d-%s-%d" % (new_ht[0], '-'.join(new_ht[1:-1]), new_ht[-1])
-                        assert ht_left <= ht_right
-                        exon_hts.append(new_ht)
-
-                    return exon_hts
-
-                # Positive evidence for left and right reads
-                left_positive_hts, right_positive_hts = set(), set()
-                
-                # Cigar regular expression
-                cigar_re = re.compile('\d+\w')
-                for line in alignview_proc.stdout:
-                    line = line.strip()
-                    cols = line.split()
-                    read_id, flag, chr, pos, mapQ, cigar_str = cols[:6]
-                    node_read_id = orig_read_id = read_id
-                    if simulation:
-                        read_id = read_id.split('|')[0]
-                    read_seq, read_qual = cols[9], cols[10]
-                    flag, pos = int(flag), int(pos)
-                    pos -= (base_locus + 1)
-                    if pos < 0:
-                        continue
-
-                    # Unalined?
-                    if flag & 0x4 != 0:
-                        if simulation and verbose >= 2:
-                            print "Unaligned"
-                            print "\t", line
-                        continue
-
-                    # Concordantly mapped?
-                    if flag & 0x2 != 0:
-                        concordant = True
-                    else:
-                        concordant = False
-
-                    NM, Zs, MD, NH = "", "", "", ""
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("Zs"):
-                            Zs = col[5:]
-                        elif col.startswith("MD"):
-                            MD = col[5:]
-                        elif col.startswith("NM"):
-                            NM = int(col[5:])
-                        elif col.startswith("NH"):
-                            NH = int(col[5:])
-
-                    if NM > num_editdist:
-                        continue
-
-                    # Only consider unique alignment
-                    if NH > 1:
-                        continue
-
-                    # Concordantly aligned mate pairs
-                    if not allow_discordant and not concordant:
-                        continue
-
-                    # Left read?
-                    is_left_read = flag & 0x40 != 0
-                    if is_left_read:
-                        if read_id in left_read_ids:
-                            continue
-                        left_read_ids.add(read_id)
-                        if not simulation:
-                            node_read_id += '|L'
-                    else: # Right read?
-                        assert flag & 0x80 != 0
-                        if read_id in right_read_ids:
-                            continue
-                        right_read_ids.add(read_id)
-                        if not simulation:
-                            node_read_id += '|R'
-
-                    if Zs:
-                        Zs_str = Zs
-                        Zs = Zs.split(',')             
-
-                    assert MD != ""
-                    MD_str_pos, MD_len = 0, 0
-                    Zs_pos, Zs_i = 0, 0
-                    for _i in range(len(Zs)):
-                        Zs[_i] = Zs[_i].split('|')
-                        Zs[_i][0] = int(Zs[_i][0])
-                    if Zs_i < len(Zs):
-                        Zs_pos += Zs[Zs_i][0]
-                    read_pos, left_pos = 0, pos
-                    right_pos = left_pos
-                    cigars = cigar_re.findall(cigar_str)
-                    cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-                    cmp_list = []
-                    num_error_correction = 0
-                    likely_misalignment = False
-
-                    # Extract variants w.r.t backbone from CIGAR string
-                    softclip = [0, 0]
-                    for i in range(len(cigars)):
-                        cigar_op, length = cigars[i]
-                        if cigar_op == 'M':
-                            first = True
-                            MD_len_used = 0
-                            cmp_list_i = len(cmp_list)
-                            while True:
-                                if not first or MD_len == 0:
-                                    if MD[MD_str_pos].isdigit():
-                                        num = int(MD[MD_str_pos])
-                                        MD_str_pos += 1
-                                        while MD_str_pos < len(MD):
-                                            if MD[MD_str_pos].isdigit():
-                                                num = num * 10 + int(MD[MD_str_pos])
-                                                MD_str_pos += 1
-                                            else:
-                                                break
-                                        MD_len += num
-                                # Insertion or full match followed
-                                if MD_len >= length:
-                                    MD_len -= length
-                                    if length > MD_len_used:
-                                        cmp_list.append(["match", right_pos + MD_len_used, length - MD_len_used])
-                                    break
-                                first = False
-                                read_base = read_seq[read_pos + MD_len]
-                                MD_ref_base = MD[MD_str_pos]
-                                MD_str_pos += 1
-                                assert MD_ref_base in "ACGT"
-                                if MD_len > MD_len_used:
-                                    cmp_list.append(["match", right_pos + MD_len_used, MD_len - MD_len_used])
-
-                                _var_id = "unknown"
-                                if read_pos + MD_len == Zs_pos and Zs_i < len(Zs):
-                                    assert Zs[Zs_i][1] == 'S'
-                                    _var_id = Zs[Zs_i][2]
-                                    Zs_i += 1
-                                    Zs_pos += 1
-                                    if Zs_i < len(Zs):
-                                        Zs_pos += Zs[Zs_i][0]
-                                else:
-                                    # Search for a known (yet not indexed) variant or a novel variant
-                                    ref_pos = right_pos + MD_len
-                                    var_idx = typing_common.lower_bound(gene_var_list, ref_pos)
-                                    while var_idx < len(gene_var_list):
-                                        var_pos, var_id = gene_var_list[var_idx]
-                                        if var_pos > ref_pos:
-                                            break
-                                        if var_pos == ref_pos:
-                                            var_type, _, var_data = gene_vars[var_id]
-                                            if var_type == "single" and var_data == read_base:
-                                                _var_id = var_id
-                                                break
-                                        var_idx += 1
-
-                                cmp_list.append(["mismatch", right_pos + MD_len, 1, _var_id])
-                                MD_len_used = MD_len + 1
-                                MD_len += 1
-                                # Full match
-                                if MD_len == length:
-                                    MD_len = 0
-                                    break
-
-                            # Correction for sequencing errors and update for cmp_list
-                            if error_correction:
-                                assert cmp_list_i < len(cmp_list)
-                                new_cmp_list, read_seq, _num_error_correction = error_correct(ref_seq,
-                                                                                              read_seq,
-                                                                                              read_pos,
-                                                                                              mpileup,
-                                                                                              gene_vars,
-                                                                                              gene_var_list,
-                                                                                              cmp_list[cmp_list_i:],
-                                                                                              node_read_id == "aHSQ1008:175:C0JVFACXX:5:1109:17665:21583|L")
-                                cmp_list = cmp_list[:cmp_list_i] + new_cmp_list
-                                num_error_correction += _num_error_correction
-
-                        elif cigar_op == 'I':
-                            _var_id = "unknown"
-                            if read_pos == Zs_pos and Zs_i < len(Zs):
-                                assert Zs[Zs_i][1] == 'I'
-                                _var_id = Zs[Zs_i][2]
-                                Zs_i += 1
-                                if Zs_i < len(Zs):
-                                    Zs_pos += Zs[Zs_i][0]
-                            else:
-                                # Search for a known (yet not indexed) variant or a novel variant
-                                var_idx = typing_common.lower_bound(gene_var_list, right_pos)
-                                while var_idx < len(gene_var_list):
-                                    var_pos, var_id = gene_var_list[var_idx]
-                                    if var_pos > right_pos:
-                                        break
-                                    if var_pos == right_pos:
-                                        var_type, _, var_data = gene_vars[var_id]
-                                        if var_type == "insertion" and len(var_data) == length:
-                                            _var_id = var_id
-                                            break
-                                    var_idx += 1                            
-                            cmp_list.append(["insertion", right_pos, length, _var_id])
-                            if 'N' in read_seq[read_pos:read_pos+length]:
-                                likely_misalignment = True
-                                
-                        elif cigar_op == 'D':
-                            if MD[MD_str_pos] == '0':
-                                MD_str_pos += 1
-                            assert MD[MD_str_pos] == '^'
-                            MD_str_pos += 1
-                            while MD_str_pos < len(MD):
-                                if not MD[MD_str_pos] in "ACGT":
-                                    break
-                                MD_str_pos += 1
-                            _var_id = "unknown"
-                            if read_pos == Zs_pos and \
-                               Zs_i < len(Zs) and \
-                               Zs[Zs_i][1] == 'D':
-                                _var_id = Zs[Zs_i][2]
-                                Zs_i += 1
-                                if Zs_i < len(Zs):
-                                    Zs_pos += Zs[Zs_i][0]
-                            else:
-                                # Search for a known (yet not indexed) variant or a novel variant
-                                var_idx = typing_common.lower_bound(gene_var_list, right_pos)
-                                while var_idx < len(gene_var_list):
-                                    var_pos, var_id = gene_var_list[var_idx]
-                                    if var_pos > right_pos:
-                                        break
-                                    if var_pos == right_pos:
-                                        var_type, _, var_data = gene_vars[var_id]
-                                        if var_type == "deletion" and int(var_data) == length:
-                                            _var_id = var_id
-                                            break
-                                    var_idx += 1
-
-                            cmp_list.append(["deletion", right_pos, length, _var_id])
-
-                            # Check if this deletion is artificial alignment
-                            if right_pos < len(mpileup):
-                                del_count, nt_count = 0, 0
-                                for nt, value in mpileup[right_pos][1].items():
-                                    count = value[0]
-                                    if nt == 'D':
-                                        del_count += count
-                                    else:
-                                        nt_count += count
-
-                                # DK - debugging purposes
-                                if base_fname == "hla":
-                                    if del_count * 6 < nt_count: # and nt_count >= 15:
-                                        likely_misalignment = True
-                            
-                        elif cigar_op == 'S':
-                            if i == 0:
-                                softclip[0] = length
-                                Zs_pos += length
-                            else:
-                                assert i + 1 == len(cigars)
-                                softclip[1] = length
-                        else:                    
-                            assert cigar_op == 'N'
-                            assert False
-                            cmp_list.append(["intron", right_pos, length])
-
-                        if cigar_op in "MND":
-                            right_pos += length
-
-                        if cigar_op in "MIS":
-                            read_pos += length                    
-
-                    # Remove softclip in cigar and modify read_seq and read_qual accordingly
-                    if sum(softclip) > 0:
-                        if softclip[0] > 0:
-                            cigars = cigars[1:]
-                            read_seq = read_seq[softclip[0]:]
-                            read_qual = read_qual[softclip[0]:]
-                        if softclip[1] > 0:
-                            cigars = cigars[:-1]
-                            read_seq = read_seq[:-softclip[1]]
-                            read_qual = read_qual[:-softclip[1]]
-
-                        cigar_str = ""
-                        for type, length in cigars:
-                            cigar_str += str(length)
-                            cigar_str += type
-
-                    if sum(softclip) > 0:
-                        continue
-
-                    if right_pos > len(ref_seq):
-                        continue
-
-                    if num_error_correction > max(1, num_editdist):
-                        continue
-                        
-                    if likely_misalignment:
-                        continue
-
-                    # Add novel variants
-                    read_pos = 0
-                    for cmp_i in range(len(cmp_list)):
-                        type_, pos_, length_ = cmp_list[cmp_i][:3]
-                        if type_ != "match":
-                            var_id_ = cmp_list[cmp_i][3]
-                            if var_id_ == "unknown":
-                                add = True
-                                if type_ == "mismatch":
-                                    data_ = read_seq[read_pos]
-                                    if data_ == 'N':
-                                        add = False
-                                elif type_ == "deletion":
-                                    data_ = str(length_)
-                                else:
-                                    assert type_ == "insertion"
-                                    data_ = read_seq[read_pos:read_pos + length_]
-                                if add:
-                                    var_id_, novel_var_count = add_novel_var(gene_vars,
-                                                                             gene_var_list,
-                                                                             novel_var_count,
-                                                                             type_ if type_ != "mismatch" else "single",
-                                                                             pos_,
-                                                                             data_)
-                                    cmp_list[cmp_i][3] = var_id_
-                            if var_id_ != "unknown":
-                                if var_id_ not in var_count:
-                                    var_count[var_id_] = 1
-                                else:
-                                    var_count[var_id_] += 1
-                                
-                        if type_ != "deletion":
-                            read_pos += length_
-
-                    # Count the number of reads aligned uniquely with some constraints
-                    num_reads += 1
-
-                    def add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read, include_alleles = set()):
-                        if len(Gene_count_per_read) <= 0:
-                            return ""
-                        max_count = max(Gene_count_per_read.values())
-                        cur_cmpt = set()
-                        for allele, count in Gene_count_per_read.items():
-                            if count < max_count:
-                                continue
-                            if len(include_alleles) > 0 and allele not in include_alleles:
-                                continue
-                            
-                            cur_cmpt.add(allele)                    
-                            if allele not in Gene_counts:
-                                Gene_counts[allele] = 1
-                            else:
-                                Gene_counts[allele] += 1
-
-                        if len(cur_cmpt) == 0:
-                            return ""
-
-                        if verbose >= 2:
-                            alleles = ["", ""]
-                            allele1_found, allele2_found = False, False
-                            if alleles[0] != "":
-                                for allele, count in Gene_count_per_read.items():
-                                    if count < max_count:
-                                        continue
-                                    if allele == alleles[0]:
-                                        allele1_found = True
-                                    elif allele == alleles[1]:
-                                        allele2_found = True
-                                if allele1_found != allele2_found:
-                                    print >> sys.stderr, alleles[0], Gene_count_per_read[alleles[0]]
-                                    print >> sys.stderr, alleles[1], Gene_count_per_read[alleles[1]]
-                                    if allele1_found:
-                                        print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[0], prev_read_id, max_count, Gene_count_per_read[alleles[1]]))
-                                    else:
-                                        print >> sys.stderr, ("%s\tread_id %s - %d vs. %d]" % (alleles[1], prev_read_id, max_count, Gene_count_per_read[alleles[0]]))
-
-                        cur_cmpt = sorted(list(cur_cmpt))
-                        cur_cmpt = '-'.join(cur_cmpt)
-                        if not cur_cmpt in Gene_cmpt:
-                            Gene_cmpt[cur_cmpt] = 1
-                        else:
-                            Gene_cmpt[cur_cmpt] += 1
-
-                        return cur_cmpt
-
-                    if read_id != prev_read_id:
-                        if prev_read_id != None:
-                            num_pairs += 1
-                            # DK - needs more test
-                            #      Several alleles go over 100 bps
-                            """
-                            if base_fname == "codis" and gene == "D18S51":
-                                left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts)
-                            """
-
-                            for positive_ht in left_positive_hts | right_positive_hts:
-                                primary_exon_hts = get_exon_haplotypes(positive_ht, ref_primary_exons)
-                                for exon_ht in primary_exon_hts:
-                                    add_count(Gene_primary_exons_count_per_read, exon_ht, 1)
-                                exon_hts = get_exon_haplotypes(positive_ht, ref_exons)
-                                for exon_ht in exon_hts:
-                                    add_count(Gene_exons_count_per_read, exon_ht, 1)
-                                add_count(Gene_count_per_read, positive_ht, 1)
-
-                            # DK - debugging purposes
-                            if prev_read_id.startswith("NS500497:33:HY32TBGXX:3:13511:0:56517876") and False:
-                                print prev_read_id, left_positive_hts, right_positive_hts
-                                max_count = max(Gene_primary_exons_count_per_read.values())
-                                for allele, count in Gene_primary_exons_count_per_read.items():
-                                    if allele not in primary_exon_allele_rep_set:
-                                        continue
-                                    if count < max_count:
-                                        continue
-                                    print allele, count
-
-                            # DK - debugging purposes
-                            """
-                            debug_allele_id = "TH01*10"
-                            assert debug_allele_id in Gene_gen_count_per_read
-                            debug_max_read_count = max(Gene_gen_count_per_read.values())
-                            debug_read_count = Gene_gen_count_per_read[debug_allele_id]
-                            if debug_read_count < debug_max_read_count:
-                                print prev_read_id, debug_read_count, debug_max_read_count, Gene_gen_count_per_read
-                                print "\t", left_positive_hts, right_positive_hts
-                                None
-                            if prev_read_id == "HSQ1008:175:C0JVFACXX:5:1109:17665:21583":
-                                for line in prev_lines:
-                                    print line
-                                print "left_positive_hts :", left_positive_hts
-                                print "right_positive_hts:", right_positive_hts
-                                print "exon:", debug_read_count, "max:", debug_max_read_count
-                                print "gen:", Gene_gen_count_per_read[debug_allele_id], "max:", max(Gene_gen_count_per_read.values())
-
-                                for allele_id, count in Gene_count_per_read.items():
-                                    if count == debug_max_read_count:
-                                        None
-                                        # print "allele max:", allele_id, count
-                                # sys.exit(1)
-                                None
-                            """                                
-
-                            cur_cmpt, cur_cmpt_gen = "", ""
-                            if base_fname == "hla":
-                                cur_primary_exons_cmpt = add_stat(Gene_primary_exons_cmpt, Gene_primary_exons_counts, Gene_primary_exons_count_per_read, primary_exon_allele_rep_set)
-
-                                # DK - debugging purposes
-                                # for cmpt, count in Gene_primary_exons_count_per_read.items():
-                                if cur_primary_exons_cmpt.find("A*24:145") != -1 and cur_primary_exons_cmpt.find("A*24:02:01") == -1:
-                                    print prev_read_id
-                                    print cur_primary_exons_cmpt
-        
-
-            
-                                cur_exons_cmpt = add_stat(Gene_exons_cmpt, Gene_exons_counts, Gene_exons_count_per_read, allele_rep_set)
-                                cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read)
-                            else:
-                                cur_cmpt = add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read)
-                            for read_id_, read_id_i, read_node in read_nodes:
-                                asm_graph.add_node(read_id_,
-                                                   read_id_i,
-                                                   read_node,
-                                                   simulation)
-                            read_nodes, read_var_list = [], []
-                            if simulation and \
-                               verbose >= 2 and \
-                               base_fname in ["hla", "codis"]:
-                                cur_cmpt = cur_cmpt.split('-') if cur_cmpt != "" else set()
-                                cur_cmpt_gen = cur_cmpt_gen.split('-') if cur_cmpt_gen != "" else set()
-                                show_debug = (partial and cur_cmpt != "" and not set(cur_cmpt) & set(test_Gene_names)) or \
-                                              (not partial and cur_cmpt_gen != "" and not set(cur_cmpt_gen) & set(test_Gene_names))
-                                              
-                                if show_debug:
-                                    print "%s are chosen instead of %s" % (cur_cmpt if partial else cur_cmpt_gen, '-'.join(test_Gene_names))
-                                    for prev_line in prev_lines:
-                                        print "\t", prev_line
-
-                            prev_lines = []
-
-                        left_positive_hts, right_positive_hts = set(), set()                        
-                        Gene_primary_exons_count_per_read, Gene_exons_count_per_read, Gene_count_per_read = {}, {}, {}
-                        for allele in Gene_names[gene]:
-                            if allele.find("BACKBONE") != -1:
-                                continue
-                            if base_fname == "genome" and allele.find("GRCh38") != -1:
-                                continue
-                            if allele in primary_exon_allele_rep_set:
-                                Gene_primary_exons_count_per_read[allele] = 0
-                            if allele in allele_rep_set:
-                                Gene_exons_count_per_read[allele] = 0
-                            Gene_count_per_read[allele] = 0
-
-                    prev_lines.append(line)
-
-                    # Remove mismatches due to unknown or novel variants
-                    cmp_list2 = []
-                    for cmp in cmp_list:
-                        cmp = deepcopy(cmp)
-                        type, pos, length = cmp[:3]
-                        if type == "match":
-                            if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match":
-                                cmp_list2[-1][2] += length
-                            else:
-                                cmp_list2.append(cmp)
-                        elif type == "mismatch" and \
-                             (cmp[3] == "unknown" or cmp[3].startswith("nv")):
-                            if len(cmp_list2) > 0 and cmp_list2[-1][0] == "match":
-                                cmp_list2[-1][2] += 1
-                            else:
-                                cmp_list2.append(["match", pos, 1])
-                        else:
-                            cmp_list2.append(cmp)
-                            
-                    cmp_list_left, cmp_list_right, cmp_left_alts, cmp_right_alts = \
-                    typing_common.identify_ambigious_diffs(ref_seq,
-                                                           gene_vars,
-                                                           Alts_left,
-                                                           Alts_right,
-                                                           Alts_left_list,
-                                                           Alts_right_list,
-                                                           cmp_list2,
-                                                           verbose,
-                                                           orig_read_id.startswith("HSQ1009:126:D0UUYACXX:4:2212:9787:80992#"))  # debug?
-
-                    mid_ht = []
-                    for cmp in cmp_list2[cmp_list_left:cmp_list_right+1]:
-                        type = cmp[0]
-                        if type not in ["mismatch", "deletion", "insertion"]:
-                            continue                            
-                        var_id = cmp[3]
-                        mid_ht.append(var_id)
-
-                    for l in range(len(cmp_left_alts)):
-                        left_ht = cmp_left_alts[l].split('-')
-                        left_ht += mid_ht
-                        for r in range(len(cmp_right_alts)):
-                            right_ht = cmp_right_alts[r].split('-')
-                            ht = left_ht + right_ht
-                            if len(ht) <= 0:
-                                continue
-                            ht_str = '-'.join(ht)
-                            if is_left_read:
-                                left_positive_hts.add(ht_str)
-                            else:
-                                right_positive_hts.add(ht_str)
-
-                    # DK - debugging purposes
-                    DK_debug = False
-                    if orig_read_id.startswith("30|R!"):
-                        DK_debug = True
-                        print line
-                        print cmp_list
-                        print "positive hts:", left_positive_hts, right_positive_hts
-                        print "cmp_list [%d, %d]" % (cmp_list_left, cmp_list_right)
-
-                    if assembly:
-                        # Construct multiple candidate realignments for CODIS
-                        cmp_llist = []
-                        hts = left_positive_hts if is_left_read else right_positive_hts
-                        assert len(hts) > 0
-                        for ht in hts:
-                            cmp_list = []
-                            read_pos = 0
-                            vars_ = ht.split('-')
-                            left_ = int(vars_[0])
-                            vars_ = vars_[1:]
-                            for var_i in range(len(vars_)):
-                                var_id = vars_[var_i]
-                                # ref_seq, read_seq
-                                if var_i == len(vars_) - 1:
-                                    right_ = int(var_id)
-                                else:
-                                    var_type, var_pos, var_data = gene_vars[var_id]
-                                    right_ = var_pos - 1
-                                    
-                                for pos in range(left_, right_ + 1):
-                                    if read_seq[read_pos] != ref_seq[pos]:
-                                        if left_ < pos:
-                                            cmp_list.append(["match", left_, pos - left_])
-                                        cmp_list.append(["mismatch", pos, 1, "unknown"])
-                                        left_ = pos + 1
-                                    read_pos += 1                                    
-                                if left_ <= right_:
-                                    cmp_list.append(["match", left_, right_ - left_ + 1])
-                                    
-                                if var_i == len(vars_) - 1:
-                                    left_ = right_ + 1
-                                    break
-
-                                if var_type == "single":
-                                    cmp_list.append(["mismatch", var_pos, 1, var_id])
-                                    left_ = var_pos + 1
-                                    read_pos += 1
-                                elif var_type == "deletion":
-                                    del_len = int(var_data)
-                                    cmp_list.append(["deletion", var_pos, del_len, var_id])
-                                    left_ = var_pos + del_len                                    
-                                else:
-                                    assert var_type == "insertion"
-                                    cmp_list.append(["insertion", var_pos, len(var_data), var_id])
-                                    left_ = var_pos
-                                    read_pos += len(var_data)
-                                    
-                            assert len(cmp_list) > 0
-                            cmp_llist.append(cmp_list)
-
-                        for cmp_list_i in range(len(cmp_llist)):
-                            # Node
-                            cmp_list = cmp_llist[cmp_list_i]
-                            read_node_pos, read_node_seq, read_node_qual, read_node_var = -1, [], [], []
-                            read_vars = []
-                            ref_pos, read_pos = cmp_list[0][1], 0
-                            cmp_i = 0
-                            while cmp_i < len(cmp_list):
-                                cmp = cmp_list[cmp_i]
-                                type, length = cmp[0], cmp[2]
-                                if type in ["match", "mismatch"]:
-                                    if read_node_pos < 0:
-                                        read_node_pos = ref_pos
-                                if type == "match":
-                                    read_node_seq += list(read_seq[read_pos:read_pos+length])
-                                    read_node_qual += list(read_qual[read_pos:read_pos+length])
-                                    read_node_var += ([''] * length)
-                                    read_pos += length
-                                elif type == "mismatch":
-                                    var_id = cmp[3]
-                                    read_base, qual = read_seq[read_pos], read_qual[read_pos]
-                                    read_node_seq += [read_base]
-                                    read_node_qual += [qual]
-                                    read_node_var.append(var_id)
-                                    read_pos += 1
-                                elif type == "deletion":
-                                    var_id = cmp[3]
-                                    del_len = length
-                                    read_node_seq += (['D'] * del_len)
-                                    read_node_qual += ([''] * del_len)
-                                    if len(read_node_seq) > len(read_node_var):
-                                        assert len(read_node_seq) == len(read_node_var) + del_len
-                                        read_node_var += ([var_id] * del_len)
-                                elif type == "insertion":
-                                    var_id = cmp[3]
-                                    ins_len = length
-                                    ins_seq = read_seq[read_pos:read_pos+ins_len]
-                                    read_node_seq += ["I%s" % nt for nt in ins_seq]
-                                    read_node_qual += list(read_qual[read_pos:read_pos+ins_len])
-                                    read_node_var += ([var_id] * ins_len)                                        
-                                    read_pos += length
-                                else:
-                                    assert type == "intron"
-                                cmp_i += 1
-
-                            read_nodes.append([node_read_id,
-                                               cmp_list_i,
-                                               assembly_graph.Node(node_read_id,
-                                                                   read_node_pos,
-                                                                   read_node_seq,
-                                                                   read_node_qual,
-                                                                   read_node_var,
-                                                                   ref_seq,
-                                                                   gene_vars,
-                                                                   mpileup,
-                                                                   simulation)])
-
-                    prev_read_id = read_id
-                    prev_right_pos = right_pos
-
-                if prev_read_id != None:
-                    num_pairs += 1
-                    if base_fname == "codis" and gene == "D18S51":
-                        left_positive_hts, right_positive_hts = choose_pairs(left_positive_hts, right_positive_hts)                            
-                    for positive_ht in left_positive_hts | right_positive_hts:
-                        primary_exon_hts = get_exon_haplotypes(positive_ht, ref_primary_exons)
-                        for exon_ht in primary_exon_hts:
-                            add_count(Gene_primary_exons_count_per_read, exon_ht, 1)
-                        exon_hts = get_exon_haplotypes(positive_ht, ref_exons)
-                        for exon_ht in exon_hts:
-                            add_count(Gene_exons_count_per_read, exon_ht, 1)
-                        add_count(Gene_count_per_read, positive_ht, 1)
-
-                    if base_fname == "hla":
-                        add_stat(Gene_primary_exons_cmpt, Gene_primary_exons_counts, Gene_primary_exons_count_per_read, primary_exon_allele_rep_set)
-                        add_stat(Gene_exons_cmpt, Gene_exons_counts, Gene_exons_count_per_read, allele_rep_set)
-                    add_stat(Gene_cmpt, Gene_counts, Gene_count_per_read)
-                    for read_id_, read_id_i, read_node in read_nodes:
-                        asm_graph.add_node(read_id_,
-                                           read_id_i,
-                                           read_node,
-                                           simulation)
-                    read_nodes, read_var_list = [], []
-
-                if num_reads <= 0:
-                    continue
-
-                for f_ in [sys.stderr, report_file]:
-                    print >> f_, "\t\t\t%d reads and %d pairs are aligned" % (num_reads, num_pairs)
-                
-            else:
-                assert index_type == "linear"
-                def add_alleles(alleles):
-                    if not allele in Gene_counts:
-                        Gene_counts[allele] = 1
-                    else:
-                        Gene_counts[allele] += 1
-
-                    cur_cmpt = sorted(list(alleles))
-                    cur_cmpt = '-'.join(cur_cmpt)
-                    if not cur_cmpt in Gene_cmpt:
-                        Gene_cmpt[cur_cmpt] = 1
-                    else:
-                        Gene_cmpt[cur_cmpt] += 1
-
-                prev_read_id, prev_AS = None, None
-                alleles = set()
-                for line in alignview_proc.stdout:
-                    cols = line[:-1].split()
-                    read_id, flag, allele = cols[:3]
-                    flag = int(flag)
-                    if flag & 0x4 != 0:
-                        continue
-                    if not allele.startswith(gene):
-                        continue
-                    if allele.find("BACKBONE") != -1:
-                        continue
-
-                    AS = None
-                    for i in range(11, len(cols)):
-                        col = cols[i]
-                        if col.startswith("AS"):
-                            AS = int(col[5:])
-                    assert AS != None
-                    if read_id != prev_read_id:
-                        if alleles:
-                            if aligner == "hisat2" or \
-                                    (aligner == "bowtie2" and len(alleles) < 10):
-                                add_alleles(alleles)
-                            alleles = set()
-                        prev_AS = None
-                    if prev_AS != None and AS < prev_AS:
-                        continue
-                    prev_read_id = read_id
-                    prev_AS = AS
-                    alleles.add(allele)
-
-                if alleles:
-                    add_alleles(alleles)
-
-            Gene_counts = [[allele, count] for allele, count in Gene_counts.items()]
-            def Gene_count_cmp(a, b):
-                if a[1] != b[1]:
-                    return b[1] - a[1]
-                assert a[0] != b[0]
-                if a[0] < b[0]:
-                    return -1
-                else:
-                    return 1
-            Gene_counts = sorted(Gene_counts, cmp=Gene_count_cmp)
-            for count_i in range(len(Gene_counts)):
-                count = Gene_counts[count_i]
-                if simulation:
-                    found = False
-                    for test_Gene_name in test_Gene_names:
-                        if count[0] == test_Gene_name:
-                            for f_ in [sys.stderr, report_file]:
-                                print >> f_, "\t\t\t*** %d ranked %s (count: %d)" % (count_i + 1, test_Gene_name, count[1])
-                            found = True
-                    if count_i < 5 and not found:
-                        for f_ in [sys.stderr, report_file]:
-                            print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                else:
-                    for f_ in [sys.stderr, report_file]:
-                        print >> f_, "\t\t\t\t%d %s (count: %d)" % (count_i + 1, count[0], count[1])
-                    if count_i >= 9:
-                        break
-            for f_ in [sys.stderr, report_file]:
-                print >> f_
-
-            # Calculate the abundance of representative alleles on exonic sequences
-            if base_fname == "hla":
-                perform_typing_primary_exon = False
-                # Incorporate representive alleles for primary exons (experimental feature)
-                if perform_typing_primary_exon:
-                    Gene_prob = primary_exon_prob = typing_common.single_abundance(Gene_primary_exons_cmpt)
-                    primary_exon_alleles = set()
-                    primary_exon_prob_sum = 0.0
-                    for prob_i in range(len(primary_exon_prob)):
-                        allele, prob = primary_exon_prob[prob_i][:2]
-                        if len(primary_exon_allele_rep_groups[allele]) <= 1:
-                            continue
-                        primary_exon_prob_sum += prob
-                        primary_exon_alleles |= set(primary_exon_allele_rep_groups[allele])
-
-                    # Incorporate representative alleles for exons
-                    if len(primary_exon_alleles) > 0:
-                        Gene_exons_cmpt2 = {}
-                        for cmpt, value in Gene_exons_cmpt.items():
-                            cmpt2 = []
-                            for allele in cmpt.split('-'):
-                                if allele in primary_exon_alleles:
-                                    cmpt2.append(allele)
-                            if len(cmpt2) == 0:
-                                continue
-                            cmpt2 = '-'.join(cmpt2)
-                            if cmpt2 not in Gene_exons_cmpt2:
-                                Gene_exons_cmpt2[cmpt2] = value
-                            else:
-                                Gene_exons_cmpt2[cmpt2] += value
-                        exon_prob = typing_common.single_abundance(Gene_exons_cmpt2,
-                                                                   remove_low_abundance_alleles)
-                        exon_prob2 = {}
-                        for allele, prob in primary_exon_prob:
-                            if allele not in primary_exon_alleles:
-                                exon_prob2[allele] = prob
-                        for allele, prob in exon_prob:
-                            exon_prob2[allele] = prob * primary_exon_prob_sum
-                        exon_prob = [[allele, prob] for allele, prob in exon_prob2.items()]
-                        Gene_prob = exon_prob = sorted(exon_prob, cmp=typing_common.Gene_prob_cmp)
-                else:
-                    # Incorporate representative alleles for exons
-                    Gene_prob =  exon_prob = typing_common.single_abundance(Gene_exons_cmpt,
-                                                                            remove_low_abundance_alleles)
-
-                exon_alleles = set()
-                exon_prob_sum = 0.0
-                for prob_i in range(len(exon_prob)):
-                    allele, prob = exon_prob[prob_i][:2]
-                    if prob_i >= 10 and prob < 0.03:
-                        break
-                    if len(allele_rep_groups[allele]) <= 1:
-                        continue
-
-                    exon_prob_sum += prob
-                    exon_alleles |= set(allele_rep_groups[allele])
-
-                # Incorporate full-length alleles, non-representative alleles
-                if len(exon_alleles) > 0:
-                    Gene_cmpt2 = {}
-                    for cmpt, value in Gene_cmpt.items():
-                        cmpt2 = []
-                        for allele in cmpt.split('-'):
-                            if allele in exon_alleles:
-                                cmpt2.append(allele)
-                        if len(cmpt2) == 0:
-                            continue
-                        cmpt2 = '-'.join(cmpt2)
-                        if cmpt2 not in Gene_cmpt2:
-                            Gene_cmpt2[cmpt2] = value
-                        else:
-                            Gene_cmpt2[cmpt2] += value
-                    Gene_cmpt = Gene_cmpt2
-                    Gene_prob = typing_common.single_abundance(Gene_cmpt,
-                                                               True,
-                                                               Gene_lengths[gene])
-
-                    Gene_combined_prob = {}
-                    for allele, prob in exon_prob:
-                        if allele not in exon_alleles:
-                            Gene_combined_prob[allele] = prob
-
-                    for allele, prob in Gene_prob:
-                        Gene_combined_prob[allele] = prob * exon_prob_sum
-                                            
-                    Gene_prob = [[allele, prob] for allele, prob in Gene_combined_prob.items()]
-                    Gene_prob = sorted(Gene_prob, cmp=typing_common.Gene_prob_cmp)
-            else:
-                if len(Gene_cmpt.keys()) <= 1:
-                    Gene_prob = []
-                    if len(Gene_cmpt.keys()) == 1:
-                        Gene_prob = [[Gene_cmpt.keys()[0], 1.0]]
-                else:
-                    Gene_prob = typing_common.single_abundance(Gene_cmpt)
-
-            if index_type == "graph" and assembly:
-                allele_node_order = []
-                predicted_allele_nodes = {}
-                for allele_name, prob in Gene_prob:
-                    if prob < 0.1: # abundance of 10%
-                        break
-                    predicted_allele_nodes[allele_name] = create_allele_node(allele_name)
-                    allele_node_order.append([allele_name, prob])
-                    if len(predicted_allele_nodes) >= 2:
-                        break
-                asm_graph.predicted_allele_nodes = predicted_allele_nodes
-                asm_graph.allele_node_order = allele_node_order
-                asm_graph.calculate_coverage()
-
-                # Start drawing assembly graph
-                asm_graph.begin_draw("%s.%s.%s" % (output_base, base_fname, gene))
-
-                # Draw assembly graph
-                begin_y = asm_graph.draw(0, "a. Read alignment")
-                begin_y += 200
-                
-                # Apply De Bruijn graph
-                asm_graph.guided_DeBruijn(assembly_verbose)
-
-                # Draw assembly graph
-                begin_y = asm_graph.draw(begin_y, "b. Asssembly")
-                begin_y += 200
-
-                # Draw assembly graph
-                asm_graph.nodes = asm_graph.nodes2
-                asm_graph.to_node, asm_graph.from_node = {}, {}
-                begin_y = asm_graph.draw(begin_y, "c. Assembly with known alleles")
-
-                # End drawing assembly graph
-                asm_graph.end_draw()
-
-                # Compare two alleles
-                if simulation and len(test_Gene_names) == 2:
-                    allele_name1, allele_name2 = test_Gene_names
-                    print >> sys.stderr, allele_name1, "vs.", allele_name2
-                    asm_graph.print_node_comparison(asm_graph.true_allele_nodes)
-
-                def compare_alleles(vars1, vars2, print_output = True):
-                    skip = True
-                    var_i, var_j = 0, 0
-                    exon_i = 0
-                    allele_seq, mismatches = list(ref_seq), 0
-                    while var_i < len(vars1) and var_j < len(vars2):
-                        cmp_var_id, node_var_id = vars1[var_i], vars2[var_j]
-                        cmp_var, node_var = gene_vars[cmp_var_id], gene_vars[node_var_id]
-
-                        min_pos = min(cmp_var[1], node_var[1])
-                        cmp_var_in_exon, node_var_in_exon = False, False
-                        while exon_i < len(ref_exons):
-                            exon_left, exon_right = ref_exons[exon_i]
-                            if min_pos <= exon_right:
-                                if cmp_var[1] >= exon_left and cmp_var[1] <= exon_right:
-                                    cmp_var_in_exon = True
-                                else:
-                                    cmp_var_in_exon = False
-                                if node_var[1] >= exon_left and node_var[1] <= exon_right:
-                                    node_var_in_exon = True
-                                else:
-                                    node_var_in_exon = False                                
-                                break
-                            exon_i += 1
-                        
-                        if cmp_var_id == node_var_id:
-                            skip = False
-                            if print_output:
-                                if cmp_var_in_exon:
-                                    print >> sys.stderr, "\033[94mexon%d\033[00m" % (exon_i + 1),
-                                print >> sys.stderr, cmp_var_id, cmp_var, "\t\t\t", mpileup[cmp_var[1]]
-                            var_i += 1; var_j += 1
-
-                            var_type, var_pos, var_data = cmp_var
-                            if var_type == "single":
-                                allele_seq[var_pos] = var_data
-                            elif var_type == "deletion":
-                                allele_seq[var_pos:var_pos+int(var_data)] = '.' * int(var_data)
-                            else:
-                                assert var_type == "insertion"
-                            continue
-                        if cmp_var[1] <= node_var[1]:
-                            if not skip:
-                                if (var_i > 0 and var_i + 1 < len(vars1)) or cmp_var[0] != "deletion":
-                                    if print_output:
-                                        if cmp_var_in_exon:
-                                            for f_ in [sys.stderr, report_file]:
-                                                print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1),
-                                        for f_ in [sys.stderr, report_file]:
-                                            print >> f_, "***", cmp_var_id, cmp_var, "==", "\t\t\t", mpileup[cmp_var[1]]
-                                    mismatches += 1
-                            var_i += 1
-                        else:
-                            if print_output:
-                                if node_var_in_exon:
-                                    for f_ in [sys.stderr, report_file]:
-                                        print >> f_, "\033[94mexon%d\033[00m" % (exon_i + 1),
-                                for f_ in [sys.stderr, report_file]:
-                                    print >> f_, "*** ==", node_var_id, node_var, "\t\t\t", mpileup[node_var[1]]
-                            mismatches += 1
-                            var_j += 1
-
-                    allele_exons = ref_exons[:]
-                    allele_seq = ''.join(allele_seq)
-                    del_counts = []
-                    for del_i in range(len(allele_seq)):
-                        del_count = 0 if del_i == 0 else del_counts[-1]
-                        if allele_seq[del_i] == '.':
-                            del_count += 1
-                        del_counts.append(del_count)
-                    for exon_i in range(len(allele_exons)):
-                        exon_left, exon_right = allele_exons[exon_i]
-                        exon_left -= del_counts[exon_left]
-                        exon_right -= del_counts[exon_right]
-                        allele_exons[exon_i] = [exon_left, exon_right]
-                        
-                    allele_seq = allele_seq.replace('.', '')
-                    return allele_seq, allele_exons, mismatches
-                    
-                tmp_nodes = asm_graph.nodes
-                print >> sys.stderr, "Number of tmp nodes:", len(tmp_nodes)
-                count = 0
-                for id, node in tmp_nodes.items():
-                    count += 1
-                    if count > 10:
-                        break
-                    node_vars = node.get_var_ids()
-                    node.print_info(); print >> sys.stderr
-                    if node.id in asm_graph.to_node:
-                        for id2, at in asm_graph.to_node[node.id]:
-                            print >> sys.stderr, "\tat %d ==> %s" % (at, id2)
-
-                    if simulation:
-                        cmp_Gene_names = test_Gene_names
-                    else:
-                        cmp_Gene_names = [allele_name for allele_name, _ in allele_node_order]
-                        
-                    alleles, cmp_vars, max_common = [], [], -sys.maxint
-                    for cmp_Gene_name in cmp_Gene_names:
-                        tmp_vars = allele_nodes[cmp_Gene_name].get_var_ids(node.left, node.right)
-                        tmp_common = len(set(node_vars) & set(tmp_vars))
-                        tmp_common -= len(set(node_vars) | set(tmp_vars))
-                        if max_common < tmp_common:
-                            max_common = tmp_common
-                            alleles = [[cmp_Gene_name, tmp_vars]]
-                        elif max_common == tmp_common:
-                            alleles.append([cmp_Gene_name, tmp_vars])
-
-                    for allele_name, cmp_vars in alleles:
-                        for f_ in [sys.stderr, report_file]:
-                            print >> f_, "vs.", allele_name
-                            allele_seq, allele_exons, allele_mm = compare_alleles(cmp_vars, node_vars)
-                            print >> f_, "\t\tallele sequence (%d bps):" % len(allele_seq), allele_seq
-                            print >> f_, "\t\texons (zero-based offset):", allele_exons
-
-                    print >> sys.stderr
-                    print >> sys.stderr
-
-
-            # Identify alleles that perfectly or closesly match assembled alleles
-            for node_name, node in asm_graph.nodes.items():
-                vars = set(node.get_var_ids())
-
-                max_allele_names, max_common = [], -sys.maxint
-                for allele_name, vars2 in allele_vars.items():
-                    vars2 = set(vars2)
-                    tmp_common = len(vars & vars2) - len(vars | vars2)
-                    if tmp_common > max_common:
-                        max_common = tmp_common
-                        max_allele_names = [allele_name]                        
-                    elif tmp_common == max_common:
-                        max_allele_names.append(allele_name)
-
-                for f_ in [sys.stderr, report_file]:
-                    print >> f_, "Genomic:", node_name
-                    node_vars = node.get_var_ids()
-                    min_mismatches = sys.maxint
-                    for max_allele_name in max_allele_names:
-                        cmp_vars = allele_vars[max_allele_name]
-                        cmp_vars = sorted(cmp_vars, cmp=lambda a, b: int(a[2:]) - int(b[2:]))
-                        print_output = False
-                        _, _, tmp_mismatches = compare_alleles(cmp_vars, node_vars, print_output)
-                        print >> f_, "\t\t%s:" % max_allele_name, max_common, tmp_mismatches
-                        if tmp_mismatches < min_mismatches:
-                            min_mismatches = tmp_mismatches
-                    if min_mismatches > 0:
-                        print >> f_, "Novel allele"
-                    else:
-                        print >> f_, "Known allele"
-
-            """
-            allele_exon_vars = {}
-            for allele_name, vars in allele_vars.items():
-                allele_exon_vars[allele_name] = set(vars) & exon_vars
-
-            for node_name, node in asm_graph.nodes.items():
-                vars = []
-                for left, right in ref_exons:
-                    vars += node.get_var_ids(left, right)
-                vars = set(vars) & exon_vars
-
-                max_allele_names, max_common = [], -sys.maxint
-                for allele_name, vars2 in allele_exon_vars.items():
-                    tmp_common = len(vars & vars2) - len(vars | vars2)
-                    if tmp_common > max_common:
-                        max_common = tmp_common
-                        max_allele_names = [allele_name]                        
-                    elif tmp_common == max_common:
-                        max_allele_names.append(allele_name)
-
-                for f_ in [sys.stderr, report_file]:
-                    print >> f_, "Exonic:", node_name
-                    for max_allele_name in max_allele_names:
-                        print >> f_, "\t\t%s:" % max_allele_name, max_common
-            """
-
-            if simulation:
-                success = [False for i in range(len(test_Gene_names))]
-                found_list = [False for i in range(len(test_Gene_names))]
-            for prob_i in range(len(Gene_prob)):
-                prob = Gene_prob[prob_i]
-                if prob[1] < 0.01:
-                    break
-                found = False
-                _allele_rep = prob[0]
-                """
-                if partial and exonic_only:
-                    _fields = _allele_rep.split(':')
-                    if len(_fields) == 4:
-                        _allele_rep = ':'.join(_fields[:-1])
-                """
-                if simulation:
-                    for name_i in range(len(test_Gene_names)):
-                        test_Gene_name = test_Gene_names[name_i]
-                        if prob[0] == test_Gene_name:
-                            rank_i = prob_i
-                            while rank_i > 0:
-                                if prob == Gene_prob[rank_i - 1][1]:
-                                    rank_i -= 1
-                                else:
-                                    break
-                            for f_ in [sys.stderr, report_file]:
-                                print >> f_, "\t\t\t*** %d ranked %s (abundance: %.2f%%)" % (rank_i + 1, test_Gene_name, prob[1] * 100.0)
-                            if rank_i < len(success):
-                                success[rank_i] = True
-                            found_list[name_i] = True
-                            found = True
-                    # DK - for debugging purposes
-                    if not False in found_list and prob_i >= 10:
-                        break
-                if not found:
-                    for f_ in [sys.stderr, report_file]:
-                        print >> f_, "\t\t\t\t%d ranked %s (abundance: %.2f%%)" % (prob_i + 1, _allele_rep, prob[1] * 100.0)
-
-                    if best_alleles and prob_i < 2:
-                        for f_ in [sys.stderr, report_file]:
-                            print >> f_, "SingleModel %s (abundance: %.2f%%)" % (_allele_rep, prob[1] * 100.0)
-
-                # DK - debugging purposes
-                """
-                # ref_allele_node_ = create_allele_node("A*03:01:01:01")
-                ref_allele_node_ = create_allele_node("DQA1*01:02:01:01")
-                cmp_node_ = create_allele_node(_allele_rep)
-                count_ = 0
-                for i_ in range(len(ref_allele_node_.seq)):
-                    if assembly_graph.get_major_nt(ref_allele_node_.seq[i_]) != assembly_graph.get_major_nt(cmp_node_.seq[i_]):
-                        count_ += 1
-                print "\t\t\t\t\tDK:", count_, len(ref_allele_node_.seq)
-                vars1, vars2 = allele_vars["DQA1*01:02:01:01"], allele_vars[_allele_rep]
-                print "\t\t\t\t\tDK:", set(vars1) - set(vars2), set(vars2) - set(vars1)
-                """
-
-                if not simulation and prob_i >= 9:
-                    break
-                if prob_i >= 19:
-                    break
-            print >> sys.stderr         
-
-            if simulation and not False in success:
-                aligner_type = "%s %s" % (aligner, index_type)
-                if not aligner_type in test_passed:
-                    test_passed[aligner_type] = 1
-                else:
-                    test_passed[aligner_type] += 1
-
-        if not keep_alignment and remove_alignment_file:
-            os.system("rm %s*" % (alignment_fname))
-
-    report_file.close()
-    if simulation:
-        return test_passed
-
-    
-"""
-"""
-def read_backbone_alleles(genotype_genome, refGene_loci, Genes):
-    for gene_name in refGene_loci:
-        allele_name, chr, left, right = refGene_loci[gene_name][:4]
-        seq_extract_cmd = ["samtools",
-                           "faidx",
-                           "%s.fa" % genotype_genome,
-                           "%s:%d-%d" % (chr, left+1, right+1)]
-
-        length = right - left + 1
-        proc = subprocess.Popen(seq_extract_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w'))
-        seq = ""
-        for line in proc.stdout:
-            line = line.strip()
-            if line.startswith('>'):
-                continue
-            seq += line
-        assert len(seq) == length
-        assert gene_name not in Genes
-        Genes[gene_name] = {}
-        Genes[gene_name][allele_name] = seq
-
-        
-"""
-"""
-def read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes):
-    for gene_name in Genes:
-        # Assert there is only one allele per gene, which is a backbone allele
-        assert len(Genes[gene_name]) == 1
-        backbone_allele_name, backbone_seq = Genes[gene_name].items()[0]
-        gene_vars, gene_var_list = Vars[gene_name], Var_list[gene_name]
-        allele_vars = {}
-        for _, var_id in gene_var_list:
-            if var_id not in Links:
-                continue
-            for allele_name in Links[var_id]:
-                if allele_name not in allele_vars:
-                    allele_vars[allele_name] = []
-                allele_vars[allele_name].append(var_id)
-
-        for allele_name, vars in allele_vars.items():
-            seq = ""
-            prev_pos = 0
-            for var_id in vars:
-                type, pos, data = gene_vars[var_id]
-                assert prev_pos <= pos
-                if pos > prev_pos:
-                    seq += backbone_seq[prev_pos:pos]
-                if type == "single":
-                    prev_pos = pos + 1
-                    seq += data
-                elif type == "deletion":
-                    prev_pos = pos + int(data)
-                else:
-                    assert type == "insertion"
-                    seq += data
-                    prev_pos = pos
-            if prev_pos < len(backbone_seq):
-                seq += backbone_seq[prev_pos:]
-            Genes[gene_name][allele_name] = seq
-
-        if len(Genes[gene_name]) <= 1:
-            Genes[gene_name]["%s*GRCh38" % gene_name] = backbone_seq
-            
-    
-"""
-"""
-def read_Gene_alleles(fname, Genes):
-    for line in open(fname):
-        if line.startswith(">"):
-            allele_name = line.strip().split()[0][1:]
-            gene_name = allele_name.split('*')[0]
-            if not gene_name in Genes:
-                Genes[gene_name] = {}
-            if not allele_name in Genes[gene_name]:
-                Genes[gene_name][allele_name] = ""
-        else:
-            Genes[gene_name][allele_name] += line.strip()
-    return Genes
-
-
-"""
-"""
-def read_Gene_vars(fname):
-    Vars, Var_list = {}, {}
-    for line in open(fname):
-        var_id, var_type, allele, pos, data = line.strip().split('\t')
-        pos = int(pos)
-        gene = allele.split('*')[0]
-        if not gene in Vars:
-            Vars[gene] = {}
-            assert not gene in Var_list
-            Var_list[gene] = []
-            
-        assert not var_id in Vars[gene]
-        Vars[gene][var_id] = [var_type, pos, data]
-        Var_list[gene].append([pos, var_id])
-        
-    for gene, in_var_list in Var_list.items():
-        Var_list[gene] = sorted(in_var_list)
-
-    return Vars, Var_list
-
-
-"""
-"""
-def read_Gene_vars_genotype_genome(fname, refGene_loci):
-    loci = {}
-    for gene, values in refGene_loci.items():
-        allele_name, chr, left, right = values[:4]
-        if chr not in loci:
-            loci[chr] = []
-        loci[chr].append([allele_name, left, right])
-        
-    Vars, Var_list = {}, {}
-    for line in open(fname):
-        var_id, var_type, var_chr, pos, data = line.strip().split('\t')
-        if var_chr not in loci:
-            continue
-        pos = int(pos)
-        found = False
-        for allele_name, left, right in loci[var_chr]:
-            if pos >= left and pos <= right:
-                found = True
-                break
-        if not found:
-            continue
-        
-        gene = allele_name.split('*')[0]
-        if not gene in Vars:
-            Vars[gene] = {}
-            assert not gene in Var_list
-            Var_list[gene] = []
-            
-        assert not var_id in Vars[gene]
-        Vars[gene][var_id] = [var_type, pos - left, data]
-        Var_list[gene].append([pos - left, var_id])
-        
-    for gene, in_var_list in Var_list.items():
-        Var_list[gene] = sorted(in_var_list)
-
-    return Vars, Var_list
-
-
-"""
-"""
-def read_Gene_links(fname):
-    Links = {}
-    for line in open(fname):
-        var_id, alleles = line.strip().split('\t')
-        alleles = alleles.split()
-        assert not var_id in Links
-        Links[var_id] = alleles
-
-    return Links
-
-
-"""
-"""
-def genotyping_locus(base_fname,
-                     locus_list,
-                     genotype_genome,
-                     only_locus_list,
-                     partial,
-                     aligners,
-                     read_fname,
-                     fastq,
-                     alignment_fname,
-                     threads,
-                     simulate_interval,
-                     read_len,
-                     fragment_len,
-                     best_alleles,
-                     num_editdist,
-                     perbase_errorrate,
-                     perbase_snprate,
-                     skip_fragment_regions,
-                     assembly,
-                     output_base,
-                     error_correction,
-                     keep_alignment,
-                     discordant,
-                     type_primary_exons,
-                     remove_low_abundance_alleles,
-                     display_alleles,
-                     verbose,
-                     assembly_verbose,
-                     debug_instr):
-    simulation = (read_fname == [] and alignment_fname == "")
-    if genotype_genome == "":
-        if not os.path.exists("hisatgenotype_db"):
-            typing_common.clone_hisatgenotype_database()
-
-        # Download human genome and HISAT2 index
-        HISAT2_fnames = ["grch38",
-                         "genome.fa",
-                         "genome.fa.fai"]    
-        if not typing_common.check_files(HISAT2_fnames):
-            typing_common.download_genome_and_index()
-
-    # Check if the pre-existing files (hla*) are compatible with the current parameter setting
-    if genotype_genome != "":
-        if os.path.exists("%s.locus" % base_fname):
-            left = 0
-            Gene_genes = []
-            BACKBONE = False
-            for line in open("%s.locus" % base_fname):
-                Gene_name = line.strip().split()[0]
-                if Gene_name.find("BACKBONE") != -1:
-                    BACKBONE = True
-                Gene_gene = Gene_name.split('*')[0]
-                Gene_genes.append(Gene_gene)
-            delete_hla_files = False
-            if not BACKBONE:
-                delete_hla_files = True
-            if len(locus_list) == 0:
-                locus_list = Gene_genes
-            if not set(locus_list).issubset(set(Gene_genes)):
-                delete_hla_files = True
-            if delete_hla_files:
-                os.system("rm %s*" % base_fname)
-
-    # Extract variants, backbone sequence, and other sequeces  
-    if genotype_genome != "":
-        genome_fnames = [genotype_genome + ".fa",
-                         genotype_genome + ".fa.fai",
-                         genotype_genome + ".locus",
-                         genotype_genome + ".snp",
-                         genotype_genome + ".index.snp",
-                         genotype_genome + ".haplotype",
-                         genotype_genome + ".link",
-                         genotype_genome + ".clnsig",
-                         genotype_genome + ".coord",
-                         genotype_genome + ".allele",
-                         genotype_genome + ".partial"]
-        for i in range(8):
-            genome_fnames.append(genotype_genome + ".%d.ht2" % (i+1))
-
-        if not typing_common.check_files(genome_fnames):
-            print >> sys.stderr, "Error: some of the following files are not available:", ' '.join(genome_fnames)
-            sys.exit(1)
-    else:
-        typing_common.extract_database_if_not_exists(base_fname,
-                                                     only_locus_list,
-                                                     30,              # inter_gap
-                                                     50,              # intra_gap
-                                                     partial,
-                                                     verbose >= 1)        
-        for aligner, index_type in aligners:
-            typing_common.build_index_if_not_exists(base_fname,
-                                                    aligner,
-                                                    index_type,
-                                                    threads,
-                                                    verbose >= 1)
-
-    # Read alleles
-    alleles = set()
-    if genotype_genome != "":
-        for line in open("%s.allele" % genotype_genome):
-            family, allele_name = line.strip().split('\t')
-            if family == base_fname:
-                alleles.add(allele_name)
-    else:
-        for line in open("%s.allele" % base_fname):
-            alleles.add(line.strip())
-
-    # Read partial alleles
-    partial_alleles = set()
-    if genotype_genome != "":
-        for line in open("%s.partial" % genotype_genome):
-            family, allele_name = line.strip().split('\t')
-            if family == base_fname:
-                partial_alleles.add(allele_name)
-
-    else:
-        for line in open("%s.partial" % base_fname):
-            partial_alleles.add(line.strip())
-
-    # Read alleles (names and sequences)
-    refGenes, refGene_loci = {}, {}
-    if base_fname == "genome":
-        for chr, left, right in locus_list:
-            region_name = "%s:%d-%d" % (chr, left, right)
-            refGenes[region_name] = region_name
-            refGene_loci[region_name] = [region_name, chr, left, right, []]
-    else:
-        for line in open("%s.locus" % (genotype_genome if genotype_genome != "" else base_fname)):
-            fields = line.strip().split()
-            if genotype_genome != "" and base_fname != fields[0].lower():
-                continue
-            if genotype_genome != "":
-                _, Gene_name, chr, left, right, exon_str, strand = fields
-            else:
-                Gene_name, chr, left, right, _, exon_str, strand = fields
-            Gene_gene = Gene_name.split('*')[0]
-            assert not Gene_gene in refGenes
-            refGenes[Gene_gene] = Gene_name
-            left, right = int(left), int(right)
-            exons, primary_exons = [], []
-            for exon in exon_str.split(','):
-                primary = exon.endswith('p')
-                if primary:
-                    exon = exon[:-1]
-                exon_left, exon_right = exon.split('-')
-                exon_left, exon_right = int(exon_left), int(exon_right)
-                exons.append([exon_left, exon_right])
-                if primary:
-                    primary_exons.append([exon_left, exon_right])
-            refGene_loci[Gene_gene] = [Gene_name, chr, left, right, exons, primary_exons]
-    Genes = {}
-    if len(locus_list) == 0:
-        locus_list = refGene_loci.keys()
-
-    # Read variants, and link information
-    if genotype_genome:
-        Vars, Var_list = read_Gene_vars_genotype_genome("%s.snp" % genotype_genome, refGene_loci)
-        Links = read_Gene_links("%s.link" % genotype_genome)
-    else:
-        Vars, Var_list = read_Gene_vars("%s.snp" % base_fname)
-        Links = read_Gene_links("%s.link" % base_fname)
-
-    # Some loci may have only one allele such as AMELX and AMELY
-    for gene_name in refGene_loci.keys():
-        if gene_name in Vars:
-            continue
-        Vars[gene_name], Var_list[gene_name], Links[gene_name] = {}, [], {}        
-
-    # Read allele sequences
-    if genotype_genome != "":
-        read_backbone_alleles(genotype_genome, refGene_loci, Genes)
-        read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes)        
-    else:
-        read_Gene_alleles(base_fname + "_backbone.fa", Genes)
-        read_Gene_alleles_from_vars(Vars, Var_list, Links, Genes)
-
-    # alleles corresponding to backbones
-    for allele in alleles:
-        locus = allele.split('*')[0]
-        assert locus in Genes
-        if allele not in Genes[locus]:
-            Genes[locus][allele] = Genes[locus]["%s*BACKBONE" % locus]
-
-    # Sanity Check
-    if os.path.exists(base_fname + "_backbone.fa") and \
-       os.path.exists(base_fname + "_sequences.fa"):
-        Genes2 = {}
-        read_Gene_alleles(base_fname + "_backbone.fa", Genes2)
-        read_Gene_alleles(base_fname + "_sequences.fa", Genes2)
-        for gene_name, alleles in Genes.items():
-            assert gene_name in Genes2
-            for allele_name, allele_seq in alleles.items():
-                assert allele_name in Genes2[gene_name]
-                allele_seq2 = Genes2[gene_name][allele_name]
-                assert allele_seq == allele_seq2
-
-    # alleles names
-    Gene_names = {}
-    for Gene_gene, data in Genes.items():
-        Gene_names[Gene_gene] = list(data.keys())
-
-    # allele lengths
-    Gene_lengths = {}
-    for Gene_gene, Gene_alleles in Genes.items():
-        Gene_lengths[Gene_gene] = {}
-        for allele_name, seq in Gene_alleles.items():
-            Gene_lengths[Gene_gene][allele_name] = len(seq)
-
-    # Test typing
-    test_list = []
-    if simulation:
-        basic_test, pair_test = True, False
-        if debug_instr and "pair" in debug_instr:
-            basic_test, pair_test = False, True
-
-        test_passed = {}
-        test_list = []
-        genes = list(set(locus_list) & set(Gene_names.keys()))
-        if basic_test:
-            for gene in genes:
-                Gene_gene_alleles = Gene_names[gene]
-                for allele in Gene_gene_alleles:
-                    if allele.find("BACKBONE") != -1:
-                        continue
-                    test_list.append([[allele]])
-                random.shuffle(test_list)
-        if pair_test:
-            test_size = 200
-            allele_count = 2
-            for test_i in range(test_size):
-                test_pairs = []
-                for gene in genes:
-                    Gene_gene_alleles = []
-
-                    for allele in Gene_names[gene]:
-                        if allele.find("BACKBONE") != -1:
-                            continue
-
-                        if "full" in debug:
-                            if allele in partial_alleles:
-                                continue
-
-                        Gene_gene_alleles.append(allele)
-                    nums = [i for i in range(len(Gene_gene_alleles))]
-                    random.shuffle(nums)
-                    test_pairs.append(sorted([Gene_gene_alleles[nums[i]] for i in range(allele_count)]))
-                test_list.append(test_pairs)
-
-        if "test_list" in debug_instr:
-            test_list = [[debug_instr["test_list"].split('-')]]
-
-        for test_i in range(len(test_list)):
-            if "test_id" in debug_instr:
-                test_ids = debug_instr["test_id"].split('-')
-                if str(test_i + 1) not in test_ids:
-                    continue
-
-            print >> sys.stderr, "Test %d" % (test_i + 1), str(datetime.now())
-            test_locus_list = test_list[test_i]
-            num_frag_list = typing_common.simulate_reads(Genes,
-                                                         base_fname,
-                                                         test_locus_list,
-                                                         Vars,
-                                                         Links,
-                                                         simulate_interval,
-                                                         read_len,
-                                                         fragment_len,
-                                                         perbase_errorrate,
-                                                         perbase_snprate,
-                                                         skip_fragment_regions)
-
-            assert len(num_frag_list) == len(test_locus_list)
-            for i_ in range(len(test_locus_list)):
-                test_Gene_names = test_locus_list[i_]
-                num_frag_list_i = num_frag_list[i_]
-                assert len(num_frag_list_i) == len(test_Gene_names)
-                for j_ in range(len(test_Gene_names)):
-                    test_Gene_name = test_Gene_names[j_]
-                    gene = test_Gene_name.split('*')[0]
-                    test_Gene_seq = Genes[gene][test_Gene_name]
-                    seq_type = "partial" if test_Gene_name in partial_alleles else "full"
-                    print >> sys.stderr, "\t%s - %d bp (%s sequence, %d pairs)" % (test_Gene_name, len(test_Gene_seq), seq_type, num_frag_list_i[j_])
-
-            if "single-end" in debug_instr:
-                read_fname = ["%s_input_1.fa" % base_fname]
-            else:
-                read_fname = ["%s_input_1.fa" % base_fname, "%s_input_2.fa" % base_fname]
-
-            fastq = False
-            tmp_test_passed = typing(simulation,
-                                     base_fname,
-                                     test_locus_list,
-                                     genotype_genome,
-                                     partial,
-                                     partial_alleles,
-                                     refGenes,
-                                     Genes,                       
-                                     Gene_names,
-                                     Gene_lengths,
-                                     refGene_loci,
-                                     Vars,
-                                     Var_list,
-                                     Links,
-                                     aligners,
-                                     num_editdist,
-                                     assembly,
-                                     output_base,
-                                     error_correction,
-                                     keep_alignment,
-                                     discordant,
-                                     type_primary_exons,
-                                     remove_low_abundance_alleles,
-                                     display_alleles,
-                                     fastq,
-                                     read_fname,
-                                     alignment_fname,
-                                     num_frag_list,
-                                     read_len,
-                                     fragment_len,
-                                     threads,
-                                     best_alleles,
-                                     verbose,
-                                     assembly_verbose)
-
-            for aligner_type, passed in tmp_test_passed.items():
-                if aligner_type in test_passed:
-                    test_passed[aligner_type] += passed
-                else:
-                    test_passed[aligner_type] = passed
-
-                print >> sys.stderr, "\t\tPassed so far: %d/%d (%.2f%%)" % (test_passed[aligner_type], test_i + 1, (test_passed[aligner_type] * 100.0 / (test_i + 1)))
-
-
-        for aligner_type, passed in test_passed.items():
-            print >> sys.stderr, "%s:\t%d/%d passed (%.2f%%)" % (aligner_type, passed, len(test_list), passed * 100.0 / len(test_list))
-    
-    else: # With real reads or BAMs
-        if base_fname == "genome":
-            print >> sys.stderr, "\t", locus_list
-        else:
-            print >> sys.stderr, "\t", ' '.join(locus_list)
-        typing(simulation,
-               base_fname,
-               locus_list,
-               genotype_genome,
-               partial,
-               partial_alleles,
-               refGenes,
-               Genes,                       
-               Gene_names,
-               Gene_lengths,
-               refGene_loci,
-               Vars,
-               Var_list,
-               Links,
-               aligners,
-               num_editdist,
-               assembly,
-               output_base,
-               error_correction,
-               keep_alignment,
-               discordant,
-               type_primary_exons,
-               remove_low_abundance_alleles,
-               display_alleles,
-               fastq,
-               read_fname,
-               alignment_fname,
-               [],
-               read_len,
-               fragment_len,
-               threads,
-               best_alleles,
-               verbose,
-               assembly_verbose)
-
-
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='hisatgenotype_locus')
-    parser.add_argument("--base", "--base-fname",
-                        dest="base_fname",
-                        type=str,
-                        default="hla",
-                        help="base filename for backbone sequence, variants, and linking info (default: hla)")
-    parser.add_argument("--locus-list",
-                        dest="locus_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of genes (default: empty, all genes)")
-    parser.add_argument("--genotype-genome",
-                        dest="genotype_genome",
-                        type=str,
-                        default="",
-                        help="Base name for genotype genome, which the program will use instead of region-based small indexes (default: empty)")
-    parser.add_argument("-f", "--fasta",
-                        dest='fastq',
-                        action='store_false',
-                        help='FASTA format')
-    parser.add_argument("-U",
-                        dest="read_fname_U",
-                        type=str,
-                        default="",
-                        help="filename for single-end reads")
-    parser.add_argument("-1",
-                        dest="read_fname_1",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")
-    parser.add_argument("-2",
-                        dest="read_fname_2",
-                        type=str,
-                        default="",
-                        help="filename for paired-end reads")    
-    parser.add_argument("--alignment",
-                        dest="alignment_fname",
-                        type=str,
-                        default="",
-                        help="BAM file name")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument('--no-partial',
-                        dest='partial',
-                        action='store_false',
-                        help='Include partial alleles (e.g. A_nuc.fasta)')
-    parser.add_argument("--aligner-list",
-                        dest="aligners",
-                        type=str,
-                        default="hisat2.graph",
-                        help="A comma-separated list of aligners such as hisat2.graph,hisat2.linear,bowtie2.linear (default: hisat2.graph)")
-    parser.add_argument("--simulate-interval",
-                        dest="simulate_interval",
-                        type=int,
-                        default=10,
-                        help="Reads simulated at every these base pairs (default: 10)")
-    parser.add_argument("--read-len",
-                        dest="read_len",
-                        type=int,
-                        default=100,
-                        help="Length of simulated reads (default: 100)")
-    parser.add_argument("--fragment-len",
-                        dest="fragment_len",
-                        type=int,
-                        default=350,
-                        help="Length of fragments (default: 350)")
-    parser.add_argument("--best-alleles",
-                        dest="best_alleles",
-                        action='store_true',
-                        help="")
-    parser.add_argument("--random-seed",
-                        dest="random_seed",
-                        type=int,
-                        default=1,
-                        help="A seeding number for randomness (default: 1)")
-    parser.add_argument("--num-editdist",
-                        dest="num_editdist",
-                        type=int,
-                        default=2,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
-    parser.add_argument("--perbase-errorrate",
-                        dest="perbase_errorrate",
-                        type=float,
-                        default=0.0,
-                        help="Per basepair error rate in percentage when simulating reads (default: 0.0)")
-    parser.add_argument("--perbase-snprate",
-                        dest="perbase_snprate",
-                        type=float,
-                        default=0.0,
-                        help="Per basepair SNP rate in percentage when simulating reads (default: 0.0)")
-    parser.add_argument("--skip-fragment-regions",
-                        dest="skip_fragment_regions",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of regions from which no reads originate, e.g., 500-600,1200-1400 (default: None).")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument('--verbose-level',
-                        dest='verbose_level',
-                        type=int,
-                        default=0,
-                        help='also print some statistics to stderr (default: 0)')
-    parser.add_argument("--debug",
-                        dest="debug",
-                        type=str,
-                        default="",
-                        help="e.g., test_id:10,read_id:10000,basic_test")
-    parser.add_argument("--output-base", "--assembly-base",
-                        dest="output_base",
-                        type=str,
-                        default="assembly_graph",
-                        help="base file name (default: assembly_graph)")
-    parser.add_argument("--assembly",
-                        dest="assembly",
-                        action="store_true",
-                        help="Perform assembly")
-    parser.add_argument("--no-error-correction",
-                        dest="error_correction",
-                        action="store_false",
-                        help="Correct sequencing errors")
-    parser.add_argument("--keep-alignment",
-                        dest="keep_alignment",
-                        action="store_true",
-                        help="Keep alignment file")
-    parser.add_argument("--only-locus-list",
-                        dest="only_locus_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of genes (default: empty, all genes)")
-    parser.add_argument("--discordant",
-                        dest="discordant",
-                        action="store_true",
-                        help="Allow discordantly mapped pairs or singletons")
-    parser.add_argument("--type-primary-exons",
-                        dest="type_primary_exons",
-                        action="store_true",
-                        help="Look at primary exons first")
-    parser.add_argument("--keep-low-abundance-alleles",
-                        dest="remove_low_abundance_alleles",
-                        action="store_false",
-                        help="Do not remove alleles with low abundance while performing typing")
-    parser.add_argument("--assembly-verbose",
-                        dest="assembly_verbose",
-                        action="store_true",
-                        help="Output intermediate assembly information")
-    parser.add_argument("--display-alleles",
-                        dest="display_alleles",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of alleles to display in HTML (default: empty)")
-
-    args = parser.parse_args()
-    if args.locus_list == "":
-        locus_list = []
-    else:
-        locus_list = args.locus_list.split(',')
-        if args.base_fname == "genome":
-            assert ':' in args.locus_list
-            for i in range(len(locus_list)):
-                assert ':' in locus_list[i] and '-' in locus_list[i]
-                chr, coord = locus_list[i].split(':')
-                left, right = coord.split('-')
-                locus_list[i] = [chr, int(left), int(right)]
-
-    if args.only_locus_list == "":
-        only_locus_list = []
-    else:
-        locus_list = only_locus_list = args.only_locus_list.split(',')
-    if args.aligners == "":
-        print >> sys.stderr, "Error: --aligners must be non-empty."
-        sys.exit(1)    
-    args.aligners = args.aligners.split(',')
-    for i in range(len(args.aligners)):
-        args.aligners[i] = args.aligners[i].split('.')
-    if args.read_fname_U != "":
-        args.read_fname = [args.read_fname_U]
-    elif args.read_fname_1 != "" or args.read_fname_2 != "":
-        if args.read_fname_1 == "" or args.read_fname_2 == "":
-            print >> sys.stderr, "Error: please specify both -1 and -2."
-            sys.exit(1)
-        args.read_fname = [args.read_fname_1, args.read_fname_2]
-    else:
-        args.read_fname = []
-    if args.alignment_fname != "" and \
-            not os.path.exists(args.alignment_fname):
-        print >> sys.stderr, "Error: %s doesn't exist." % args.alignment_fname
-        sys.exit(1)
-
-    if args.verbose and args.verbose_level == 0:
-        args.verbose_level = 1
-        
-    debug = {}
-    if args.debug != "":
-        for item in args.debug.split(','):
-            if ':' in item:
-                fields = item.split(':')
-                assert len(fields) >= 2
-                key, value = fields[0], ':'.join(fields[1:])
-                debug[key] = value
-            else:
-                debug[item] = 1
-
-    if not args.partial:
-        print >> sys.stderr, "Warning: --no-partial should be used for debugging purpose only."
-
-    if args.read_len * 2 > args.fragment_len:
-        print >> sys.stderr, "Warning: fragment might be too short (%d)" % (args.fragment_len)
-
-    skip_fragment_regions = []
-    if args.skip_fragment_regions != "":
-        prev_left, prev_right = -1, -1
-        for region in args.skip_fragment_regions.split(','):
-            left, right = region.split('-')
-            left, right = int(left), int(right)
-            assert left < right
-            assert prev_right < left
-            prev_left, prev_right = left, right
-            skip_fragment_regions.append([left, right])
-
-    if args.display_alleles == "":
-        display_alleles = []
-    else:
-        display_alleles = args.display_alleles.split(',')
-
-    random.seed(args.random_seed)
-    genotyping_locus(args.base_fname,
-                     locus_list,
-                     args.genotype_genome,
-                     only_locus_list,
-                     args.partial,
-                     args.aligners,
-                     args.read_fname,
-                     args.fastq,
-                     args.alignment_fname,
-                     args.threads,
-                     args.simulate_interval,
-                     args.read_len,
-                     args.fragment_len,
-                     args.best_alleles,
-                     args.num_editdist,
-                     args.perbase_errorrate,
-                     args.perbase_snprate,
-                     skip_fragment_regions,
-                     args.assembly,
-                     args.output_base,
-                     args.error_correction,
-                     args.keep_alignment,
-                     args.discordant,
-                     args.type_primary_exons,
-                     args.remove_low_abundance_alleles,
-                     display_alleles,
-                     args.verbose_level,
-                     args.assembly_verbose,
-                     debug)
-
diff --git a/hisatgenotype_modules/__init__.py b/hisatgenotype_modules/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/hisatgenotype_modules/hisatgenotype_assembly_graph.py b/hisatgenotype_modules/hisatgenotype_assembly_graph.py
deleted file mode 100755
index 16794f40..00000000
--- a/hisatgenotype_modules/hisatgenotype_assembly_graph.py
+++ /dev/null
@@ -1,1902 +0,0 @@
-#!/usr/bin/env python
-
-import sys
-import math, random
-from datetime import datetime, date, time
-from collections import deque
-from copy import deepcopy
-
-
-#
-def get_major_nt(nt_dic):
-    nt = ''
-    max_count = 0
-    for tmp_nt, tmp_value in nt_dic.items():
-        tmp_count, tmp_var_id = tmp_value
-        if len(tmp_nt) == 1:
-            assert tmp_nt in "ACGTDN"
-        else:
-            assert len(tmp_nt) == 2 and tmp_nt[0] == 'I' and tmp_nt[1] in "ACGT"
-        if tmp_count > max_count:
-            max_count = tmp_count
-            nt = tmp_nt
-    if len(nt) == 1:
-        assert nt in "ACGTDN"
-    else:
-        assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT"
-    return nt                
-
-
-#
-def match_score(nt_dic1, nt_dic2):
-    sum_1 = sum([count for count, _ in nt_dic1.values()])
-    sum_2 = sum([count for count, _ in nt_dic2.values()])
-    total1, total2 = sum_1 * 2.0, sum_2 * 2.0
-    best = 0.0
-    for nt in "ACGT":
-        if nt not in nt_dic1 or nt not in nt_dic2:
-            continue
-        tmp_best = nt_dic1[nt][0] / total1 + nt_dic2[nt][0] / total2
-        if tmp_best > best:
-            best = tmp_best
-    return best
-
-
-#
-def get_ungapped_seq(seq):
-    ungapped_seq = []
-    for i in range(len(seq)):
-        nt_dic = seq[i]
-        nt = get_major_nt(nt_dic)
-        if nt == 'D':
-            continue
-        ungapped_seq.append(nt_dic)
-    return ungapped_seq
-
-
-#
-def get_ungapped_seq_pos(seq, pos):
-    tot_del_len, tot_ins_len = 0, 0
-    for i in range(len(seq)):
-        nt_dic = seq[i]
-        nt = get_major_nt(nt_dic)
-        if nt == 'D':
-            tot_del_len += 1
-        elif nt[0] == 'I':
-            tot_ins_len += 1
-        if i - tot_ins_len == pos:
-            return pos - tot_del_len
-    return -1
-
-
-# Get mate node id
-#  HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|L to HSQ1008:141:D0CC8ACXX:3:2304:4780:36964|R or vice versa
-def get_mate_node_id(node_id):
-    node_id2, end = node_id.split('|')
-    if end == 'L':
-        end = 'R'
-    else:
-        end = 'L'
-    node_id2 = '|'.join([node_id2, end])
-    return node_id2
-
-
-
-class Node:
-    # Initialize
-    def __init__(self,
-                 id,
-                 left,
-                 seq,
-                 qual,
-                 var,
-                 ref_seq,
-                 ref_vars,
-                 mpileup,
-                 simulation):
-        self.next = [] # list of next nodes
-
-        if simulation:
-            id = id.split('_')[0]
-        self.id = id # Node ID
-        self.left = left # starting position
-
-        # sequence that node represents
-        #   with information about how the sequence is related to backbone
-        assert len(seq) == len(var)
-        assert len(seq) == len(qual)
-        self.seq = []
-        self.ins_len = 0
-        for s in range(len(seq)):
-            nt = seq[s]
-            if len(nt) == 1:
-                assert nt in "ACGTDN"
-            else:
-                assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT"
-                self.ins_len += 1                
-            var_id = var[s]
-            self.seq.append({nt : [1, var_id]})
-        self.qual = []
-        for q in qual:
-            if q != '':
-                self.qual.append(max(0, ord(q) / 10 - 3))
-            else:
-                self.qual.append(0)
-
-        self.right = self.left + len(seq) - 1 - self.ins_len
-
-        self.read_ids = set([id])
-        self.mate_ids = set([id.split('|')[0]])
-
-        self.calculate_avg_cov()
-
-        self.ref_seq = ref_seq
-        self.ref_vars = ref_vars
-
-        self.mpileup = mpileup
-
-        
-    # Check how compatible allele is in regard to read or pair
-    def compatible_with_rnode(self, rnode):
-        assert False
-        assert rnode.left + len(rnode.seq) <= len(self.seq)
-        score = 0
-        for i in range(len(rnode.seq)):
-            allele_bp = self.seq[rnode.left + i]
-            read_bp = rnode.seq[i]
-            if allele_bp == read_bp:
-                score += 1
-
-        return float(score) / len(rnode.seq)
-
-
-    # Check how nodes overlap with each other without considering deletions
-    def overlap_with(self, other, vars, skipN = False, debug = False):
-        assert self.left <= other.left
-        if self.right < other.left:
-            return -1, -1
-        seq = get_ungapped_seq(self.seq)
-        other_seq = get_ungapped_seq(other.seq)
-        add_mm = len(self.mate_ids & other.mate_ids)
-        i_left = get_ungapped_seq_pos(self.seq, other.left - self.left)
-        for i in range(i_left - 5, i_left + 6):
-            max_mm = 0.012 * (len(seq) - i) # 1 mismatch per 83 bases
-            tmp_mm = 0.0
-            for j in range(len(other_seq)):
-                if i + j >= len(seq):
-                    break
-                nt_dic, other_nt_dic = seq[i+j], other_seq[j]
-                nt, other_nt = get_major_nt(nt_dic), get_major_nt(other_nt_dic)
-                mismatch = 0.0
-                if skipN and (nt == 'N' or other_nt == 'N'):
-                    mismatch = 0.0
-                elif nt != other_nt:
-                    mismatch = 1.0 - match_score(seq[i+j], other_seq[j])
-                    
-                    # Higher penalty for mismatches in variants
-                    nt_var, other_nt_var = nt_dic[nt][1], other_nt_dic[other_nt][1]
-                    if nt_var != other_nt_var:
-                        mismatch = 5.0
-                        adjust = min(1.0, nt_dic[nt][0] / self.get_avg_cov()) * \
-                                 min(1.0, other_nt_dic[other_nt][0] / other.get_avg_cov())
-                        mismatch *= adjust
-                        if mismatch < 1.0:
-                            mismatch = 1.0
-
-                assert mismatch >= 0.0
-                tmp_mm += mismatch
-                if tmp_mm > max_mm:
-                    break
-
-            if debug:
-                print "at %d (%d) with overlap of %d and mismatch of %.2f" % (i, self.left + i, j, tmp_mm)
-
-            if tmp_mm <= max_mm:
-                return i, min(len(seq) - i, len(other_seq)), tmp_mm
-                
-        return -1, -1, sys.maxint
-
-    
-    # Combine two nodes with considering deletions
-    def combine_with(self, other):
-        # DK - debugging purposes
-        if self.left > other.left:
-            self.print_info()
-            other.print_info()
-            return
-        
-        assert self.left <= other.left
-
-        # Merge two sequences
-        assert len(other.seq) > 0 and 'D' not in other.seq[0].keys()
-        j = 0        
-        # Merge the overlapped parts
-        if self.right >= other.left:
-            overlap, ins_len = False, 0
-            for i in range(len(self.seq)):
-                nt_dic = self.seq[i]
-                nt = get_major_nt(nt_dic)
-                if nt.startswith('I'):
-                    ins_len += 1
-                if i == other.left - self.left + ins_len:
-                    overlap = True
-                    break
-            assert overlap
-            new_seq = self.seq[:i]
-            while i < len(self.seq) and j < len(other.seq):
-                nt_dic, nt_dic2 = self.seq[i], other.seq[j]
-                for nt, value in nt_dic2.items():
-                    count, var_id = value
-                    if nt in nt_dic:
-                        nt_dic[nt][0] += count
-                        # if nt != 'D':
-                        #    assert nt_dic[nt][1] == var_id
-                    else:
-                        nt_dic[nt] = [count, var_id]
-                new_seq.append(nt_dic)
-                i += 1
-                j += 1
-            # this node contains the other node
-            if i < len(self.seq):
-                new_seq += self.seq[i:]
-        # Fill in the gap between the two nodes if exists
-        else:
-            new_seq = self.seq[:]
-            sum_1 = sum([count for count, _ in self.seq[-1].values()])
-            sum_2 = sum([count for count, _ in other.seq[0].values()])
-            flank_cov = (sum_1 + sum_2) / 2.0
-            for k in range(other.left - self.right - 1):
-                ref_nt_dic = self.mpileup[k + 1 + self.right][1]
-                nt_dic = {}
-                # Fill in the gap with Ns for now
-                if len(ref_nt_dic) == 0 or True:
-                    nt_dic = {'N' : [1, ""]}
-                else:
-                    weight = flank_cov / max(1.0, sum([count for count, _ in ref_nt_dic.values()]))
-                    for nt, value in ref_nt_dic.items():
-                        count, var_id = value
-                        nt_dic[nt] = [count * weight, var_id]
-                new_seq.append(nt_dic)
-
-        # Append the rest of the other sequence to it
-        if j < len(other.seq):
-            new_seq += deepcopy(other.seq[j:])
-        self.read_ids |= other.read_ids
-        self.mate_ids |= other.mate_ids
-
-        self.seq = new_seq
-        self.ins_len = 0
-        for i in range(len(self.seq)):
-            nt_dic = self.seq[i]
-            nt = get_major_nt(nt_dic)
-            if nt[0] == 'I':
-                self.ins_len += 1
-        self.right = self.left + len(self.seq) - 1 - self.ins_len
-        
-        # Update coverage
-        self.calculate_avg_cov()
-
-
-    # Return the length of the ungapped sequence
-    def ungapped_length(self):
-        return len(get_ungapped_seq(self.seq))
-
-
-    # Contains Ns?
-    def contain_Ns(self):
-        for i in range(len(self.seq)):
-            nt_dic = self.seq[i]
-            nt = get_major_nt(nt_dic)
-            if nt == 'N':
-                return True
-        return False
-
-    
-    # Get variant ids
-    def get_var_ids(self, left = 0, right = sys.maxint):
-        vars = []
-        left = max(left, self.left)
-        right = min(right, self.right)
-        ins_len = 0
-        for pos in range(left, right + 1):
-            var_i = pos - self.left + ins_len
-            while var_i < len(self.seq):
-                nt_dic = self.seq[var_i]
-                nt = get_major_nt(nt_dic)
-                if nt.startswith('I'):
-                    var_i += 1
-                    ins_len += 1
-                else:
-                    break            
-            for _, var in nt_dic.values():
-                if var == "" or \
-                   var == "unknown":
-                    continue
-                assert var in self.ref_vars
-                if len(vars) > 0 and var == vars[-1]:
-                    continue
-                type, pos, data = self.ref_vars[var]
-                if (type == "single" and data == nt) or \
-                   (type == "deletion" and nt == 'D') or \
-                   (type == "insertion" and len(nt) == 2 and nt[1] == data):
-                    vars.append(var)
-
-        return vars
-
-    
-    # Get variant ids
-    #   left and right are gene-level coordinates
-    def get_vars(self, left = 0, right = sys.maxint):
-        vars = []
-        left = max(left, self.left)
-        right = min(right, self.right)
-        skip_pos = -1
-        ins_len = 0
-        for pos in range(left, right + 1):
-            if pos <= skip_pos:
-                continue
-            var_i = pos - self.left + ins_len
-            while var_i < len(self.seq):
-                nt_dic = self.seq[var_i]
-                nt = get_major_nt(nt_dic)
-                if nt.startswith('I'):
-                    var_i += 1
-                    ins_len += 1
-                    var = nt_dic[nt][1]
-                    if len(vars) > 0 and var != vars[-1][0]:
-                        vars.append([var, pos])
-                else:
-                    break
-            if nt == self.ref_seq[pos]:
-                continue
-            if nt == 'N':
-                vars.append(["gap", pos])
-                continue            
-            added = False
-            for _, var in nt_dic.values():
-                if var == "" or \
-                   var == "unknown":
-                    continue
-                if len(vars) > 0 and var == vars[-1][0]:
-                    continue
-                assert var in self.ref_vars
-                type, var_pos, data = self.ref_vars[var]                    
-                if data == nt or (type == "deletion" and nt == 'D'):
-                    assert pos + ins_len >= var_pos
-                    if type == "deletion" and pos > var_pos:
-                        continue                    
-                    if type == "deletion":
-                        skip_pos = pos + int(data) - 1
-                    added = True
-                    vars.append([var, pos])
-            if not added and "unknown" in [var_id for _, var_id in nt_dic.values()]:
-                vars.append(["unknown", pos])
-
-        return vars
-
-
-    # Get average coverage
-    def get_avg_cov(self):
-        return self.avg
-
-    
-    # Calculate average coverage
-    def calculate_avg_cov(self):
-        self.avg = 0.0
-        for nt_dic in self.seq:
-            for count, _ in nt_dic.values():
-                self.avg += count
-        self.avg /= len(self.seq)
-        return self.avg
-
-        
-    # Display node information
-    def print_info(self, output=sys.stderr):
-        seq, var_str = "", ""
-        prev_var = ""
-        ins_len = 0
-        for i in range(len(self.seq)):
-            if (self.left + i - ins_len) % 100 == 0:
-                seq += ("|%d|" % (self.left + i - ins_len))
-            elif (self.left + i - ins_len) % 20 == 0:
-                seq += '|'
-            nt_dic = self.seq[i]
-            nt = get_major_nt(nt_dic)
-            if nt[0] == 'I':
-                seq += "\033[93m"
-            elif nt != self.ref_seq[self.left + i - ins_len]:
-                var_id = nt_dic[nt][1]
-                if var_id == "unknown" or var_id.startswith("nv"):
-                    seq += "\033[91m" # red
-                else:
-                    seq += "\033[94m" # blue
-            if nt[0] == 'I':
-                seq += nt[1]
-            else:
-                seq += nt
-            if nt[0] == 'I' or nt != self.ref_seq[self.left + i - ins_len]:
-                seq += "\033[00m"
-
-            var = []
-            for _, var_id in nt_dic.values():
-                if var_id == "":
-                    continue
-                var.append(var_id)
-            var = '-'.join(var)
-            if var != "" and var != prev_var:
-                var_str += "\t%d: %s %s" % (self.left + i - ins_len, var, str(nt_dic))
-            prev_var = var
-            if nt[0] == 'I':
-                ins_len += 1
-        
-        print >> output, "Node ID:", self.id
-        print >> output, "Pos: [%d, %d], Avg. coverage: %.1f" % (self.left, self.right, self.get_avg_cov())
-        print >> output, "\t", seq
-        print >> output, "\t", var_str
-        print >> output, "mates:", len(self.mate_ids) # sorted(self.mate_ids)
-        print >> output, "reads:", len(self.read_ids) # sorted(self.read_ids)
-        print >> output
-
-                
-class Graph:
-    def __init__(self,
-                 backbone,
-                 gene_vars,
-                 exons,
-                 primary_exons,
-                 partial_allele_ids,
-                 true_allele_nodes = {},
-                 predicted_allele_nodes = {},
-                 display_allele_nodes = {},
-                 simulation = False):
-        self.backbone = backbone # backbone sequence
-        self.gene_vars = gene_vars
-        self.exons = exons
-        self.primary_exons = primary_exons
-        self.partial_allele_ids = partial_allele_ids
-        self.true_allele_nodes = true_allele_nodes
-        self.predicted_allele_nodes = predicted_allele_nodes
-        self.allele_node_order = []
-        self.display_allele_nodes = display_allele_nodes
-        self.simulation = simulation
-
-        self.read_nodes = self.nodes = {}
-        self.other_nodes = {}
-        self.edges = {}
-        self.to_node, self.from_node = {}, {}
-
-        self.left_margin = 350
-        self.right_margin = 20
-        self.top_margin = 20
-        self.bottom_margin = 20
-
-        self.scalex, self.scaley = 5, 2
-        self.width = len(self.backbone) * self.scalex + self.left_margin + self.right_margin
-        self.unscaled_height = 6000
-        self.height = self.unscaled_height * self.scaley
-        self.coverage = {}
-
-
-    # Add node, which is an alignment w.r.t. the reference
-    def add_node(self, id, id_i, node, simulation = False):
-        if simulation:
-            id = id.split('_')[0]
-            
-        if id_i == 0:
-            if id in self.nodes:
-                print >> sys.stderr, "Warning) multi-mapped read:", id
-                # assert False
-                return
-            assert id not in self.nodes
-            self.nodes[id] = node
-        else:
-            if id not in self.other_nodes:
-                self.other_nodes[id] = []
-            self.other_nodes[id].append(node)
-
-        
-    # Remove nodes that are inside other nodes or with low coverage
-    def remove_nodes(self, nodes):
-        delete_ids = set()
-        node_list = [[id, node.left, node.right] for id, node in nodes.items()]
-        def node_cmp(a, b):
-            if a[2] != b[2]:
-                return a[2] - b[2]
-            else:
-                return a[1] - b[1]
-        node_list = sorted(node_list, cmp=node_cmp)
-        for n in range(len(node_list)):
-            id, left, right = node_list[n]
-            node = nodes[id]
-            i = n - 1
-            while i >= 0:
-                id2, left2, right2 = node_list[i]
-                if right2 < left:
-                    break
-                node2 = nodes[id2]
-                if left <= left2 and right2 <= right:
-                    at, overlap, mm = node.overlap_with(node2, self.gene_vars)
-
-                    # DK - debugging purposes
-                    """
-                    print node.id, "vs.", node2.id
-                    print "at %d: overlap of %d with %d mismatches (mult: %.2f)" % \
-                        (at, overlap, mm, mult)
-                    """
-                    if mm < 1.0:
-                        mult = overlap / float(max(right - left, right2 - left2))
-                        if node2.get_avg_cov() * mult * 10 < node.get_avg_cov():
-                            delete_ids.add(id2)
-                        elif left == left2 and right == right2:
-                            delete_ids.add(id)
-                    elif overlap > 0:
-                        if node2.get_avg_cov() * 10 < node.get_avg_cov():
-                            delete_ids.add(id2)
-                        elif node.get_avg_cov() * 10 < node2.get_avg_cov():
-                            delete_ids.add(id)
-                i -= 1
-
-        for delete_id in delete_ids:
-            del nodes[delete_id]
-
-            
-    #
-    # 
-    def guided_DeBruijn(self,
-                        print_msg = False):
-        assert len(self.nodes) > 0
-        k = 60 # k-mer
-
-        DRB1_debug = False
-
-        node_seq = {}
-        def add_node_seq(node_seq, id):
-            nodes = [self.nodes[id]]
-            if id in self.other_nodes:
-                nodes += self.other_nodes[id]
-            for node_i in range(len(nodes)):
-                node = nodes[node_i]
-                s, seq = 0, []
-                while s < len(node.seq):
-                    nt_dic = node.seq[s] # {'C': [1, '']}
-                    nt = get_major_nt(nt_dic)
-                    if nt in "ACGTND":
-                        seq.append(nt)
-                    else:
-                        assert len(nt) == 2 and nt[0] == 'I' and nt[1] in "ACGT"
-                    s += 1
-
-                if len(seq) < k:
-                    continue
-
-                def leftshift(seq, ref_seq):
-                    seq_len = len(seq)
-                    assert seq_len > 0 and seq[0] != 'D'
-
-                    bp_i = 0
-                    while bp_i < seq_len:
-                        bp = seq[bp_i]
-                        if bp != 'D':
-                            bp_i += 1
-                            continue
-                        bp_j = bp_i + 1
-                        while bp_j < seq_len:
-                            bp2 = seq[bp_j]
-                            if bp2 != 'D':
-                                break
-                            else:
-                                bp_j += 1
-
-                        if bp_j >= seq_len:
-                            bp_i = bp_j
-                            break
-
-                        prev_i, prev_j = bp_i, bp_j
-                        while bp_i > 0 and seq[bp_i-1] in "ACGT" and ref_seq[bp_j-1] in "ACGT":
-                            if seq[bp_i-1] != ref_seq[bp_j-1]:
-                                break
-                            seq[bp_j-1] = seq[bp_i-1]
-                            seq[bp_i-1] = 'D'
-                            bp_i -= 1
-                            bp_j -= 1
-                        bp_i = bp_j
-                        while bp_i < seq_len:
-                            if seq[bp_i] in "ACGT":
-                                break
-                            bp_i += 1
-
-                if DRB1_debug:
-                    leftshift(seq, self.backbone[node.left:node.left + len(seq)])
-                node_seq["%s.%d" % (id, node_i)] = seq
-            
-        for id in self.nodes.keys():
-            add_node_seq(node_seq, id)
-            
-        # AAA.1 => AAA, 1
-        def get_id_and_sub(id):
-            id_split = id.split('.')
-            return '.'.join(id_split[:-1]), int(id_split[-1])
-
-        try_hard = False
-        while True:
-            delete_ids = set()
-            nodes = []
-            for id, node in self.nodes.items():
-                nodes_ = [node]
-                if id in self.other_nodes:
-                    nodes_ += self.other_nodes[id]
-                for node_i in range(len(nodes_)):
-                    node = nodes_[node_i]
-                    id_ = "%s.%d" % (id, node_i)
-                    if id_ not in node_seq:
-                        continue
-                    seq = node_seq[id_]
-
-                    if len(seq) < k or \
-                       'N' in seq:
-                        continue
-                    kmer, seq = seq[:k], seq[k:]
-                    nodes.append([id_, node.left, node.right, kmer, seq])
-                
-            def node_cmp(a, b):
-                if a[1] != b[1]:
-                    return a[1] - b[1]
-                else:
-                    return a[2] - b[2]
-            nodes = sorted(nodes, cmp=node_cmp)
-
-            # Generate numerical read IDs
-            id_to_num = {}
-            num_to_id = []
-            for id in [node[0] for node in nodes]:
-                id_to_num[id] = len(id_to_num)
-                num_to_id.append(id)
-
-            # Construct De Bruijn graph with 60-mer
-            self.debruijn = debruijn = [[] for i in range(len(self.backbone) - k + 1)]
-            min_n = 0
-            for pos in range(len(debruijn)):
-                for n in range(min_n, len(nodes)):
-                    id, node_pos, node_right, kmer, seq = nodes[n]
-                    if node_pos < pos:
-                        min_n = n + 1
-                        continue
-                    elif node_pos > pos:
-                        break
-
-                    assert len(kmer) == k
-
-                    # Add a new node or update the De Bruijn graph
-                    curr_vertices = debruijn[pos]
-                    found = False
-                    kmer_seq = ''.join(kmer)
-                    for v in range(len(curr_vertices)):
-                        cmp_nt, cmp_k_m1_mer = curr_vertices[v][:2]
-                        if kmer_seq == cmp_k_m1_mer + cmp_nt:                        
-                            curr_vertices[v][3].append(n)
-                            found = True
-                            break
-
-                    if not found:
-                        predecessors = []
-                        if pos > 0:
-                            prev_vertices = debruijn[pos - 1]
-                            for v in range(len(prev_vertices)):
-                                cmp_nt, cmp_k_m1_mer = prev_vertices[v][:2]
-                                if kmer_seq[:-1] == cmp_k_m1_mer[1:] + cmp_nt:
-                                    predecessors.append(v)
-                        debruijn[pos].append([kmer_seq[-1],           # base
-                                              ''.join(kmer_seq[:-1]), # (k-1)-mer
-                                              predecessors,           # predecessors
-                                              [n]])                   # numeric read IDs
-
-                    # Update k-mer
-                    if len(seq) > 0:
-                        kmer, seq = kmer[1:] + seq[:1], seq[1:]
-                        nodes[n] = [id, node_pos + 1, node_right, kmer, seq]
-
-            # Average number of kmers
-            total_kmers = 0
-            for pos in range(len(debruijn)):
-                vertices = debruijn[pos]
-                for _, _, _, num_ids in vertices:
-                    total_kmers += len(num_ids)
-            avg_kmers = float(total_kmers) / len(debruijn)
-
-            # Filter out reads
-            for pos in range(len(debruijn)):
-                vertices = debruijn[pos]
-                num_vertices = 0
-                num_kmers = 0
-                for v in range(len(vertices)):
-                    _, _, predecessors, num_ids = vertices[v]
-                    if not (set(num_ids) <= delete_ids):
-                        num_vertices += 1
-                        if DRB1_debug:
-                            num_kmers = len(set(num_ids) - delete_ids)
-                if num_vertices <= 1:
-                    if DRB1_debug:
-                        if pos > 300 and pos + 300 < len(debruijn):
-                            if num_vertices == 1 and num_kmers * 8 < avg_kmers:
-                                for _, _, _, num_ids in vertices:
-                                    delete_ids |= set(num_ids)
-                    continue
-                
-                vertice_count = [0] * len(vertices)
-                for v in range(len(vertices)):
-                    _, _, predecessors, num_ids = vertices[v]
-                    for num_id in num_ids:
-                        if num_id in delete_ids:
-                            continue
-                        read_id = get_id_and_sub(num_to_id[num_id])[0]
-                        if read_id in self.other_nodes:
-                            continue
-                        mate_read_id = get_mate_node_id(read_id)
-                        if mate_read_id in self.nodes:
-                            vertice_count[v] += 1
-
-                # First look at and remove reads that are multi-aligned locally
-                first_pair = None
-                for v in range(len(vertices)):
-                    read_ids = set([get_id_and_sub(num_to_id[num_id])[0] for num_id in vertices[v][3]])
-                    for v2 in range(v + 1, len(vertices)):
-                        read_ids2 = set([get_id_and_sub(num_to_id[num_id])[0] for num_id in vertices[v2][3]])
-                        if read_ids & read_ids2:
-                            first_pair = [v, v2, read_ids & read_ids2]
-                            break
-
-                debug_msg = False
-                if debug_msg:
-                    print >> sys.stderr, "at", pos, vertices
-                    print >> sys.stderr, "count:", vertice_count
-
-                if try_hard:
-                    vertice_with_id = [[vertice_count[v], v] for v in range(len(vertice_count))]
-                    vertice_with_id = sorted(vertice_with_id, key=lambda a: a[0])
-                    for v in range(len(vertice_count) - 2):
-                        v = vertice_with_id[v][1]
-                        num_ids = vertices[v][3]
-                        delete_ids |= set(num_ids)
-                        if debug_msg:
-                            print >> sys.stderr, v, "is removed with", num_ids
-                else:
-                    if first_pair:
-                        v, v2, multi_read_ids = first_pair
-                        v_ = v if vertice_count[v] < vertice_count[v2] else v2
-                        for num_id in vertices[v_][3]:
-                            id = get_id_and_sub(num_to_id[num_id])[0]
-                            if id in multi_read_ids:
-                                delete_ids.add(num_id)
-                    else:
-                        assert len(vertices) >= 2
-                        relative_avg = (sum(vertice_count) - vertice_count[v]) / float(len(vertice_count) - 1)
-                        if len(vertices) == 2:
-                            for v in range(len(vertices)):
-                                # Eliminate reads that have conflicts with other reads due to a deletion
-                                if vertice_count[v] * 2 < relative_avg:
-                                    nt, kmer, _, num_ids = vertices[1-v]
-                                    if nt == 'D':
-                                        num_id = num_ids[0]
-                                        id_sub = num_to_id[num_id]
-                                        id, sub = get_id_and_sub(id_sub)
-                                        if sub == 0:
-                                            left = pos - self.nodes[id].left
-                                        else:
-                                            left = pos - self.other_nodes[id][sub - 1].left
-                                        seq = node_seq[id_sub]
-                                        seq_right = ''.join(seq[left+k:])
-                                        seq_right = seq_right.replace('D', '')
-                                        success = True
-                                        for num_id2 in vertices[v][3]:
-                                            id_sub2 = num_to_id[num_id2]
-                                            id2, sub2 = get_id_and_sub(id_sub2)
-                                            if sub2 == 0:
-                                                left2 = pos - self.nodes[id2].left
-                                            else:
-                                                left2 = pos - self.other_nodes[id2][sub2 - 1].left
-                                            seq2 = node_seq[id_sub2]
-                                            seq2_right = ''.join(seq2[left2+k:])
-                                            if seq_right.find(seq2_right) != 0:
-                                                success = False
-                                                break
-                                        if success:
-                                            delete_ids |= set(vertices[v][3])
-
-                                # DK - working on ...
-                                if DRB1_debug:
-                                    if vertice_count[v] * 8 < relative_avg:
-                                        num_ids = vertices[v][3]
-                                        delete_ids |= set(num_ids)
-                                        if debug_msg:
-                                            print >> sys.stderr, v, "is removed with", num_ids
-                                    elif vertice_count[v] * 8 < avg_kmers:
-                                        num_ids = vertices[v][3]
-                                        delete_ids |= set(num_ids)
-                        else:
-                            second2last = sorted(vertice_count)[1]
-                            for v in range(len(vertices)):
-                                # if vertice_count[v] * 3 < relative_avg:
-                                if vertice_count[v] < second2last:
-                                    num_ids = vertices[v][3]
-                                    delete_ids |= set(num_ids)
-                                    if debug_msg:
-                                        print >> sys.stderr, v, "is removed with", num_ids
-
-                if debug_msg:
-                    print >> sys.stderr
-                    print >> sys.stderr           
-                
-            # delete nodes
-            ids_to_be_updated = set()
-            for num_id in delete_ids:
-                id_sub = num_to_id[num_id]
-                id, sub = get_id_and_sub(id_sub)
-                ids_to_be_updated.add(id)
-                if sub == 0:
-                    self.nodes[id] = None
-                else:
-                    self.other_nodes[id][sub-1] = None
-            
-            for id in self.nodes.keys():
-                other_nodes = []
-                if id in self.other_nodes:
-                    for other_node in self.other_nodes[id]:
-                        if other_node != None:
-                            other_nodes.append(other_node)
-                if self.nodes[id] == None:
-                    if len(other_nodes) == 0:
-                        del self.nodes[id]
-                    else:
-                        self.nodes[id] = other_nodes[0]
-                        del other_nodes[0]
-                if id in self.other_nodes:
-                    if len(other_nodes) == 0:
-                        del self.other_nodes[id]
-                    else:
-                        self.other_nodes[id] = other_nodes
-
-            for id in ids_to_be_updated:
-                if id in self.nodes:
-                    add_node_seq(node_seq, id)
-
-            if len(delete_ids) == 0:
-                if try_hard:
-                    break
-                else:
-                    try_hard = True
-
-        # Print De Bruijn graph
-        for i in range(len(debruijn)):
-            curr_vertices = debruijn[i]
-            if len(curr_vertices) == 0:
-                continue
-            consensus_seq = [{} for j in range(k)]
-            for v in range(len(curr_vertices)):
-                nt, k_m1_mer = curr_vertices[v][:2]
-                kmer = k_m1_mer + nt
-                assert len(kmer) == k
-                for j in range(k):
-                    nt = kmer[j]
-                    if nt not in consensus_seq[j]:
-                        consensus_seq[j][nt] = 1
-                    else:
-                        consensus_seq[j][nt] += 1
-
-            if print_msg: print >> sys.stderr, i
-            for v in range(len(curr_vertices)):
-                nt, k_m1_mer, predecessors, num_ids = curr_vertices[v]
-                kmer = k_m1_mer + nt
-                kmer_seq = ""
-                for j in range(k):
-                    nt = kmer[j]
-                    if len(consensus_seq[j]) >= 2:
-                        kmer_seq += "\033[94m"
-                    kmer_seq += nt
-                    if len(consensus_seq[j]) >= 2:
-                        kmer_seq += "\033[00m"
-                    
-                if print_msg: print >> sys.stderr, "\t%d:" % v, kmer_seq, len(num_ids), predecessors, num_ids
-
-        id_to_num = {}
-        for num in range(len(num_to_id)):
-            id_sub = num_to_id[num]
-            id = get_id_and_sub(id_sub)[0]
-            num_to_id[num] = id
-            if id not in id_to_num:
-                id_to_num[id] = set()
-            id_to_num[id].add(num)          
-                    
-        # Generate compressed nodes
-        paths = []
-        path_queue, done = deque(), set()
-        for i in range(len(debruijn)):
-            if len(debruijn[i]) == 0:
-                continue
-            for i2 in range(len(debruijn[i])):
-                path_queue.append("%d-%d" % (i, i2))
-            break
-
-        while len(path_queue) > 0:
-            i_str = path_queue.popleft()
-            if i_str in done:
-                continue
-
-            i, i2 = i_str.split('-')
-            i, i2 = int(i), int(i2)
-            num_ids = debruijn[i][i2][3]
-            j = i + 1
-            while j < len(debruijn):
-                merge, branch = len(debruijn[j-1]) > len(debruijn[j]), len(debruijn[j-1]) < len(debruijn[j])
-                new_i2 = -1
-                tmp_num_ids = []
-                found = False
-                for j2 in range(len(debruijn[j])):
-                    _, _, predecessors, add_read_ids = debruijn[j][j2]
-                    if len(predecessors) == 0:
-                        branch = True
-                        path_queue.append("%d-%d" % (j, j2))
-                    elif i2 in predecessors:
-                        found = True
-                        # merge into one node
-                        if len(predecessors) > 1:
-                            merge = True
-                        if new_i2 >= 0:
-                            branch = True
-                        new_i2 = j2
-                        tmp_num_ids += add_read_ids
-
-                if merge or branch:
-                    for j2 in range(len(debruijn[j])):
-                        _, _, predecessors, add_num_ids = debruijn[j][j2]
-                        if i2 in predecessors:
-                            path_queue.append("%d-%d" % (j, j2))
-                    break
-                if not found:
-                    break
-                
-                num_ids += tmp_num_ids
-                i2 = new_i2
-                j += 1
-
-            done.add(i_str)
-
-            num_ids = set(num_ids)
-            paths.append([i, j, num_ids])
-
-            if j < len(debruijn) and len(debruijn[j]) == 0:
-                j += 1
-                while j < len(debruijn) and len(debruijn[j]) == 0:
-                    j += 1
-                if j < len(debruijn):
-                    for j2 in range(len(debruijn[j])):
-                        path_queue.append("%d-%d" % (j, j2))
-                        
-
-        def get_mate_num_ids(num_ids):
-            mate_num_ids = set()
-            for num_id in num_ids:
-                read_id = num_to_id[num_id]
-                mate_read_id = get_mate_node_id(read_id)
-                if mate_read_id in id_to_num:
-                    mate_num_id = id_to_num[mate_read_id]
-                    mate_num_ids |= mate_num_id
-                    
-            return mate_num_ids
-        
-
-        # Generate a compressed assembly graph
-        def path_cmp(a, b):
-            if a[0] != b[0]:
-                return a[0] - b[0]
-            else:
-                return a[1] - b[1]
-        paths = sorted(paths, cmp=path_cmp)
-
-        for p in range(len(paths)):
-            if print_msg: print >> sys.stderr, "path:", p, paths[p]
-
-        excl_num_ids = set() # exclusive num ids
-        equiv_list = []
-        p = 0
-        while p < len(paths):
-            left, right, num_ids = paths[p]
-            p2 = p + 1
-            while p2 < len(paths):
-                next_left, next_right, next_num_ids = paths[p2]
-                if next_left >= right:
-                    break
-                p2 += 1
-
-            equiv_list.append([])
-            for i in range(p, p2):
-                left, right, num_ids = paths[i]
-                equiv_list[-1].append([[i], num_ids, num_ids | get_mate_num_ids(num_ids), []])
-                if p + 1 < p2:
-                    assert p + 2 == p2
-                    excl_num_ids |= num_ids
-
-            p = p2
-
-        new_equiv_list = []
-        for classes in equiv_list:
-            if len(classes) > 1:
-                new_equiv_list.append(classes)
-                continue
-            assert len(classes) == 1
-            num_ids = classes[0][1] - excl_num_ids
-            if len(num_ids) <= 0:
-                continue
-            classes[0][1] = num_ids
-            classes[0][2] = num_ids | get_mate_num_ids(num_ids)
-            new_equiv_list.append(classes)
-        equiv_list = new_equiv_list
-
-        known_alleles = False
-        while True:
-            for i in range(len(equiv_list)):
-                classes = equiv_list[i]
-                for j in range(len(classes)):
-                    ids, num_ids, all_ids, alleles = classes[j]
-                    if print_msg: print >> sys.stderr, i, j, ids, len(num_ids), sorted(list(num_ids))[:20], alleles
-
-                if print_msg: print >> sys.stderr
-
-            if known_alleles:
-                for i in range(len(equiv_list)):
-                    classes = equiv_list[i]
-                    for j in range(len(classes)):
-                        num_ids = sorted(list(classes[j][1]))
-                        node_id = "(%d-%d)%s" % (i, j, num_to_id[num_ids[0]])
-                        node = self.nodes2[node_id]
-                        node_vars = node.get_var_ids()
-                        max_alleles, max_common = set(), -sys.maxint
-                        for anode in self.predicted_allele_nodes.values():
-                            allele_vars = anode.get_var_ids(node.left, node.right)
-                            tmp_common = len(set(node_vars) & set(allele_vars)) - len(set(node_vars) | set(allele_vars))
-                            if tmp_common > max_common:
-                                max_common = tmp_common
-                                max_alleles = set([anode.id])
-                            elif tmp_common == max_common:
-                                max_alleles.add(anode.id)
-                        classes[j][3] = max_alleles
-
-            
-            best_common_mat, best_stat, best_i, best_i2 = [], -sys.maxint, -1, -1
-            for i in range(len(equiv_list) - 1):
-                classes = equiv_list[i]
-                for i2 in range(i + 1, len(equiv_list)):
-                    classes2 = equiv_list[i2]
-                    common_mat = []
-                    for j in range(len(classes)):
-                        common_mat.append([])
-                        if known_alleles:
-                            ids = classes[j][3]
-                        else:
-                            ids = classes[j][2]
-                        for j2 in range(len(classes2)):
-                            if known_alleles:
-                                ids2 = classes2[j2][3]
-                            else:
-                                ids2 = classes2[j2][2]
-                            common_mat[-1].append(len(ids & ids2))
-
-                    # Calculate stat
-                    common_stat = 0
-                    if len(classes) == 1 or len(classes2) == 1:
-                        for row in common_mat:
-                            common_stat += sum(row)
-                    else:
-                        for row in common_mat:
-                            sorted_row = sorted(row, reverse=True)
-                            common_stat += (sorted_row[0] - sorted_row[1])
-                        if common_mat[0][0] + common_mat[1][1] == \
-                           common_mat[1][0] + common_mat[0][1]:
-                            common_stat = -1
-
-                    if common_stat > best_stat:
-                        best_common_mat, best_stat, best_i, best_i2 = common_mat, common_stat, i, i2
-
-            if print_msg:
-                print >> sys.stderr, "best:", best_i, best_i2, best_stat, best_common_mat
-                print >> sys.stderr
-                print >> sys.stderr
-
-            if known_alleles and best_stat < 0:
-                self.remove_nodes(self.nodes2)
-                break
-            if best_stat < 0:
-                known_alleles = True
-                new_nodes = {}
-                for i in range(len(equiv_list)):
-                    classes = equiv_list[i]
-                    for j in range(len(classes)):
-                        ids, num_ids, all_ids, alleles = classes[j]
-                        num_ids = sorted(list(num_ids))
-
-                        if print_msg: print >> sys.stderr, i, j, num_ids
-
-                        assert (num_ids) > 0
-                        read_id = num_to_id[num_ids[0]]
-                        node = deepcopy(self.nodes[read_id])
-                        for num_id2 in num_ids[1:]:
-                            read_id2 = num_to_id[num_id2]
-                            node2 = self.nodes[read_id2]
-                            node.combine_with(node2)
-
-                        new_read_id = "(%d-%d)%s" % (i, j, read_id)
-                        node.id = new_read_id
-                        new_read_id not in new_nodes
-                        new_nodes[new_read_id] = node
-                        
-                self.nodes = new_nodes                
-                self.nodes2 = deepcopy(self.nodes)
-                self.remove_nodes(self.nodes)
-                continue
-
-            mat = best_common_mat
-            classes, classes2 = equiv_list[best_i], equiv_list[best_i2]
-
-            # Filter vertices further if necessary
-            def del_row(classes, mat, r):
-                return classes[:r] + classes[r+1:], mat[:r] + mat[r+1:]
-            
-            def del_col(classes, mat, c):                    
-                new_mat = []
-                for row in mat:
-                    row = row[:c] + row[c+1:]
-                    new_mat.append(row)
-                return classes[:c] + classes[c+1:], new_mat
-                
-            assert len(classes) <= 2 and len(classes2) <= 2
-            if len(classes) == 2 and len(classes2) == 2:
-                # Check row
-                num_ids1, num_ids2 = len(classes[0][1]), len(classes[1][1])
-                if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1:
-                    row_sum1, row_sum2 = sum(mat[0]), sum(mat[1])
-                    if row_sum1 > max(2, row_sum2 * 6):
-                        classes, mat = del_row(classes, mat, 1)
-                        classes[0][1] -= excl_num_ids
-                    elif row_sum2 > max(2, row_sum1 * 6):
-                        classes, mat = del_row(classes, mat, 0)
-                        classes[0][1] -= excl_num_ids
-                # Check column
-                if len(classes) == 2:
-                    num_ids1, num_ids2 = len(classes2[0][1]), len(classes2[1][1])
-                    if num_ids1 * 6 < num_ids2 or num_ids2 * 6 < num_ids1:
-                        col_sum1, col_sum2 = mat[0][0] + mat[1][0], mat[0][1] + mat[1][1]
-                        if col_sum1 > max(2, col_sum2 * 6):
-                            classes2, mat = del_col(classes2, mat, 1)
-                            classes2[0][1] -= excl_num_ids
-                        elif col_sum2 > max(2, col_sum1 * 6):
-                            classes2, mat = del_col(classes2, mat, 0)
-                            classes2[0][1] -= excl_num_ids
-
-            merge_list = []
-            def add_merge(classes, classes2, i, j, k):
-                if known_alleles:
-                    num_ids1, num_ids2 = classes[i][1], classes2[j][1]
-                    num_ids1, num_ids2 = sorted(list(num_ids1)), sorted(list(num_ids2))
-                    num_id1, num_id2 = num_ids1[0], num_ids2[0]
-                    node_id1 = "(%d-%d)%s" % (best_i, i, num_to_id[num_id1])
-                    node_id2 = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id2])
-                    node_id3 = "(%d-%d)%s" % (best_i, k, num_to_id[min(num_id1, num_id2)])
-                    merge_list.append([node_id1, node_id2, node_id3])
-
-                classes[i][0] = sorted(classes[i][0] + classes2[j][0])
-                classes[i][1] |= classes2[j][1]
-
-            copy_list = []
-            def add_copy(classes, classes2, i, j, k):
-                if known_alleles:
-                    num_ids = classes2[j][1]
-                    num_ids = sorted(list(num_ids))
-                    num_id = num_ids[0]
-                    node_id = "(%d-%d)%s" % (best_i2, j, num_to_id[num_id])
-                    node_id2 = "(%d-%d)%s" % (best_i, k, num_to_id[num_id])
-                    copy_list.append([node_id, node_id2])
-
-                classes[i] = classes2[j]
-
-            remove_list = []
-            def add_remove(classes, i):
-                if known_alleles:
-                    num_ids = classes[i][1]
-                    num_ids = sorted(list(num_ids))
-                    num_id = num_ids[0]
-                    node_id = "(%d-%d)%s" % (best_i, i, num_to_id[num_id])
-                    remove_list.append([node_id])
-
-                classes = [classes[1-i]]
-                         
-            if len(classes) == 1 and len(classes2) == 1:
-                add_merge(classes, classes2, 0, 0, 0)
-                
-            elif len(classes) == 1:
-                if 0 not in classes[0][0] and \
-                   mat[0][0] > max(2, mat[0][1] * 6) and \
-                   len(classes2[0][1]) > len(classes2[1][1]) * 2:
-                    add_merge(classes, classes2, 0, 0, 0)
-                elif 0 not in classes[0][0] and \
-                     mat[0][1] > max(2, mat[0][0] * 6) and \
-                     len(classes2[1][1]) > len(classes2[0][1]) * 2:
-                    add_merge(classes, classes2, 0, 1, 0)
-                else:
-                    classes.append(deepcopy(classes[0]))
-
-                    # Handle a special case at 5' end
-                    if 0 in classes[0][0] and \
-                       len(classes[0][0]) == 1 and \
-                       (mat[0][0] > mat[0][1] * 2 or mat[0][1] > mat[0][0] * 2):
-                        if mat[0][0] > mat[0][1]:
-                            add_merge(classes, classes2, 0, 0, 0)
-                            add_copy(classes, classes2, 1, 1, 1)
-                        else:
-                            assert mat[0][1] > mat[0][0]
-                            add_copy(classes, classes2, 0, 0, 0)
-                            add_merge(classes, classes2, 1, 1, 1)
-                    else:
-                        add_merge(classes, classes2, 0, 0, 0)
-                        add_merge(classes, classes2, 1, 1, 1)
-                        
-            elif len(classes2) == 1:
-                if mat[0][0] > max(2, mat[1][0] * 6):
-                    add_merge(classes, classes2, 0, 0, 0)
-                    if len(classes[0][1]) > len(classes[1][1]) * 6:
-                        add_remove(classes, 1)
-                elif mat[1][0] > max(2, mat[0][0] * 6):
-                    add_merge(classes, classes2, 1, 0, 0)
-                    if len(classes[1][1]) > len(classes[0][1]) * 6:
-                        add_remove(classes, 0)
-                else:
-                    add_merge(classes, classes2, 0, 0, 0)
-                    add_merge(classes, classes2, 1, 0, 1)
-                    
-            else:                
-                score00 = mat[0][0] + mat[1][1]
-                score01 = mat[0][1] + mat[1][0]
-                if score00 > score01:
-                    add_merge(classes, classes2, 0, 0, 0)
-                    add_merge(classes, classes2, 1, 1, 1)
-                elif score00 < score01:
-                    add_merge(classes, classes2, 0, 1, 0)
-                    add_merge(classes, classes2, 1, 0, 1)
-                else:
-                    break
-
-            for c in range(len(classes)):
-                classes[c][2] = classes[c][1] | get_mate_num_ids(classes[c][1])
-
-            equiv_list[best_i] = classes            
-            equiv_list = equiv_list[:best_i2] + equiv_list[best_i2+1:]
-            
-            if known_alleles:
-                exclude_ids = set()
-                new_nodes = {}
-                for node_id1, node_id2, node_id3 in merge_list:
-                    if self.nodes2[node_id1].left <= self.nodes2[node_id2].left:
-                        node = deepcopy(self.nodes2[node_id1])
-                        node2 = self.nodes2[node_id2]
-                    else:                        
-                        node = deepcopy(self.nodes2[node_id2])
-                        node2 = self.nodes2[node_id1]
-                    node.combine_with(node2)
-                    node.id = node_id3
-                    new_nodes[node_id3] = node
-                    exclude_ids.add(node_id1)
-                    exclude_ids.add(node_id2)
-
-                for node_id1, node_id2 in copy_list:
-                    node = self.nodes2[node_id1]
-                    node.id = node_id2
-                    new_nodes[node_id2] = node
-                    exclude_ids.add(node_id1)
-
-                exclude_ids |= set(remove_list)
-
-                for node_id, node in self.nodes2.items():
-                    if node_id in exclude_ids:
-                        continue
-                    num, id = node_id.split(')')
-                    i, i2 = num[1:].split('-')
-                    i, i2 = int(i), int(i2)
-                    if i > best_i2:
-                        i -= 1
-                    node_id = "(%d-%d)%s" % (i, i2, id)
-                    node.id = node_id
-                    new_nodes[node_id] = node
-                        
-                self.nodes2 = new_nodes
-            
-        
-    # Display graph information
-    def print_info(self): 
-        print >> sys.stderr, "Backbone len: %d" % len(self.backbone)
-        print >> sys.stderr, "\t%s" % self.backbone   
-
-
-    # Compare nodes and get information
-    def get_node_comparison_info(self, node_dic):
-        assert len(node_dic) > 0
-        nodes = [[id, node.left, node.right] for id, node in node_dic.items()]
-        def node_cmp(a, b):
-            if a[1] != b[1]:
-                return a[1] - b[1]
-            else:
-                return a[2] - b[2]
-        nodes = sorted(nodes, cmp=node_cmp)
-        seqs, colors = [], []
-        for p in range(len(self.backbone)):
-            nts = set()
-            for n in range(len(nodes)):
-                id, left, right = nodes[n]
-                node = node_dic[id]
-                if p >= left and p <= right:
-                    nt_dic = node.seq[p - left]
-                    nt = get_major_nt(nt_dic)
-                    nts.add(nt)
-
-            for n in range(len(nodes)):
-                if p == 0:
-                    seqs.append([])
-                    colors.append([])
-                id, left, right = nodes[n]
-                node = node_dic[id]
-                if p >= left and p <= right:
-                    nt_dic = node.seq[p - left]
-                    nt = get_major_nt(nt_dic)
-                    seqs[n].append(nt)
-                    if nt != self.backbone[p]:
-                        if len(nts) > 1:
-                            colors[n].append('R')
-                        else:
-                            colors[n].append('B')
-                    else:
-                        colors[n].append('N')
-                else:
-                    seqs[n].append(' ')
-
-        assert len(nodes) == len(seqs)
-        for n in range(len(nodes)):
-            node, seq, color = nodes[n], seqs[n], colors[n]
-            new_left, new_right = 0, len(seq) - 1
-            while seq[new_left] == 'D':
-                new_left += 1
-            while seq[new_right] == 'D':
-                new_right -= 1
-
-            node[1] = new_left
-            node[2] = new_right
-            seqs[n] = seq[new_left:new_right+1]
-            colors[n] = color[new_left:new_right+1]
-
-        return nodes, seqs, colors
-
-
-    # Compare nodes
-    def print_node_comparison(self, node_dic):
-        nodes, seqs, colors = self.get_node_comparison_info(node_dic)
-        interval = 100
-        for p in range(0, (len(self.backbone) + interval - 1) / interval * interval, interval):
-            cur_seqs = []
-            for n in range(len(nodes)):
-                id, left, right = nodes[n] # inclusive coordinate
-                right += 1
-                seq = []
-                seq_left, seq_right = max(p, left), min(p+interval, right)
-                if seq_left >= seq_right:
-                    continue
-                if p < left:
-                    seq += ([' '] * (left - p))
-                for s in range(seq_left, seq_right):
-                    nt, color = seqs[n][s-left], colors[n][s-left]
-                    if color in "RB":
-                        if color == 'R':
-                            nt = "\033[91m" + nt
-                        else:
-                            nt = "\033[94m" + nt
-                        nt += "\033[00m"        
-                    seq.append(nt)
-                if right < p + interval:
-                    seq += ([' '] * (p + interval - right))
-                seq = ''.join(seq)
-                cur_seqs.append([seq, id])
-
-            if len(cur_seqs) <= 0:
-                continue
-                
-            print >> sys.stderr, p
-            for seq, id in cur_seqs:
-                print >> sys.stderr, "\t", seq, id
-
-                
-    # Calculate coverage
-    def calculate_coverage(self):
-        allele_nodes = self.true_allele_nodes if self.simulation else self.predicted_allele_nodes
-        allele_nodes = [[id, node.left, node.right] for id, node in allele_nodes.items()]
-        coverage = {}
-        for allele_id, _, _ in allele_nodes:
-            coverage[allele_id] = [0.0 for _ in range(len(self.backbone))]
-
-        nodes = [[id, node.left, node.right] for id, node in self.nodes.items()]
-        for id, left, right in nodes:
-            node = self.nodes[id]
-            nodes2 = [[node, left, right]]
-            if id in self.other_nodes:
-                for node in self.other_nodes[id]:
-                    nodes2.append([node, node.left, node.right])
-
-            for node, left, right in nodes2:
-                node_vars = node.get_vars()
-                node_var_ids = node.get_var_ids()
-                max_common = -sys.maxint
-                max_allele_node_ids = []
-                for allele_node_id, allele_left, allele_right in allele_nodes:
-                    if right - left <= 500 and (left < allele_left or right > allele_right):
-                        continue
-                    if self.simulation:
-                        allele_node = self.true_allele_nodes[allele_node_id]
-                    else:
-                        allele_node = self.predicted_allele_nodes[allele_node_id]
-                    allele_vars = allele_node.get_var_ids(left, right)
-                    common_vars = set(node_var_ids) & set(allele_vars)
-                    tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars))
-                    if max_common < tmp_common:
-                        max_common = tmp_common
-                        max_allele_node_ids = [allele_node_id]
-                    elif max_common == tmp_common:
-                        max_allele_node_ids.append(allele_node_id)
-                if len(max_allele_node_ids) <= 0:
-                    continue
-                add_cov = 1.0 / len(nodes2) / len(max_allele_node_ids)
-                assert add_cov > 0.0
-                for allele_node_id in max_allele_node_ids:
-                    for p in range(left, right + 1):
-                        coverage[allele_node_id][p] += add_cov
-
-        max_cov = 0.0
-        for allele_id, cov in coverage.items():
-            max_cov = max(max_cov, max(cov))
-        for allele_id, cov in coverage.items():
-            cov2 = [c / max_cov for c in cov]
-            coverage[allele_id] = cov2
-        self.coverage = coverage
-                                
-        
-    # Begin drawing graph
-    def begin_draw(self, fname_base):
-        pdfDraw = self.pdfDraw = open(fname_base + '.pdf', 'w')
-        print >> pdfDraw, r'%PDF-1.7'
-        self.objects, self.stream = [], []
-        self.draw_items = []
-        
-    # End drawing graph
-    def end_draw(self):
-        self.unscaled_height += 50
-        self.height = self.unscaled_height * self.scaley
-        
-        def get_x(x):
-            return self.left_margin + x * self.scalex
-
-        def get_y(y):
-            return self.height - self.top_margin - y * self.scaley
-
-        # Get scalar
-        def get_sx(x):
-            return x * self.scalex
-
-        def get_sy(y):
-            return y * self.scaley
-        
-        pdfDraw = self.pdfDraw
-        self.add_pdf_object('<</Type /Catalog /Pages 2 0 R>>')
-        self.add_pdf_object('<</Type /Pages /Kids [3 0 R] /Count 1>>')
-        self.add_pdf_object('<</Type /Page /Parent 2 0 R /Resources 4 0 R /MediaBox [0 0 %d %d] /Contents 6 0 R>>' % \
-                   (self.width, self.height))
-        self.add_pdf_object('<</Font <</F1 5 0 R>>>>')
-        self.add_pdf_object('<</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>')
-
-        # Draw vertical dotted lines at every 100nt and thick lines at every 500nt
-        pre_items = []
-        for pos in range(0, len(self.backbone), 100):
-            main_line = (pos != 0 and pos % 500 == 0)
-            dic = {"coord": [pos, 2, pos, self.unscaled_height - 2],
-                   "stroke" : "0.5 0.5 0.5",
-                   "line_width" : 1 if main_line else 0.2}
-            if not main_line:
-                dic["line_dash"] = "[3] 0"
-            pre_items.append(["line", dic])
-        self.draw_items = pre_items + self.draw_items
-
-        fill, stroke, line_width, line_dash = "0 0 0", "0 0 0", 2.0, ""
-        for type, dic in self.draw_items:
-            commands = []
-            if type != "state":
-                assert "coord" in dic
-
-            if "fill" in dic and dic["fill"] != fill:
-                fill = dic["fill"]
-                commands.append("%s rg" % fill)
-            if "stroke" in dic and dic["stroke"] != stroke:
-                stroke = dic["stroke"]
-                commands.append("%s RG" % stroke)
-            if "line_width" in dic and dic["line_width"] != line_width:
-                line_width = dic["line_width"]
-                commands.append("%.1f w" % line_width)
-            if "line_dash" in dic:
-                if dic["line_dash"] != line_dash:
-                    line_dash = dic["line_dash"]
-                    commands.append("%s d" % line_dash)
-            elif line_dash != "":
-                line_dash = ""
-                commands.append("[] 0 d")
-                    
-            if type == "rect":
-                x, y, sx, sy = dic["coord"]
-                re_str = "%d %d %d %d" % (get_x(x), get_y(y), get_sx(sx), get_sy(sy))
-                if "fill" in dic:
-                    commands.append("%s re f" % re_str)
-                if "stroke" in dic:
-                    commands.append("%s re S" % re_str)
-                    
-            elif type == "line":
-                x, y, x2, y2 = dic["coord"]
-                commands.append("%d %d m %d %d l h S" % \
-                                (get_x(x), get_y(y), get_x(x2), get_y(y2)))
-            elif type == "text":
-                assert "text" in dic and "font_size" in dic
-                x, y = dic["coord"]
-                commands.append("BT /F1 %d Tf %d %d Td (%s) Tj ET" % \
-                                (dic["font_size"], get_x(x), get_y(y), dic["text"]))
-            else:
-                assert type == "state"
-                
-            self.stream.append(' '.join(commands))
-
-        # Write stream
-        self.add_pdf_stream('\n'.join(self.stream))
-
-        # Write xref and trailer
-        to_xref = pdfDraw.tell()
-        print >> pdfDraw, 'xref'
-        print >> pdfDraw, "0 %d" % (len(self.objects) + 1)
-        print >> pdfDraw, r'0000000000 65535 f'
-        for object in self.objects:
-            print >> pdfDraw, "%s 00000 n" % "{:010}".format(object)
-        print >> pdfDraw, 'trailer <</Size %d /Root 1 0 R>>' % (len(self.objects) + 1)
-        print >> pdfDraw, 'startxref'
-        print >> pdfDraw, str(to_xref)
-        print >> pdfDraw, r'%%EOF'
-        
-        self.pdfDraw.close()
-
-        
-    def add_pdf_object(self, obj):
-        self.objects.append(self.pdfDraw.tell())
-        print >> self.pdfDraw, "%d 0 obj %s" % (len(self.objects), obj)
-        print >> self.pdfDraw, 'endobj'
-
-
-    def add_pdf_stream(self, stream):
-        self.add_pdf_object("<</Length %d>>\nstream\n%s\nendstream" % (len(stream), stream))
-
-        
-    # Draw graph
-    #   Top left as (0, 0) and Bottom right as (width, height)
-    def draw(self,
-             begin_y,
-             title = ""):
-        assert len(self.nodes) > 0
-        nodes = [[id, node.left, node.right] for id, node in self.nodes.items()]
-        def node_cmp(a, b):
-            return a[1] - b[1]
-        nodes = sorted(nodes, cmp=node_cmp)
-        max_right = len(self.backbone)
-
-        # display space
-        end_y = begin_y + 10000
-        dspace = [[[begin_y, end_y]]] * (max_right + 1)
-        def get_dspace(left, right, height):
-            assert left < len(dspace) and right < len(dspace)
-            range1 = dspace[left]
-            for range2 in dspace[left + 1:right + 1]:
-                new_range = []
-                # sub range
-                for t1, b1 in range1:
-                    for t2, b2 in range2:
-                        if b1 < t2:
-                            break
-                        if b2 < t1:
-                            continue
-                        t, b = max(t1, t2), min(b1, b2)
-                        if b - t >= height:
-                            new_range.append([t, b])
-
-                range1 = new_range
-            if len(range1) <= 0:
-                return -1
-
-            t, b = range1[0]
-            assert b - t >= height
-            b = t + height
-            for i in range(left, right+1):
-                range1 = dspace[i]
-                range2 = []
-                found = False
-                for j in range(len(range1)):
-                    t2, b2 = range1[j]
-                    if t2 <= t and b <= b2:
-                        found = True
-                        if t2 < t:
-                            range2.append([t2, t])
-                        if b < b2:
-                            range2.append([b, b2])
-                    else:
-                        range2.append([t2, b2])
-                dspace[i] = range2
-                assert found
-            return t
-
-        def get_x(x):
-            return self.left_margin + x * self.scalex
-
-        def get_y(y):
-            return self.height - self.top_margin - y * self.scaley
-
-        # Get scalar
-        def get_sx(x):
-            return x * self.scalex
-
-        def get_sy(y):
-            return y * self.scaley
-
-        # Draw exons
-        y = get_dspace(0, max_right, 14)
-        for e in range(len(self.exons)):
-            left, right = self.exons[e]
-            right += 1
-
-            # Draw exon
-            self.draw_items.append(["rect",
-                                    {"coord" : [left, y + 10, right - left, 10],
-                                     "fill" : "1 1 1",
-                                     "stroke" : "0 0 0",
-                                     "line_width" : 2}])
-
-            primary = False
-            for left_, _ in self.primary_exons:
-                if left == left_:
-                    primary = True
-                    break                
-
-            # Draw label
-            self.draw_items.append(["text",
-                                    {"coord" : [left + 2, y + 7],
-                                     "text" : "Exon %d%s" % (e+1, " (primary)" if primary else ""),
-                                     "fill" : "0 0 0",
-                                     "font_size" : 12}])
-            if e > 0:
-                prev_right = self.exons[e-1][1] + 1
-                self.draw_items.append(["line",
-                                        {"coord": [prev_right, y + 5, left, y + 5],
-                                         "line_width" : 2}])
-
-        # Draw backbone sequence
-        y = get_dspace(0, max_right, 4)
-        for pos in range(len(self.backbone)):
-            base = self.backbone[pos]
-            self.draw_items.append(["text",
-                                    {"coord" : [pos, y + 2],
-                                     "text" : base,
-                                     "fill" : "0.5 0 0.5",
-                                     "font_size" : 8}])
-
-        # Draw true or predicted alleles
-        node_colors = ["1 1 0", "0 1 0", "1 0.8 0.64", "0.76 0.27 0.5"]
-        allele_node_colors = ["0.87 0.87 0", "0 0.53 0", "0.87 0.66 0.5", "0.63 0.14 0.38"]
-        def draw_alleles(allele_node_dic, allele_node_colors, display = False):
-            if len(allele_node_dic) <= 0:
-                return
-            allele_nodes, seqs, colors = self.get_node_comparison_info(allele_node_dic)
-
-            def draw_coverage(allele_node, allele_id, left, right, allele_node_color):
-                if allele_id not in self.coverage:
-                    return
-                y = get_dspace(0, max_right, 14)
-                for p in range(left, right):
-                    cov = math.ceil(self.coverage[allele_id][p] * 12)
-                    self.draw_items.append(["rect",
-                                            {"coord" : [p, y + 13, 1, cov],
-                                             "fill" : allele_node_color}])
-
-
-            for n_ in range(len(allele_nodes)):
-                n = -1
-                prob = ""
-                if not display and \
-                   not self.simulation and \
-                   len(self.allele_node_order) == len(allele_node_dic):
-                    allele_id, prob = self.allele_node_order[n_]
-                    for n2_ in range(len(allele_nodes)):
-                        if allele_id == allele_nodes[n2_][0]:
-                            n = n2_
-                            break
-                    prob = ": %.2f" % prob
-                else:
-                    n = n_
-                assert n >= 0 and n < len(allele_nodes)
-                allele_id, left, right = allele_nodes[n]
-                right += 1
-                allele_node = allele_node_dic[allele_id]
-                allele_node_color = allele_node_colors[n % len(allele_node_colors)]
-
-                draw_coverage(allele_node, allele_id, left, right, allele_node_color)
-                
-                y = get_dspace(0, max_right, 14)
-
-                # Draw allele name
-                if display:
-                    allele_type = "display"
-                else:
-                    if self.simulation:
-                        allele_type = "true"
-                    else:
-                        allele_type = "predicted"
-                self.draw_items.append(["text",
-                                    {"coord" : [-55, y + 7],
-                                     "text" : "%s (%s, %s)" % (allele_id, "partial" if allele_id in self.partial_allele_ids else "full", allele_type),
-                                     "fill" : "0 0 1",
-                                     "font_size" : 18}])
-                # Draw node
-                self.draw_items.append(["rect",
-                                        {"coord" : [left, y + 10, right - left, 10],
-                                         "fill" : allele_node_color,
-                                         "stroke" : "0 0 0",
-                                         "line_width" : 2}])
-
-
-                color_boxes = []
-                c = 0
-                while c < len(colors[n]):
-                    color = colors[n][c]
-                    c2 = c + 1
-                    if color != 'N':                        
-                        while c2 < len(colors[n]):
-                            color2 = colors[n][c2]
-                            if color != color2:
-                                break
-                            c2 += 1
-                        color_boxes.append([c, c2, color])
-                    c = c2
-
-                # Draw variants
-                for color_box in color_boxes:
-                    cleft, cright, color = color_box
-                    cleft += left; cright += left
-                    if color == 'B':
-                        color = "0 0 1" # blue 
-                    else:
-                        color = "0.12 0.56 1"
-                    # DK - debugging purposes
-                    color = "0 0 1"
-                    self.draw_items.append(["rect",
-                                            {"coord" : [cleft, y + 9, cright - cleft, 8],
-                                             "fill" : color}])
-
-            return allele_nodes, seqs, colors
-
-        allele_nodes, seqs, colors = draw_alleles(self.true_allele_nodes if self.simulation else self.predicted_allele_nodes,
-                                                  allele_node_colors)
-        draw_alleles(self.display_allele_nodes,
-                     ["1 0.96 0.95"],
-                     True) # display alleles?
-
-        # Draw location at every 100bp
-        y = get_dspace(0, nodes[-1][2], 14)
-        for pos in range(0, nodes[-1][2], 100):
-            # Draw label
-            self.draw_items.append(["text",
-                                    {"coord" : [pos + 1, y + 2],
-                                     "text" : "%d" % (pos + 1),
-                                     "fill" : "0 0 0",
-                                     "font_size" : 10}])
-                
-        # Draw nodes
-        node_to_y = {}
-        draw_title = False
-        for id, left, right in nodes:
-            node = self.nodes[id]
-            nodes2 = [[node, left, right]]
-            if id in self.other_nodes:
-                for node in self.other_nodes[id]:
-                    nodes2.append([node, node.left, node.right])
-                    if left > node.left:
-                        left = node.left
-                    if right < node.right:
-                        right = node.right
-
-            # Get y position
-            y = get_dspace(left, right, 14 * len(nodes2))
-            for node, left, right in nodes2:
-                if y < 0:
-                    continue
-                node_to_y[id] = y
-
-                node_vars = node.get_vars()
-                node_var_ids = node.get_var_ids()
-                if len(nodes2) > 1:
-                    color = "0.85 0.85 0.85"
-                elif len(allele_nodes) > 0:
-                    color = "1 1 1"
-                    max_common = -sys.maxint
-                    for a in range(len(allele_nodes)):
-                        allele_node_id, allele_left, allele_right = allele_nodes[a]
-                        if right - left <= 500 and (left < allele_left or right > allele_right):
-                            continue
-                        if self.simulation:
-                            allele_node = self.true_allele_nodes[allele_node_id]
-                        else:
-                            allele_node = self.predicted_allele_nodes[allele_node_id]
-                        allele_vars = allele_node.get_var_ids(left, right)
-                        common_vars = set(node_var_ids) & set(allele_vars)
-                        tmp_common = len(common_vars) - len(set(node_var_ids) | set(allele_vars))
-                        if max_common < tmp_common:
-                            max_common = tmp_common
-                            color = node_colors[a % len(node_colors)]
-                        elif max_common == tmp_common:
-                            color = "1 1 1"
-                else:
-                    color = "1 1 0" # yellow
-
-                # Draw node
-                right += 1
-                self.draw_items.append(["rect",
-                                        {"coord" : [left, y + 10, right - left, 10],
-                                         "fill" : color,
-                                         "stroke" : "0 0 0",
-                                         "line_width" : 2}])
-                
-                # Draw variants
-                for var_id, pos in node_vars:
-                    if var_id == "gap":
-                        var_type, var_left = "single", pos
-                        color = "0 0 0"
-                    elif var_id == "unknown" or var_id.startswith("nv"):
-                        var_type, var_left = "single", pos
-                        color = "1 0 0"
-                    else:
-                        var_type, var_left, var_data = self.gene_vars[var_id]
-                        color = "0 0 1"
-                    if var_type == "single":
-                        var_right = var_left + 1
-                    elif var_type == "insertion":
-                        var_right = var_left + len(var_data)
-                    else:
-                        assert var_type == "deletion"
-                        var_right = var_left + int(var_data)
-                    self.draw_items.append(["rect",
-                                            {"coord" : [var_left, y + 9, var_right - var_left, 8],
-                                             "fill" : color}])
-
-                # Draw label
-                if get_sx(right - left) >= 300:
-                    self.draw_items.append(["text",
-                                            {"coord" : [left + 2, y + 7],
-                                             "text" : node.id,
-                                             "fill" : "0 0 1",
-                                             "font_size" : 12}])
-            
-
-                if not draw_title:
-                    draw_title = True
-                    self.draw_items.append(["text",
-                                            {"coord" : [-68, y + 7],
-                                             "text" : title,
-                                             "fill" : "0 0 0",
-                                             "font_size" : 24}])
-                    
-                y += 14
-
-        curr_y = get_dspace(0, nodes[-1][2], 1)
-        self.unscaled_height = curr_y if curr_y > 0 else end_y
-        return self.unscaled_height
-
diff --git a/hisatgenotype_modules/hisatgenotype_typing_common.py b/hisatgenotype_modules/hisatgenotype_typing_common.py
deleted file mode 100755
index 04cb95f3..00000000
--- a/hisatgenotype_modules/hisatgenotype_typing_common.py
+++ /dev/null
@@ -1,1552 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT-genotype.
-#
-# HISAT-genotype is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT-genotype is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import math
-import random
-from copy import deepcopy
-from datetime import datetime
-
-
-##################################################
-#   Sequence processing routines
-##################################################
-
-
-"""
-"""
-def reverse_complement(seq):
-    comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-    rc_seq = ""
-    for s in reversed(seq):
-        if s in comp_table:
-            rc_seq += comp_table[s]
-        else:
-            rc_seq += s
-    return rc_seq
-
-
-"""
-"""
-def read_genome(genome_file):
-    chr_dic, chr_names, chr_full_names = {}, [], []
-    chr_name, chr_full_name, sequence = "", "", ""
-    for line in genome_file:
-        if line.startswith(">"):
-            if chr_name and sequence:
-                chr_dic[chr_name] = sequence
-                chr_names.append(chr_name)
-            chr_full_name = line.strip()[1:]
-            chr_name = line.strip().split()[0][1:]
-            chr_full_names.append(chr_full_name)
-            sequence = ""
-        else:
-            sequence += line.strip()
-    if chr_name and sequence:
-        chr_dic[chr_name] = sequence
-        chr_names.append(chr_name)
-        chr_full_names.append(chr_full_name)
-    return chr_dic, chr_names, chr_full_names
-
-
-##################################################
-#   Alleles, variants, haplotypes, etc.
-##################################################
-
-
-"""
-"""
-def read_allele_sequences(fname):
-    allele_seqs = {}
-    allele_name, sequence = "", ""
-    for line in open(fname):
-        if line.startswith(">"):
-            if allele_name != "" and allele_name not in allele_seqs:
-                allele_seqs[allele_name] = sequence
-            allele_name = line.strip()[1:]
-            sequence = ""
-        else:
-            sequence += line.strip()
-    if allele_name != "" and allele_name not in allele_seqs:
-        allele_seqs[allele_name] = sequence
-    return allele_seqs
-
-
-"""
-"""
-def read_variants(fname):
-    allele_vars = {}
-    for line in open(fname):
-        var_id, type, allele_name, left, data = line.strip().split()
-        left = int(left)
-        if type == "deletion":
-            data = int(data)
-        if allele_name not in allele_vars:
-            allele_vars[allele_name] = []
-        allele_vars[allele_name].append([left, type, data, var_id])
-    return allele_vars
-
-
-"""
-"""
-def read_haplotypes(fname):
-    allele_haplotypes = {}
-    for line in open(fname):
-        haplotype_id, allele_name, left, right, vars = line.strip().split()
-        vars = vars.split(',')
-        left, right = int(left), int(right)
-        if allele_name not in allele_haplotypes:
-            allele_haplotypes[allele_name] = []
-        allele_haplotypes[allele_name].append([left, right, vars])
-    return allele_haplotypes
-
-
-"""
-"""
-def read_links(fname):
-    links = []
-    for line in open(fname):
-        var_id, allele_names = line.strip().split('\t')
-        links.append([var_id, allele_names])
-    return links
-
-
-"""
-Compare two variants
-"""
-def compare_vars(a, b):
-    a_pos, a_type, a_data = a[:3]
-    b_pos, b_type, b_data = b[:3]
-
-    if a_pos != b_pos:
-        return a_pos - b_pos
-    if a_type != b_type:
-         if a_type == 'I':
-             return -1
-         elif b_type == 'I':
-             return 1
-         if a_type == 'S':
-             return -1
-         else:
-             return 1
-    if a_data < b_data:
-        return -1
-    elif a_data > b_data:
-        return 1
-    else:
-        return 0
-
-
-"""
-"""
-def lower_bound(Var_list, pos):
-    low, high = 0, len(Var_list)
-    while low < high:
-        m = (low + high) / 2
-        m_pos = Var_list[m][0]
-        if m_pos < pos:
-            low = m + 1
-        elif m_pos > pos:
-            high = m
-        else:
-            assert m_pos == pos
-            while m > 0:
-                if Var_list[m-1][0] < pos:
-                    break
-                m -= 1
-            return m
-    return low
-
-
-
-"""
-"""
-def check_files(fnames):
-    for fname in fnames:
-        if not os.path.exists(fname):
-            return False
-    return True
-
-
-##################################################
-#   Database releated routines
-##################################################
-
-    
-"""
-Download GRCh38 human reference and HISAT2 indexes
-"""
-def download_genome_and_index():
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not check_files(HISAT2_fnames):
-        os.system("wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data/grch38.tar.gz; tar xvzf grch38.tar.gz; rm grch38.tar.gz")
-        os.system("hisat2-inspect grch38/genome > genome.fa")
-        os.system("samtools faidx genome.fa")
-
-
-"""
-"""
-def clone_hisatgenotype_database():
-    os.system("git clone https://github.com/DaehwanKimLab/hisatgenotype_db.git")
-    os.system("cd hisatgenotype_db; git checkout hisatgenotype_v1.0.2_beta; cd ..")
-
-
-"""
-"""
-def extract_database_if_not_exists(base,
-                                   locus_list,
-                                   inter_gap = 30,
-                                   intra_gap = 50,
-                                   partial = True,
-                                   verbose = False):
-    fnames = [base + "_backbone.fa",
-              base + "_sequences.fa",
-              base + ".locus",
-              base + ".snp",
-              base + ".index.snp",
-              base + ".haplotype",
-              base + ".link",
-              base + ".allele",
-              base + ".partial"]
-    if check_files(fnames):
-        return
-
-    extract_cmd = ["hisatgenotype_extract_vars.py"]
-    extract_cmd += ["--base", base]
-    if len(locus_list) > 0:
-        extract_cmd += ["--locus-list", ','.join(locus_list)]    
-    if not partial:
-        extract_cmd += ["--no-partial"]
-    extract_cmd += ["--inter-gap", str(inter_gap),
-                    "--intra-gap", str(intra_gap)]
-    if base == "hla":
-        extract_cmd += ["--min-var-freq", "0.1"]
-
-    if base == "codis":
-        extract_cmd += ["--leftshift"]
-
-    # DK - debugging purposes
-    # extract_cmd += ["--ext-seq", "300"]
-    if verbose:
-        print >> sys.stderr, "\tRunning:", ' '.join(extract_cmd)
-    proc = subprocess.Popen(extract_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-    proc.communicate()
-
-    if not check_files(fnames):
-        print >> sys.stderr, "Error: hisatgenotype_extract_vars failed!"
-        sys.exit(1)
-
-        
-"""
-"""
-def build_index_if_not_exists(base,
-                              aligner,
-                              index_type,
-                              threads = 1,
-                              verbose = False):
-    if aligner == "hisat2":
-        # Build HISAT2 graph indexes based on the above information
-        if index_type == "graph":
-            hisat2_graph_index_fnames = ["%s.graph.%d.ht2" % (base, i+1) for i in range(8)]
-            if not check_files(hisat2_graph_index_fnames):
-                build_cmd = ["hisat2-build",
-                             "-p", str(threads),
-                             "--snp", "%s.index.snp" % base,
-                             "--haplotype", "%s.haplotype" % base,
-                             "%s_backbone.fa" % base,
-                             "%s.graph" % base]
-                if verbose:
-                    print >> sys.stderr, "\tRunning:", ' '.join(build_cmd)
-                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                proc.communicate()        
-                if not check_files(hisat2_graph_index_fnames):
-                    print >> sys.stderr, "Error: indexing HLA failed!  Perhaps, you may have forgotten to build hisat2 executables?"
-                    sys.exit(1)
-        # Build HISAT2 linear indexes based on the above information
-        else:
-            assert index_type == "linear"
-            hisat2_linear_index_fnames = ["%s.linear.%d.ht2" % (base, i+1) for i in range(8)]
-            if not check_files(hisat2_linear_index_fnames):
-                build_cmd = ["hisat2-build",
-                             "%s_backbone.fa,%s_sequences.fa" % (base, base),
-                             "%s.linear" % base]
-                proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'), stderr=open("/dev/null", 'w'))
-                proc.communicate()        
-                if not check_files(hisat2_linear_index_fnames):
-                    print >> sys.stderr, "Error: indexing HLA failed!"
-                    sys.exit(1)                    
-    else:
-        # Build Bowtie2 indexes based on the above information
-        assert aligner == "bowtie2" and index_type == "linear"        
-        bowtie2_index_fnames = ["%s.%d.bt2" % (base, i+1) for i in range(4)]
-        bowtie2_index_fnames += ["%s.rev.%d.bt2" % (base, i+1) for i in range(2)]
-        if not tcheck_files(bowtie2_index_fnames):
-            build_cmd = ["bowtie2-build",
-                         "%s_backbone.fa,%s_sequences.fa" % (base, base),
-                         base]
-            proc = subprocess.Popen(build_cmd, stdout=open("/dev/null", 'w'))
-            proc.communicate()        
-            if not check_files(bowtie2_index_fnames):
-                print >> sys.stderr, "Error: indexing HLA failed!"
-                sys.exit(1)
-
-                    
-
-##################################################
-#   Read simulation and alignment
-##################################################
-
-
-"""
-Simulate reads from alleles with headers (>) filled with mapping information.
-  For an example, see hisat2_test_HLA_genotyping.py.
-"""
-def simulate_reads(seq_dic,                       # seq_dic["A"]["A*24:36N"] = "ACGTCCG ..."
-                   base_fname,                    # hla, codis, cyp, or so on
-                   allele_list,                   # ["A*32:29", "B*07:02:01"]
-                   Vars,                          # Vars["A"]["hv326"] = ["single", 604, "C"]
-                   Links,
-                   simulate_interval = 1,
-                   read_len = 100,
-                   frag_len = 250,
-                   perbase_errorrate = 0.0,
-                   perbase_snprate = 0.0,
-                   skip_fragment_regions = []):
-    reads_1, reads_2 = [], []
-    num_pairs = []
-    for allele_names in allele_list:
-        gene = allele_names[0].split('*')[0]
-        num_pairs.append([])
-
-        # Introduce SNPs into allele sequences
-        def introduce_snps(seq):
-            seq = list(seq)
-            for i in range(len(seq)):
-                if random.random() * 100 < perbase_snprate:
-                    if seq[i] == 'A':
-                        alt_bases = ['C', 'G', 'T']
-                    elif seq[i] == 'C':
-                        alt_bases = ['A', 'G', 'T']
-                    elif seq[i] == 'G':
-                        alt_bases = ['A', 'C', 'T']
-                    else:
-                        assert seq[i] == 'T'
-                        alt_bases = ['A', 'C', 'G']
-                    random.shuffle(alt_bases)
-                    alt_base = alt_bases[0]
-                    seq[i] = alt_base
-            seq = ''.join(seq)
-            return seq
-
-        # Simulate reads from two alleles
-        def simulate_reads_impl(seq,
-                                seq_map,
-                                ex_seq_map,
-                                ex_seq,
-                                ex_desc,
-                                simulate_interval = 1,
-                                read_len = 100,
-                                frag_len = 250,
-                                perbase_errorrate = 0.0,
-                                skip_fragment_regions = []):
-            # Introduce sequencing errors
-            def introduce_seq_err(read_seq, pos):
-                read_seq = list(read_seq)
-                for i in range(read_len):
-                    map_pos = seq_map[pos + i]
-                    if ex_desc[map_pos] != "":
-                        continue
-                    if random.random() * 100 < perbase_errorrate:
-                        if read_seq[i] == 'A':
-                            alt_bases = ['C', 'G', 'T']
-                        elif read_seq[i] == 'C':
-                            alt_bases = ['A', 'G', 'T']
-                        elif read_seq[i] == 'G':
-                            alt_bases = ['A', 'C', 'T']
-                        else:
-                            assert read_seq[i] == 'T'
-                            alt_bases = ['A', 'C', 'G']
-                        random.shuffle(alt_bases)
-                        alt_base = alt_bases[0]
-                        read_seq[i] = alt_base
-                read_seq = ''.join(read_seq)
-                return read_seq                            
-                            
-            # Get read alignment, e.g., 260|R_483_61M5D38M23D1M_46|S|hv154,3|S|hv162,10|D|hv185,38|D|hv266
-            def get_info(read_seq, pos):
-                info = "%d_" % (seq_map[pos] + 1)
-                total_match, match, sub_match = 0, 0, 0
-                var_str = ""
-                ins_len, ins_var = 0, ""
-                for i in range(pos, pos + read_len):
-                    map_i = ex_seq_map[i]
-                    assert ex_seq[map_i] != 'D'
-                    total_match += 1
-                    match += 1
-                    if ex_seq[map_i] == 'I':
-                        if ins_var != "":
-                            assert ins_var == ex_desc[map_i]
-                        ins_var = ex_desc[map_i]
-                        ins_len += 1
-                    elif ins_var != "":
-                        if var_str != "":
-                            var_str += ','
-                        var_str += ("%s|I|%s" % (sub_match, ins_var))
-                        ins_len, ins_var = 0, ""
-                        sub_match = 0
-                    if ex_seq[map_i] != 'I':
-                        if ex_desc[map_i] != "" or read_seq[i-pos] != ex_seq[map_i]:
-                            if var_str != "":
-                                var_str += ','
-                            var_str += ("%d|S|%s" % (sub_match, ex_desc[map_i] if ex_desc[map_i] != "" else "unknown"))
-                            sub_match = 0
-                        else:
-                            sub_match += 1
-                    if i + 1 < pos + read_len and ex_seq[map_i+1] == 'D':
-                        assert match > 0
-                        info += ("%dM" % match)
-                        match = 0
-                        del_len = 1
-                        while map_i + 1 + del_len < len(ex_seq):
-                            if ex_seq[map_i + 1 + del_len] != 'D':
-                                break
-                            del_len += 1
-                        info += ("%dD" % del_len)
-                        if var_str != "":
-                            var_str += ','
-                        var_str += ("%s|D|%s" % (sub_match, ex_desc[map_i + 1]))
-                        sub_match = 0
-                assert match > 0
-                info += ("%dM" % match)
-                assert total_match == read_len
-                if var_str:
-                    info += "_"
-                    info += var_str                
-                return info
-                
-            comp_table = {'A':'T', 'C':'G', 'G':'C', 'T':'A'}
-            reads_1, reads_2 = [], []
-            for i in range(0, len(seq) - frag_len + 1, simulate_interval):
-                if len(skip_fragment_regions) > 0:
-                    skip = False
-                    for skip_left, skip_right in skip_fragment_regions:
-                        if i <= skip_right and i + frag_len > skip_left:
-                            skip = True
-                            break
-                    if skip:
-                        continue
-                        
-                pos1 = i
-                seq1 = seq[pos1:pos1+read_len]
-                if perbase_errorrate > 0.0:
-                    seq1 = introduce_seq_err(seq1, pos1)
-                info1 = get_info(seq1, pos1)
-                reads_1.append([seq1, info1])
-                
-                pos2 = i + frag_len - read_len
-                seq2 = seq[pos2:pos2+read_len]
-                if perbase_errorrate > 0.0:
-                    seq2 = introduce_seq_err(seq2, pos2)                
-                info2 = get_info(seq2, pos2)
-                tmp_read_2 = reversed(seq2)
-                read_2 = ""
-                for s in tmp_read_2:
-                    if s in comp_table:
-                        read_2 += comp_table[s]
-                    else:
-                        read_2 += s
-                reads_2.append([read_2, info2])
-            return reads_1, reads_2
-
-        # for each allele in a list of alleles such as ['A*32:29', 'B*07:02:01']
-        for allele_name in allele_names:
-            allele_seq = seq_dic[gene][allele_name]
-            backbone_seq = seq_dic[gene]["%s*BACKBONE" % gene]
-            allele_ex_seq = list(backbone_seq)
-            allele_ex_desc = [''] * len(allele_ex_seq)
-            allele_seq_map = [i for i in range(len(allele_seq))]
-            allele_ex_seq_map = [i for i in range(len(allele_seq))]
-
-            if perbase_snprate > 0:
-                HLA_seq = introduce_snps(allele_seq)
-
-            # Extract variants included in each allele
-            var_ids = []
-            for var_id, allele_list in Links.items():
-                if allele_name in allele_list:
-                    var_ids.append(var_id)
-
-            def var_cmp(a, b):
-                assert a.startswith("hv") and b.startswith("hv")
-                return int(a[2:]) - int(b[2:])
-            var_ids = sorted(var_ids, cmp=var_cmp)
-
-            # Build annotated sequence for the allele w.r.t backbone sequence
-            add_pos = 0
-            for var_id in var_ids:
-                var_type, var_pos, var_data = Vars[gene][var_id]
-                var_pos += add_pos
-                if var_type == "single":
-                    allele_ex_seq[var_pos] = var_data
-                    allele_ex_desc[var_pos] = var_id
-                elif var_type == "deletion":
-                    del_len = int(var_data)
-                    assert var_pos + del_len <= len(allele_ex_seq)
-                    allele_ex_seq[var_pos:var_pos+del_len] = ['D'] * del_len
-                    allele_ex_desc[var_pos:var_pos+del_len] = [var_id] * del_len
-                else:
-                    assert var_type == "insertion"
-                    ins_len = len(var_data)
-                    allele_ex_seq = allele_ex_seq[:var_pos] + (['I'] * ins_len) + allele_ex_seq[var_pos:]
-                    allele_ex_desc = allele_ex_desc[:var_pos] + ([var_id] * ins_len) + allele_ex_desc[var_pos:]
-                    add_pos += ins_len
-            allele_ex_seq = ''.join(allele_ex_seq)
-            assert len(backbone_seq) + add_pos == len(allele_ex_seq)            
-
-            # Build mapping from the allele to the annotated sequence
-            prev_j, minus_pos = 0, 0
-            for i in range(len(allele_seq)):
-                for j in range(prev_j, len(allele_ex_seq)):
-                    if allele_ex_seq[j] != 'D':
-                        if allele_ex_seq[j] == 'I':
-                            minus_pos += 1
-                        break
-                allele_seq_map[i] = j - minus_pos
-                allele_ex_seq_map[i] = j
-                prev_j = j + 1
-
-            # DK - debugging purposes
-            """
-            for t in range(0, len(allele_ex_seq), 100):
-                print t, allele_ex_seq[t:t+100]
-                print t, '-'.join(allele_ex_desc[t:t+100])
-                print t, allele_seq_map[t:t+100]
-            print "allele_seq length:", len(allele_seq)
-            print len(allele_ex_seq), "vs.", len(seq_dic[gene]["A*BACKBONE"]), "vs.", len(allele_seq_map)
-            print allele_ex_seq[1943:1946]
-            print allele_ex_desc[1943:1946]
-            sys.exit(1)
-            """
-            
-            tmp_reads_1, tmp_reads_2 = simulate_reads_impl(allele_seq,
-                                                           allele_seq_map,
-                                                           allele_ex_seq_map,
-                                                           allele_ex_seq,
-                                                           allele_ex_desc,
-                                                           simulate_interval,
-                                                           read_len,
-                                                           frag_len,
-                                                           perbase_errorrate,
-                                                           skip_fragment_regions)
-            reads_1 += tmp_reads_1
-            reads_2 += tmp_reads_2
-            num_pairs[-1].append(len(tmp_reads_1))
-
-    # Write reads into a FASTA file
-    def write_reads(reads, idx):
-        read_file = open('%s_input_%d.fa' % (base_fname, idx), 'w')
-        for read_i in range(len(reads)):
-            query_name = "%d|%s_%s" % (read_i + 1, "LR"[idx-1], reads[read_i][1])
-            if len(query_name) > 254:
-                query_name = query_name[:254]
-            print >> read_file, ">%s" % query_name
-            print >> read_file, reads[read_i][0]
-        read_file.close()
-    write_reads(reads_1, 1)
-    write_reads(reads_2, 2)
-
-    return num_pairs
-
-
-"""
-Align reads, and sort the alignments into a BAM file
-"""
-def align_reads(aligner,
-                simulation,
-                index_name,
-                index_type,
-                base_fname,
-                read_fname,
-                fastq,
-                threads,
-                out_fname,
-                verbose):
-    if aligner == "hisat2":
-        aligner_cmd = [aligner, "--mm"]
-        if not simulation:
-            aligner_cmd += ["--no-unal"]            
-        DNA = True
-        if DNA:
-            aligner_cmd += ["--no-spliced-alignment"] # no spliced alignment
-            aligner_cmd += ["-X", "1000"] # max fragment length
-        if index_type == "linear":
-            aligner_cmd += ["-k", "10"]
-        else:
-            aligner_cmd += ["--max-altstried", "64"]
-            aligner_cmd += ["--haplotype"]
-            if base_fname == "codis":
-                aligner_cmd += ["--enable-codis"]
-                aligner_cmd += ["--no-softclip"]
-
-    elif aligner == "bowtie2":
-        aligner_cmd = [aligner,
-                       "--no-unal",
-                       "-k", "10"]
-    else:
-        assert False
-    aligner_cmd += ["-x", index_name]
-    assert len(read_fname) in [1,2]
-    aligner_cmd += ["-p", str(threads)]
-    if not fastq:
-        aligner_cmd += ["-f"]
-    if len(read_fname) == 1:
-        aligner_cmd += ["-U", read_fname[0]]
-    else:
-        aligner_cmd += ["-1", "%s" % read_fname[0],
-                        "-2", "%s" % read_fname[1]]
-
-    if verbose >= 1:
-        print >> sys.stderr, ' '.join(aligner_cmd)
-    align_proc = subprocess.Popen(aligner_cmd,
-                                  stdout=subprocess.PIPE,
-                                  stderr=open("/dev/null", 'w'))
-
-    sambam_cmd = ["samtools",
-                  "view",
-                  "-bS",
-                  "-"]
-    sambam_proc = subprocess.Popen(sambam_cmd,
-                                   stdin=align_proc.stdout,
-                                   stdout=open(out_fname + ".unsorted", 'w'),
-                                   stderr=open("/dev/null", 'w'))
-    sambam_proc.communicate()
-    if index_type == "graph":
-        bamsort_cmd = ["samtools",
-                       "sort",
-                       out_fname + ".unsorted",
-                       "-o", out_fname]
-        bamsort_proc = subprocess.Popen(bamsort_cmd,
-                                        stderr=open("/dev/null", 'w'))
-        bamsort_proc.communicate()
-
-        bamindex_cmd = ["samtools",
-                        "index",
-                        out_fname]
-        bamindex_proc = subprocess.Popen(bamindex_cmd,
-                                         stderr=open("/dev/null", 'w'))
-        bamindex_proc.communicate()
-
-    os.system("rm %s" % (out_fname + ".unsorted"))
-
-
-"""
-HISAT-genotype's mpileup
-"""
-def get_mpileup(alignview_cmd,
-                ref_seq,
-                base_locus,
-                vars,
-                allow_discordant):
-    ref_seq_len = len(ref_seq)
-    mpileup = []
-    for i in range(ref_seq_len):
-        mpileup.append([[], {}])
-        
-    proc = subprocess.Popen(alignview_cmd,
-                            stdout=subprocess.PIPE,
-                            stderr=open("/dev/null", 'w'))
-
-    prev_pos = -1
-    cigar_re = re.compile('\d+\w')
-    for line in proc.stdout:
-        line = line.strip()
-        cols = line.split()
-        read_id, flag, _, pos, _, cigar_str = cols[:6]
-        read_seq = cols[9]
-        flag, pos = int(flag), int(pos)
-        # Unalined?
-        if flag & 0x4 != 0:
-            continue
-        pos -= (base_locus + 1)
-        if pos < 0:
-            continue
-
-        # Concordantly mapped?
-        if flag & 0x2 != 0:
-            concordant = True
-        else:
-            concordant = False
-
-        if not allow_discordant and not concordant:
-            continue
-
-        read_pos, left_pos = 0, pos
-        right_pos = left_pos
-        cigars = cigar_re.findall(cigar_str)
-        cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-        for i in range(len(cigars)):
-            cigar_op, length = cigars[i]
-            if cigar_op in "MD":
-                for j in range(length):
-                    if cigar_op == 'M':
-                        read_nt = read_seq[read_pos + j]
-                    else:
-                        read_nt = 'D'
-                    if right_pos + j < len(mpileup):
-                        if read_nt not in mpileup[right_pos + j][1]:
-                            mpileup[right_pos + j][1][read_nt] = 1
-                        else:
-                            mpileup[right_pos + j][1][read_nt] += 1
-
-            if cigar_op in "MND":
-                right_pos += length
-
-            if cigar_op in "MIS":
-                read_pos += length
-
-    # Choose representative bases or 'D'
-    for i in range(len(mpileup)):
-        nt_dic = mpileup[i][1]
-        num_nt = sum(nt_dic.values())
-        nt_set = []
-        if num_nt >= 20:
-            for nt, count in nt_dic.items():
-                if nt not in "ACGT":
-                    continue
-                if count >= num_nt * 0.2 or count >= 7:
-                    nt_set.append(nt)
-        mpileup[i][0] = nt_set
-
-    # Sort variants
-    var_list = [[] for i in range(len(mpileup))]
-    for var_id, value in vars.items():
-        var_type, var_pos, var_data = value
-        assert var_pos < len(var_list)
-        var_list[var_pos].append([var_id, var_type, var_data])
-
-    # Assign known or unknown variants
-    skip_i, prev_del_var_id = -1, ""
-    for i in range(len(mpileup)):
-        nt_dic = mpileup[i][1]
-        ref_nt = ref_seq[i]
-        new_nt_dic = {}
-        for nt, count in nt_dic.items():
-            var_id = ""
-            if nt == 'D':
-                if i <= skip_i:
-                    assert prev_del_var_id != ""
-                    var_id = prev_del_var_id
-                else:
-                    for var_id_, var_type, var_data in var_list[i]:
-                        if var_type != "deletion":
-                            continue
-                        del_len = int(var_data)
-                        del_exist = True
-                        for j in range(i + 1, i + del_len):
-                            assert j < len(mpileup)
-                            nt_dic2 = mpileup[j][1]
-                            if 'D' not in nt_dic2:
-                                del_exist = False
-                                break
-                        if del_exist:
-                            var_id = var_id_
-                            prev_del_var_id = var_id
-                            skip_i = i + del_len - 1
-                            break                                                
-            elif nt != 'N' and nt != ref_nt:
-                assert nt in "ACGT"
-                id = "unknown"
-                for var_id_, var_type, var_data in var_list[i]:
-                    if var_type != "single":
-                        continue
-                    if nt == var_data:
-                        var_id = var_id_
-                        break
-            new_nt_dic[nt] = [count, var_id]
-                        
-        mpileup[i][1] = new_nt_dic
-
-    return mpileup
-
-
-"""
-"""
-def get_pair_interdist(alignview_cmd,
-                       simulation,
-                       verbose):
-    bamview_proc = subprocess.Popen(alignview_cmd,
-                                    stdout=subprocess.PIPE,
-                                    stderr=open("/dev/null", 'w'))
-    sort_read_cmd = ["sort", "-k", "1,1", "-s"] # -s for stable sorting
-    alignview_proc = subprocess.Popen(sort_read_cmd,
-                                      stdin=bamview_proc.stdout,
-                                      stdout=subprocess.PIPE,
-                                      stderr=open("/dev/null", 'w'))
-
-    dist_list = []
-    prev_read_id = None
-    cigar_re = re.compile('\d+\w')
-    reads = []
-    for line in alignview_proc.stdout:
-        line = line.strip()
-        cols = line.split()
-        read_id, flag, _, pos, _, cigar_str = cols[:6]
-        read_seq = cols[9]
-        flag, pos = int(flag), int(pos)
-        # Unalined?
-        if flag & 0x4 != 0:
-            continue
-
-        if simulation:
-            read_id = read_id.split('|')[0]
-
-        # Concordantly mapped?
-        if flag & 0x2 != 0:
-            concordant = True
-        else:
-            concordant = False
-
-        NH, YT = sys.maxint, ""
-        for i in range(11, len(cols)):
-             col = cols[i]
-             if col.startswith("NH"):
-                 NH = int(col[5:])
-             elif col.startswith("YT"):
-                 YT = col[5:]
-        if NH > 1 or YT != "CP":
-            continue
-
-        if prev_read_id != None and read_id != prev_read_id:
-            if len(reads) == 2:
-                left1, right1 = reads[0]
-                left2, right2 = reads[1]
-                if left1 <= left2:
-                    dist = left2 - right1 - 1
-                else:
-                    dist = left1 - right2 - 1
-                dist_list.append(dist)
-            reads = []
-
-        left_pos = right_pos =  pos
-        cigars = cigar_re.findall(cigar_str)
-        cigars = [[cigar[-1], int(cigar[:-1])] for cigar in cigars]
-        for i in range(len(cigars)):
-            cigar_op, length = cigars[i]
-            if cigar_op in "MND":
-                right_pos += length
-
-        reads.append([left_pos, right_pos - 1])
-        
-        prev_read_id = read_id
-
-    dist_list = sorted(dist_list)
-    dist_avg = sum(dist_list) / max(1, len(dist_list))
-    if len(dist_list) > 0:
-        dist_median = dist_list[len(dist_list)/2]
-    else:
-        dist_median = -1
-
-    return dist_median
-
-
-##################################################
-#   Statistical routines
-##################################################
-
-
-"""
-"""
-def prob_diff(prob1, prob2):
-    diff = 0.0
-    for allele in prob1.keys():
-        if allele in prob2:
-            diff += abs(prob1[allele] - prob2[allele])
-        else:
-            diff += prob1[allele]
-    return diff
-
-
-"""
-"""
-def Gene_prob_cmp(a, b):
-    if a[1] != b[1]:
-        if a[1] < b[1]:
-            return 1
-        else:
-            return -1
-    assert a[0] != b[0]
-    if a[0] < b[0]:
-        return -1
-    else:
-        return 1
-
-
-"""
-"""
-def single_abundance(Gene_cmpt,
-                     remove_low_abundance_allele = False,
-                     Gene_length = {}):
-    def normalize(prob):
-        total = sum(prob.values())
-        for allele, mass in prob.items():
-            prob[allele] = mass / total        
-
-    def normalize_len(prob, length):
-        total = 0
-        for allele, mass in prob.items():
-            assert allele in length
-            total += (mass / length[allele])
-        for allele, mass in prob.items():
-            assert allele in length
-            prob[allele] = mass / length[allele] / total
-
-    Gene_prob, Gene_prob_next = {}, {}
-    for cmpt, count in Gene_cmpt.items():
-        alleles = cmpt.split('-')
-        for allele in alleles:
-            if allele not in Gene_prob:
-                Gene_prob[allele] = 0.0
-            Gene_prob[allele] += (float(count) / len(alleles))
-    if len(Gene_length) > 0:
-        normalize_len(Gene_prob, Gene_length)
-    else:
-        normalize(Gene_prob)
-
-    def next_prob(Gene_cmpt, Gene_prob, Gene_length):
-        Gene_prob_next = {}
-        for cmpt, count in Gene_cmpt.items():
-            alleles = cmpt.split('-')
-            alleles_prob = 0.0
-            for allele in alleles:
-                if allele not in Gene_prob:
-                    continue
-                alleles_prob += Gene_prob[allele]
-            if alleles_prob <= 0.0:
-                continue
-            for allele in alleles:
-                if allele not in Gene_prob:
-                    continue
-                if allele not in Gene_prob_next:
-                    Gene_prob_next[allele] = 0.0
-                Gene_prob_next[allele] += (float(count) * Gene_prob[allele] / alleles_prob)
-        if len(Gene_length) > 0:
-            normalize_len(Gene_prob_next, Gene_length)
-        else:
-            normalize(Gene_prob_next)
-        return Gene_prob_next
-
-    def select_alleles(Gene_prob):
-        if len(Gene_prob) == 0:
-            return Gene_prob
-        Gene_prob2 = {}
-        max_prob = max(Gene_prob.values())
-        for allele, prob in Gene_prob.items():
-            if prob >= max_prob / 10.0:
-                Gene_prob2[allele] = prob
-        return Gene_prob2
-
-    fast_EM = True
-    diff, iter = 1.0, 0
-    while diff > 0.0001 and iter < 1000:
-        Gene_prob_next = next_prob(Gene_cmpt, Gene_prob, Gene_length)
-        if fast_EM:
-            # Accelerated version of EM - SQUAREM iteration
-            #    Varadhan, R. & Roland, C. Scand. J. Stat. 35, 335-353 (2008)
-            #    Also, this algorithm is used in Sailfish - http://www.nature.com/nbt/journal/v32/n5/full/nbt.2862.html
-            Gene_prob_next2 = next_prob(Gene_cmpt, Gene_prob_next, Gene_length)
-            sum_squared_r, sum_squared_v = 0.0, 0.0
-            p_r, p_v = {}, {}
-            for a in Gene_prob.keys():
-                p_r[a] = Gene_prob_next[a] - Gene_prob[a]
-                sum_squared_r += (p_r[a] * p_r[a])
-                p_v[a] = Gene_prob_next2[a] - Gene_prob_next[a] - p_r[a]
-                sum_squared_v += (p_v[a] * p_v[a])
-            if sum_squared_v > 0.0:
-                gamma = -math.sqrt(sum_squared_r / sum_squared_v)
-                for a in Gene_prob.keys():
-                    Gene_prob_next2[a] = max(0.0, Gene_prob[a] - 2 * gamma * p_r[a] + gamma * gamma * p_v[a]);
-                Gene_prob_next = next_prob(Gene_cmpt, Gene_prob_next2, Gene_length)
-
-        diff = prob_diff(Gene_prob, Gene_prob_next)
-        Gene_prob = Gene_prob_next
-
-        # Accelerate convergence
-        if iter >= 10 and remove_low_abundance_allele:
-            Gene_prob = select_alleles(Gene_prob)
-
-        # DK - debugging purposes
-        if iter % 10 == 0 and False:
-            print >> sys.stderr, "iter", iter
-            for allele, prob in Gene_prob.items():
-                if prob >= 0.001:
-                    print >> sys.stderr, "\t", iter, allele, prob
-        
-        iter += 1
-
-    if remove_low_abundance_allele:
-        Gene_prob = select_alleles(Gene_prob)
-    if len(Gene_length) > 0:
-            normalize_len(Gene_prob, Gene_length)
-    else:
-        normalize(Gene_prob)
-    Gene_prob = [[allele, prob] for allele, prob in Gene_prob.items()]
-    Gene_prob = sorted(Gene_prob, cmp=Gene_prob_cmp)
-    return Gene_prob
-
-
-##################################################
-#   Realignment, alternative alignments
-##################################################
-
-
-"""
-Identify alternative haplotypes
-   insertions are not considered...
-
-   INPUT: see the function's parameters below
-   OUPUT: 529-hv8-hv22-606: set(['529-hv13-570', '529-hv4-hv18-590', '529-hv2-hv16-582'])
-          529-hv3-hv17-598: set(['529-hv6-hv21-hv26-610'])
-"""
-def get_alternatives(ref_seq,     # GATAACTAGATACATGAGATAGATTTGATAGATAGATAGATACATACATACATACATACATACAGGATAGATAACTAGG...
-                     allele_vars, # {'VWA*20(22)': ['hv231', 'hv245'], "VWA*16(18')": ['hv235', 'hv250', 'hv256'], ...}
-                     Vars,        # {'hv241': ['deletion', 529, '52'], 'hv240': ['deletion', 529, '48'], ... }
-                     Var_list,    # [[529, 'hv230'], [529, 'hv231'], [529, 'hv232'], [529, 'hv233'], ...]
-                     verbose):
-    haplotype_alts_left, haplotype_alts_right = {}, {}
-    second_order_haplotypes = set()
-    for allele_name, vars in allele_vars.items():
-        for v in range(len(vars) - 1):
-            ht = vars[v] + "-" + vars[v+1]
-            second_order_haplotypes.add(ht)
-
-    rev_Var_list = []
-    for _, var_id in Var_list:
-        var_type, var_pos, var_data = Vars[var_id]
-        if var_type == "deletion":
-            var_pos = var_pos + int(var_data) - 1
-        elif var_type == "insertion":
-            var_pos += 1
-        rev_Var_list.append([var_pos, var_id])
-    rev_Var_list = sorted(rev_Var_list, cmp=lambda a, b: a[0] - b[0])
-
-    def nextbases(haplotype,
-                  left = True,
-                  exclude_list = []):
-        if left:
-            pos = int(haplotype[0]) - 1
-        else:
-            pos = haplotype[-1] + 1
-        if pos < 0 or pos >= len(ref_seq):
-            return []
-
-        if left:
-            bases = [[[pos] + haplotype[1:], ref_seq[pos]]]
-            prev_id = None
-            if len(haplotype) > 2:
-                prev_id = haplotype[1]        
-
-            var_i = lower_bound(rev_Var_list, pos + 1)
-            for var_j in reversed(range(0, var_i)):
-                _, var_id = rev_Var_list[var_j]
-                var_type, var_pos, var_data = Vars[var_id]
-                if var_type == "deletion":
-                    if var_pos == 0:
-                        continue
-                    var_pos = var_pos + int(var_data) - 1
-                if var_pos > pos:
-                    continue
-                if var_pos < pos:
-                    break
-                if var_id in exclude_list:
-                    continue
-                if prev_id:
-                    second_ht = var_id + "-" + prev_id
-                    if second_ht not in second_order_haplotypes:
-                        continue
-
-                if var_type == "single":
-                    bases.append([[var_pos, var_id] + haplotype[1:], var_data])
-                elif var_type == "deletion":
-                    bases2 = nextbases([var_pos - int(var_data) + 1, var_id] + haplotype[1:],
-                                       left,
-                                       exclude_list)
-                    bases += bases2
-                else:
-                    assert var_type == "insertion"
-        else:
-            bases = [[haplotype[:-1] + [pos], ref_seq[pos]]]
-            prev_id = None
-            if len(haplotype) > 2:
-                prev_id = haplotype[-2]       
-
-            var_i = lower_bound(Var_list, pos)
-            for var_j in range(var_i, len(Var_list)):
-                _, var_id = Var_list[var_j]
-                var_type, var_pos, var_data = Vars[var_id]
-                if var_pos < pos:
-                    continue
-                if var_pos > pos:
-                    break
-                if var_id in exclude_list:
-                    continue
-                if prev_id:
-                    second_ht = prev_id + "-" + var_id
-                    if second_ht not in second_order_haplotypes:
-                        continue
-
-                if var_type == "single":
-                    bases.append([haplotype[:-1] + [var_id, var_pos], var_data])
-                elif var_type == "deletion":
-                    bases2 = nextbases(haplotype[:-1] + [var_id, var_pos + int(var_data) - 1],
-                                       left,
-                                       exclude_list)
-                    bases += bases2
-                else:
-                    assert var_type == "insertion"
-
-        return bases
-
-    def get_haplotype_seq(haplotype):
-        seq = ""
-        pos = int(haplotype[0])
-        for i in range(1, len(haplotype) - 1):
-            var_id = haplotype[i]
-            var_type, var_pos, var_data = Vars[var_id]
-            if pos < var_pos:
-                seq += ref_seq[pos:var_pos]
-            if var_type == "single":
-                seq += var_data
-                pos = var_pos + 1
-            elif var_type == "deletion":
-                pos = var_pos + int(var_data)
-            else:
-                assert var_type == "insertion"
-                seq += var_data
-                pos = var_pos
-            
-        last_pos = int(haplotype[-1]) + 1
-        assert pos <= last_pos
-        if pos < last_pos:
-            seq += ref_seq[pos:last_pos]                
-        return seq
-
-    def get_alternative_recur(var_orig_id,
-                              haplotype,
-                              haplotype_alt,
-                              left = True,
-                              dep = 0):
-        bases1 = nextbases(haplotype,
-                           left)
-        bases2 = nextbases(haplotype_alt,
-                           left,
-                           [var_orig_id]) # exclude
-
-        found = False
-        for base1 in bases1:
-            next_haplotype, bp = base1
-            for base2 in bases2:
-                next_haplotype_alt, bp2 = base2
-                if bp != bp2:
-                    continue
-
-                # Todo: implement a routine to handle haplotypes ending with the same coordinate
-                if left:
-                    left1, left2 = int(next_haplotype[0]), int(next_haplotype_alt[0])
-                    if left1 == left2:
-                        continue
-                else:
-                    right1, right2 = int(next_haplotype[-1]), int(next_haplotype_alt[-1])
-                    if right1 == right2:
-                        continue
-
-                found = True
-                get_alternative_recur(var_orig_id,
-                                      next_haplotype,
-                                      next_haplotype_alt,
-                                      left,
-                                      dep + 1)            
-  
-        if dep > 0:
-            if not found:
-                def to_haplotype_str(haplotype):
-                    if len(haplotype) <= 2:
-                        haplotype = "%d-%d" % (haplotype[0], haplotype[1])
-                    else:
-                        haplotype = "%d-%s-%d" % (haplotype[0], '-'.join(haplotype[1:-1]), haplotype[-1])
-                    return haplotype
-
-                haplotype, haplotype_alt = to_haplotype_str(haplotype), to_haplotype_str(haplotype_alt)
-                haplotype_alts = haplotype_alts_left if left else haplotype_alts_right
-                if haplotype not in haplotype_alts:
-                    haplotype_alts[haplotype] = set()
-                haplotype_alts[haplotype].add(haplotype_alt)
-
-                if haplotype_alt not in haplotype_alts:
-                    haplotype_alts[haplotype_alt] = set()
-                haplotype_alts[haplotype_alt].add(haplotype)
-
-    # Search alternative haplotypes in both left and right directions
-    for var_i in range(len(Var_list)):
-        _, var_id = Var_list[var_i]
-        var_type, var_pos, var_data = Vars[var_id]
-        if var_pos == 0:
-            continue
-        if var_type != "deletion":
-            continue
-        del_len = int(var_data)
-        if var_pos + del_len >= len(ref_seq):
-            continue
-
-        # Left direction
-        get_alternative_recur(var_id,
-                              [var_pos, var_id, var_pos + del_len - 1],
-                              [var_pos + del_len, var_pos + del_len - 1])
-
-        # Right direction    
-        get_alternative_recur(var_id,
-                              [var_pos, var_id, var_pos + del_len - 1],
-                              [var_pos, var_pos - 1],
-                              False)
-
-    # Print alternative haplotypes / Sanity check
-    def print_haplotype_alts(haplotype_alts):
-        for haplotype, haplotype_set in haplotype_alts.items():
-            if verbose: print "\t%s:" % haplotype, haplotype_set
-            haplotype_seq = get_haplotype_seq(haplotype.split('-'))
-            for haplotype_alt in haplotype_set:
-                haplotype_alt_seq = get_haplotype_seq(haplotype_alt.split('-'))
-                assert haplotype_seq == haplotype_alt_seq            
-
-    if verbose: print "number of left haplotypes:", len(haplotype_alts_left)
-    print_haplotype_alts(haplotype_alts_left)
-    if verbose: print "number of right haplotypes:", len(haplotype_alts_right)
-    print_haplotype_alts(haplotype_alts_right)
-
-    return haplotype_alts_left, haplotype_alts_right
-
-
-"""
-Identify ambigious differences that may account for other alleles,
-  given a list of differences (cmp_list) between a read and a potential allele   
-"""
-def identify_ambigious_diffs(ref_seq,
-                             Vars,
-                             Alts_left,
-                             Alts_right,
-                             Alts_left_list,
-                             Alts_right_list,
-                             cmp_list,
-                             verbose,
-                             debug = False):
-    cmp_left, cmp_right = 0, len(cmp_list) - 1
-    left, right = cmp_list[0][1], cmp_list[-1][1] + cmp_list[-1][2] - 1
-    left_alt_set, right_alt_set = set(), set()
-
-    def get_haplotype_and_seq(cmp_list):
-        ht, seq = [], ""
-        for i in range(len(cmp_list)):
-            cmp_i = cmp_list[i]
-            type, pos, length = cmp_i[:3]
-            if len(cmp_i) <= 3:
-                var_id = ""
-            else:
-                var_id = cmp_i[3]
-            if type == "match":
-                seq += ref_seq[pos:pos+length]
-            elif type == "mismatch":
-                seq += ref_seq[pos]
-            elif type == "insertion":
-                None
-                # seq += data
-            else:
-                assert type == "deletion"
-
-            if var_id != "" and var_id != "unknown":
-                ht.append(var_id)
-        return ht, seq
-
-    # Left direction
-    found = False
-    for i in reversed(range(len(cmp_list))):
-        i_found = False
-        cmp_i = cmp_list[i]
-        type, cur_left, length = cmp_i[:3]
-        var_id = cmp_i[3] if type in ["mismatch", "deletion"] else ""
-
-        # DK - debugging purposes
-        if type in ["mismatch", "deletion", "insertion"]:
-            if not var_id.startswith("hv"):
-                continue
-        
-        if type in ["match", "deletion"]:
-            cur_right = cur_left + length - 1
-        else:
-            cur_right = cur_left
-
-        cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[:i+1])
-        if len(cur_ht) == 0:
-            cur_ht_str = str(left)
-        else:
-            cur_ht_str = "%d-%s" % (left, '-'.join(cur_ht))
-        ht_i = lower_bound(Alts_left_list, cur_right + 1)
-        for ht_j in reversed(range(0, min(ht_i + 1, len(Alts_left_list)))):
-            ht_pos, ht = Alts_left_list[ht_j]
-            if ht_pos < cur_left:
-                break            
-            if ht_pos > cur_right:
-                continue
-
-            if len(cur_ht) > 0:
-                if ht.find('-'.join(cur_ht)) == -1:
-                    continue
-
-            ht = ht.split('-')[:-1]
-            if len(cur_ht) + 1 == len(ht):
-                ht_pos = int(ht[0])
-                if left < ht_pos:
-                    continue
-            else:
-                var_id2 = ht[len(ht) - len(cur_ht) - 1]
-                ht_type, ht_pos, ht_data = Vars[var_id2]
-                if ht_type == "deletion":
-                    ht_pos = ht_pos + int(ht_data) - 1
-                if left <= ht_pos:
-                    continue
-
-            i_found = True
-            if debug:
-                print cmp_list[:i+1]
-                print "\t", cur_ht, "vs", Alts_left_list[ht_j]
-
-            _, rep_ht = Alts_left_list[ht_j]
-
-            if debug:
-                print "DK1:", cmp_i, cmp_list
-                print "DK2:", rep_ht, Alts_left[rep_ht]
-                print "DK3:", left, right
-
-            for alt_ht_str in Alts_left[rep_ht]:
-                alt_ht = alt_ht_str.split('-')
-                alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1])
-                assert alt_ht_right <= cur_right
-                seq_pos = cur_right - alt_ht_right
-                cur_pos = alt_ht_right
-                part_alt_ht = []
-                alt_ht = alt_ht[1:-1]
-                for var_id_ in reversed(alt_ht):
-                    var_type_, var_pos_, var_data_ = Vars[var_id_]
-                    if var_type_ == "deletion":
-                        del_len = int(var_data_)
-                        var_pos_ = var_pos_ + del_len - 1
-                    assert var_pos_ <= cur_pos
-                    next_seq_pos = seq_pos + (cur_pos - var_pos_)
-                    if next_seq_pos >= len(cur_seq):
-                        break
-                    if var_type_ == "single":
-                        next_seq_pos += 1
-                        next_cur_pos = var_pos_ - 1
-                    elif var_type_ == "deletion":
-                        next_cur_pos = var_pos_ - del_len
-                    else:
-                        assert var_type_ == "insertion"
-                        assert False
-
-                    part_alt_ht.insert(0, var_id_)
-                    if next_seq_pos >= len(cur_seq):
-                        break
-                    seq_pos, cur_pos = next_seq_pos, next_cur_pos
-
-                if len(part_alt_ht) > 0:
-                    seq_left = len(cur_seq) - seq_pos - 1
-                    part_alt_ht_str = ""
-                    if found:
-                        var_id_list = []
-                        for j in range(i + 1, cmp_left):
-                            cmp_j = cmp_list[j]
-                            if cmp_j[0] in ["mismatch", "deletion", "insertion"]:
-                                var_id_ = cmp_j[3]
-                                if var_id_.startswith("hv"):
-                                    var_id_list.append(var_id_)
-                        if len(var_id_list) > 0:
-                            part_alt_ht_str = '-' + '-'.join(var_id_list)
-                    part_alt_ht_str = ("%d-%s" % (cur_pos - seq_left, '-'.join(part_alt_ht))) + part_alt_ht_str
-                    left_alt_set.add(part_alt_ht_str)
-                        
-                if debug:
-                    print "\t\t", cur_left, alt_ht_str
-
-        if i_found:
-            if not found:
-                cmp_left = i + 1
-                left_alt_set.add(cur_ht_str)
-            found = True
-
-    if not found:
-        left_alt_set.add(str(left))
-
-    # Right direction
-    found = False
-    for i in range(0, len(cmp_list)):
-        i_found = False
-        cmp_i = cmp_list[i]
-        type, cur_left, length = cmp_i[:3]
-        var_id = cmp_i[3] if type in ["mismatch", "deletion"] else ""
-
-        # DK - debugging purpose
-        if type in ["mismatch", "deletion", "insertion"]:
-            if not var_id.startswith("hv"):
-                continue
-
-        if type in ["match", "deletion"]:
-            cur_right = cur_left + length - 1
-        else:
-            cur_right = cur_left
-
-        cur_ht, cur_seq = get_haplotype_and_seq(cmp_list[i:])
-        if len(cur_ht) == 0:
-            cur_ht_str = str(right)
-        else:
-            cur_ht_str = "%s-%d" % ('-'.join(cur_ht), right)
-
-        ht_i = lower_bound(Alts_right_list, cur_left)
-        for ht_j in range(ht_i, len(Alts_right_list)):
-            ht_pos, ht = Alts_right_list[ht_j]
-            if ht_pos > cur_right:
-                break
-            if ht_pos < cur_left:
-                continue
-
-            if len(cur_ht) > 0:
-                if ht.find('-'.join(cur_ht)) == -1:
-                    continue
-
-            ht = ht.split('-')[1:]
-            if len(cur_ht) + 1 == len(ht):
-                ht_pos = int(ht[-1])
-                if right > ht_pos:
-                    continue
-            else:
-                var_id2 = ht[len(cur_ht)]
-                var_type, ht_pos, _ = Vars[var_id2]
-                if right >= ht_pos:
-                    continue
-
-            i_found = True
-            _, rep_ht = Alts_right_list[ht_j]
-
-            if debug:
-                print "DK1:", cmp_i, cmp_list
-                print "DK2:", rep_ht, Alts_right[rep_ht]
-                print "DK3:", left, right, ht_pos
-
-            for alt_ht_str in Alts_right[rep_ht]:
-                alt_ht = alt_ht_str.split('-')
-                alt_ht_left, alt_ht_right = int(alt_ht[0]), int(alt_ht[-1])
-                assert cur_left <= alt_ht_left
-                seq_pos = alt_ht_left - cur_left
-                cur_pos = alt_ht_left
-                part_alt_ht = []
-                alt_ht = alt_ht[1:-1]
-                for var_id_ in alt_ht:
-                    var_type_, var_pos_, var_data_ = Vars[var_id_]
-                    assert var_pos_ >= cur_pos
-                    next_seq_pos = seq_pos + (var_pos_ - cur_pos)
-                    if next_seq_pos >= len(cur_seq):
-                        break
-                    
-                    if var_type_ == "single":
-                        next_seq_pos += 1
-                        next_cur_pos = var_pos_ + 1
-                    elif var_type_ == "deletion":
-                        next_cur_pos = var_pos_ + int(var_data_)
-                    else:
-                        assert var_type_ == "insertion"
-                        assert False
-
-                    part_alt_ht.append(var_id_)
-                    if next_seq_pos >= len(cur_seq):
-                        break
-                    seq_pos, cur_pos = next_seq_pos, next_cur_pos
-
-                if len(part_alt_ht) > 0:
-                    seq_left = len(cur_seq) - seq_pos - 1
-                    assert seq_left >= 0
-                    part_alt_ht_str = ""
-                    if found:
-                        var_id_list = []
-                        for j in range(cmp_right + 1, i):
-                            cmp_j = cmp_list[j]
-                            if cmp_j[0] in ["mismatch", "deletion", "insertion"]:
-                                var_id_ = cmp_j[3]
-                                if var_id_.startswith("hv"):
-                                    var_id_list.append(var_id_)
-                        if len(var_id_list) > 0:
-                            part_alt_ht_str = '-'.join(var_id_list) + '-'
-                    part_alt_ht_str += ("%s-%d" % ('-'.join(part_alt_ht), cur_pos + seq_left))
-                    right_alt_set.add(part_alt_ht_str)
-                        
-        if i_found:            
-            if not found:
-                cmp_right = i - 1
-                right_alt_set.add(cur_ht_str)
-            found = True
-
-    if not found:
-        right_alt_set.add(str(right))
-
-    if cmp_right < cmp_left:
-        cmp_left = 0
-        left_alt_set = set([str(left)])
-
-    # Sanity check
-    ht_set_ = set()
-    for ht in left_alt_set:
-        ht = '-'.join(ht.split('-')[1:])
-        if ht == "":
-            continue
-        if ht in ht_set_:
-            print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_
-
-            # DK - debugging purposes
-            print "DK: cmp_list_range: [%d, %d]" % (cmp_left, cmp_right)
-            print "DK: cmp_list:", cmp_list
-            print "DK: left_alt_set:", left_alt_set, "right_alt_set:", right_alt_set
-            
-            assert False
-        ht_set_.add(ht)
-    for ht in right_alt_set:
-        ht = '-'.join(ht.split('-')[:-1])
-        if ht == "":
-            continue
-        if ht in ht_set_:
-            print >> sys.stderr, "Error) %s should not be in" % ht, ht_set_
-            assert False
-        ht_set_.add(ht)
-
-    if debug:
-        print "cmp_list_range: [%d, %d]" % (cmp_left, cmp_right)
-        print "left  alt set:", left_alt_set
-        print "right alt set:", right_alt_set
-    
-    return cmp_left, cmp_right, list(left_alt_set), list(right_alt_set)
-
diff --git a/hisatgenotype_scripts/compare_HLA.py b/hisatgenotype_scripts/compare_HLA.py
deleted file mode 100755
index d32b593c..00000000
--- a/hisatgenotype_scripts/compare_HLA.py
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os
-from argparse import ArgumentParser, FileType
-use_message = '''
-'''
-
-def compare(hisatgenotype_fname,
-            utsw_fname):
-    hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]
-    for level in [1,2]:
-        print >> sys.stderr, "Level: %d" % level
-        def read_hla_types(fname):
-            hla, hla_orig = {}, {}
-            for line in open(fname):
-                line = line.strip()
-                fields = line.split('\t')
-                if len(fields) == 2:
-                    sample, allele = fields
-                    abundance, vars_covered = 0.0, ""
-                elif len(fields) == 3:
-                    sample, allele, abundance = fields
-                    vars_covered = ""
-                else:
-                    assert len(fields) == 4
-                    sample, allele, abundance, vars_covered = fields
-                # sample = sample.split('_')[0]
-                abundance = float(abundance)
-                if sample not in hla:
-                    hla[sample] = {}
-                    hla_orig[sample] = {}
-                gene, allele = allele.split('*')
-                if gene not in hla[sample]:
-                    hla[sample][gene] = []
-                    hla_orig[sample][gene] = []
-                hla_orig[sample][gene].append([allele, abundance])
-
-                if level == 1:
-                    allele = allele.split(':')[0]
-                else:
-                    assert level == 2
-                    allele = ':'.join(allele.split(':')[:2])
-
-                found = False
-                for i in range(len(hla[sample][gene])):
-                    cmp_allele, cmp_abundance = hla[sample][gene][i]
-                    if level == 1 or allele.find(':') == -1:
-                        one = two = allele
-                        cmp_one = cmp_two = cmp_allele
-                    else:
-                        one, two = allele.split(':')
-                        cmp_one, cmp_two = cmp_allele.split(':')
-                    if one == cmp_one and two == cmp_two:
-                        found = True
-                        hla[sample][gene][i][1] = cmp_abundance + abundance
-                        break
-
-                if not found:
-                    hla[sample][gene].append([allele, abundance])
-
-            for sample_hla in hla.values():
-                for gene, allele_list in sample_hla.items():
-                    sample_hla[gene] = sorted(allele_list, key=lambda a: a[1], reverse=True)
-                
-            return hla, hla_orig
-                    
-        hla1, hla1_orig = read_hla_types(hisatgenotype_fname)
-        hla2, hla2_orig = read_hla_types(utsw_fname)
-
-        for gene in hla_list:
-            count, count_10 = [0, 0, 0], [0, 0, 0]
-            print >> sys.stderr, "\t%s" % gene
-            for sample in hla2.keys():
-                if sample not in hla1:
-                    continue
-                hla1_sample = hla1[sample]
-                hla2_sample = hla2[sample]
-                if gene not in hla1_sample or gene not in hla2_sample:
-                    continue
-                hla1_gene = hla1_sample[gene]
-                hla2_gene = hla2_sample[gene]
-                num_match, num_match_10 = 0, 0
-                for hla2_allele, _ in hla2_gene:
-                    hla2_allele = hla2_allele.split(':')
-                    for allele_idx in range(len(hla1_gene)):
-                        hla1_allele = hla1_gene[allele_idx][0]
-                        hla1_allele = hla1_allele.split(':')
-                        equal = True
-                        for i in range(min(len(hla1_allele), len(hla2_allele), level)):
-                            hla1_num = hla1_allele[i]
-                            hla2_num = hla2_allele[i]
-                            if hla1_num != hla2_num:
-                                equal = False
-                                break
-                            
-                        if equal:
-                            if allele_idx < 2:
-                                num_match += 1
-                                if len(hla2_gene) == 1:
-                                    num_match += 1
-                            num_match_10 += 1
-                            if len(hla2_gene) == 1:
-                                num_match_10 += 1
-                            break
-
-                # DK - for debugging purposes
-                # """
-                # if gene in ["A", "B", "C", "DQA1", "DQB1", "DRB1"] and num_match < 2:
-                if level == 3 and gene in ["B"] and num_match < 2:
-                    print sample
-                    print "\t", hla1_gene, "orig:", hla1_orig[sample][gene]
-                    print "\t", hla2_gene, "orig:", hla2_orig[sample][gene]
-                    # sys.exit(1)
-                # """
-
-                # DK - debugging purposes
-                if num_match >= len(count) or num_match_10 >= len(count_10):
-                    print sample, num_match, num_match_10
-
-                assert num_match < len(count) and num_match_10 < len(count_10)
-                count[num_match] += 1
-                count_10[num_match_10] += 1
-
-            if sum(count) <= 0:
-                continue
-
-            print >> sys.stderr, "\t\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0)
-            print >> sys.stderr, "\t\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0)
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser(
-        description='Compare HISAT-genotype and Utsw HLA typing results')
-    parser.add_argument('hisatgenotype_fname',
-                        nargs='?',
-                        type=str,
-                        help='hisatgenotype file name (e.g. cp_hla.txt)')
-    parser.add_argument('utsw_fname',
-                        nargs='?',
-                        type=str,
-                        help='utsw file name (e.g. utsw_caapa_hla.txt)')
-
-    args = parser.parse_args()
-
-    compare(args.hisatgenotype_fname,
-            args.utsw_fname)
-
diff --git a/hisatgenotype_scripts/compare_HLA_Omixon.py b/hisatgenotype_scripts/compare_HLA_Omixon.py
deleted file mode 100755
index ad79c19e..00000000
--- a/hisatgenotype_scripts/compare_HLA_Omixon.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python
-
-import sys, os
-from argparse import ArgumentParser, FileType
-use_message = '''
-'''
-
-def compare(hisatgenotype_fname, omixon_fname):
-    hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]
-    
-    # Read HISAT-genotype predicted HLA alleles for the CAAPA genomes
-    hisat_hla = {}
-    for line in open(hisatgenotype_fname):
-        line = line.strip()
-        fields = line.split('\t')
-        if len(fields) == 2:
-            sample, allele = fields
-            abundance, vars_covered = 0.0, ""
-        elif len(fields) == 3:
-            sample, allele, abundance = fields
-            vars_covered = ""
-        else:
-            assert len(fields) == 4
-            sample, allele, abundance, vars_covered = fields
-        abundance = float(abundance)
-        if sample not in hisat_hla:
-            hisat_hla[sample] = {}
-        gene, allele = allele.split('*')
-        if gene not in hisat_hla[sample]:
-            hisat_hla[sample][gene] = []
-        hisat_hla[sample][gene].append([allele, abundance])
-
-    # Read Omixon predicted HLA alleles for the CAAPA genomes
-    omixon_hla = {}
-    for line in open(omixon_fname):
-        line = line.strip()
-        sample, allele1, allele2 = line.split('\t')
-        gene1, allele1 = allele1.split('*')
-        gene2, allele2 = allele2.split('*')
-        
-        assert gene1 == gene2
-        if sample not in omixon_hla:
-            omixon_hla[sample] = {}
-        if gene1 not in omixon_hla[sample]:
-            omixon_hla[sample][gene1] = []
-
-        if len(omixon_hla[sample][gene1]) >= 2:
-            continue
-            
-        omixon_hla[sample][gene1].append(allele1)
-        omixon_hla[sample][gene1].append(allele2)
-
-    for gene in hla_list:
-        count, count_10 = [0, 0, 0], [0, 0, 0]
-        print >> sys.stderr, gene
-        for sample in omixon_hla.keys():
-            if sample not in hisat_hla:
-                continue
-            hisat_sample = hisat_hla[sample]
-            omixon_sample = omixon_hla[sample]
-            if gene not in omixon_sample or gene not in hisat_sample:
-                continue
-            hisat_gene = hisat_sample[gene]
-            omixon_gene = omixon_sample[gene]
-            num_match, num_match_10 = 0, 0
-            for omixon_allele in omixon_gene:
-                omixon_allele = omixon_allele.split(':')
-                for hisat_allele_idx in range(len(hisat_gene)):
-                    hisat_allele = hisat_gene[hisat_allele_idx]
-                    hisat_allele = hisat_allele[0].split(':')
-                    equal = True
-                    for i in range(min(len(omixon_allele), len(hisat_allele), 2)):
-                        omixon_num = omixon_allele[i]
-                        hisat_num = hisat_allele[i]
-                        """
-                        if not omixon_num[-1].isdigit():
-                            omixon_num = omixon_num[:-1]
-                        if not hisat_num[-1].isdigit():
-                            hisat_num = hisat_num[:-1]
-                        if int(hisat_num) != int(omixon_num):
-                            equal = False
-                            break
-                        """
-                        if hisat_num != omixon_num:
-                            equal = False
-                            break
-                    if equal:
-                        if hisat_allele_idx < 2:
-                            num_match += 1
-                        num_match_10 += 1
-                        break
-                    
-            # DK - for debugging purposes
-            """
-            if gene in ["A", "B", "C", "DQA1", "DQB1", "DRB1"] and num_match < 2:
-                print sample
-                print "\t", omixon_gene
-                print "\t", hisat_gene
-                # sys.exit(1)
-            """
-                
-            assert num_match < len(count)
-            count[num_match] += 1
-            count_10[num_match_10] += 1
-
-        if sum(count) <= 0:
-            continue
-        
-        print >> sys.stderr, "\tTop two\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count[0], count[1], count[2], (count[1] + count[2] * 2) / float(sum(count) * 2) * 100.0)
-        print >> sys.stderr, "\tTop ten\t0: %d, 1: %d, 2: %d (%.2f%%)" % (count_10[0], count_10[1], count_10[2], (count_10[1] + count_10[2] * 2) / float(sum(count_10) * 2) * 100.0)
-        
-
-if __name__ == "__main__":
-    parser = ArgumentParser(
-        description='Compare HISAT-genotype and Omixon HLA typing results')
-    parser.add_argument('hisatgenotype_fname',
-                        nargs='?',
-                        type=str,
-                        help='hisatgenotype file name (e.g. cp_hla.txt)')
-    parser.add_argument('omixon_fname',
-                        nargs='?',
-                        type=str,
-                        help='omixon file name (e.g. omixon_caapa_hla.txt)')
-
-    args = parser.parse_args()
-
-    compare(args.hisatgenotype_fname,
-            args.omixon_fname)
-
diff --git a/hisatgenotype_scripts/extract_Omixon_HLA.py b/hisatgenotype_scripts/extract_Omixon_HLA.py
deleted file mode 100755
index 23aaa045..00000000
--- a/hisatgenotype_scripts/extract_Omixon_HLA.py
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2016, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, glob
-
-if __name__ == '__main__':
-    hla_list = ["A", "B", "C", "DQA1", "DQB1", "DRB1"]
-    gen_alleles = {}
-    for hla in hla_list:
-        for line in open("IMGTHLA/fasta/%s_gen.fasta" % hla):
-            if line.startswith(">"):
-                allele = line.split()[1]
-                gene = allele.split('*')[0]
-                if gene not in gen_alleles:
-                    gen_alleles[gene] = set()
-                gen_alleles[gene].add(allele)
-                
-    nuc_alleles = {}
-    for hla in hla_list:
-        for line in open("IMGTHLA/fasta/%s_nuc.fasta" % hla):
-            if line.startswith(">"):
-                allele = line.split()[1]
-                gene = allele.split('*')[0]
-                if gene not in nuc_alleles:
-                    nuc_alleles[gene] = set()
-                nuc_alleles[gene].add(allele)
-
-    print >> sys.stderr, "IMGTHLA"
-    for gene, alleles in nuc_alleles.items():
-        print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles))
-
-    # Read HLA alleles from Omixon data
-    omixon_alleles = {}
-    omixon_fnames = glob.glob("HLAresults/*.gz")
-    for fname in omixon_fnames:
-        genome = fname.split("/")[1].split("_HLA")[0]
-        view_cmd = ["gzip", "-cd", fname]
-        proc = subprocess.Popen(view_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w'))
-        allele_count = {}
-        prev_allele1, prev_allele2 = "", ""
-        for line in proc.stdout:
-            if not line.startswith("HLA"):
-                continue
-
-            fields = line.strip().split()
-            if len(fields) > 6:
-                allele1, allele2 = fields[0][4:-1], fields[6][4:-1]
-            else:
-                allele1 = allele2 = fields[0][4:-1]
-
-            gene = allele1.split("*")[0]
-            if gene not in hla_list:
-                continue
-            if gene not in omixon_alleles:
-                omixon_alleles[gene] = set()
-            if gene not in allele_count:
-                allele_count[gene] = 0
-            if allele_count[gene] >= 10:
-                continue
-
-            if allele2 == "":
-                allele2 = prev_allele2
-            assert allele1 != "" and allele2 != ""
-
-            def update_allele(allele):
-                if allele == "DRB1*08:01:03":
-                    allele = "DRB1*08:01:01"
-                elif allele == "DRB1*11:11:02":
-                    allele = "DRB1*11:11:01"
-                return allele
-
-            allele1, allele2 = update_allele(allele1), update_allele(allele2)
-            
-            allele_count[gene] += 1
-            omixon_alleles[gene].add(allele1)
-            omixon_alleles[gene].add(allele2)
-            prev_allele1, prev_allele2 = allele1, allele2
-
-            print "%s\t%s\t%s" % (genome, allele1, allele2)
-
-    print >> sys.stderr, "Omixon"
-    for gene, alleles in omixon_alleles.items():
-        print >> sys.stderr, "\t%s: %d alleles" % (gene, len(alleles))
-        for allele in alleles:
-            if allele in nuc_alleles[gene]:
-                continue
-            found = False
-            for allele_cmp in nuc_alleles[gene]:
-                if allele_cmp.find(allele) != -1:
-                    found = True
-                    break                    
-
-            if not found:
-                print >> sys.stderr, "\t\t%s is missing" % allele
-
-            
diff --git a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py b/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py
deleted file mode 100755
index 34cd4ecf..00000000
--- a/hisatgenotype_scripts/hisatgenotype_HLA_genotyping_PGs.py
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2015, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re
-import inspect
-import random
-from argparse import ArgumentParser, FileType
-
-# Gold Standard (experimentally verified, a lot of literature, ...)
-gold_allele_info = {
-    "NA12877" : {"A" : ["03:01", "02:01"], "B" : ["15:01", "44:02"], "C" : ["05:01", "03:04"], "DQA1" : ["03:03", "03:01"], "DQB1" : ["03:02", "03:01"], "DRB1" : ["04:01", "04:01"]},
-    "NA12878" : {"A" : ["01:01", "11:01"], "B" : ["08:01", "56:01"], "C" : ["01:02", "07:01"], "DQA1" : ["05:01", "01:01"], "DQB1" : ["02:01", "05:01"], "DRB1" : ["03:01", "01:01"]},
-    "NA12879" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]},
-    "NA12880" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]},
-    "NA12881" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["05:01", "01:02"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["04:01", "01:01"]},
-    "NA12882" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["04:01", "01:01"]},
-    "NA12883" : {"A" : ["03:01", "11:01"], "B" : ["44:02", "56:01"], "C" : ["01:02", "05:01"], "DQA1" : ["03:03", "01:01"], "DQB1" : ["03:01", "05:01"], "DRB1" : ["01:01", "04:01"]},
-    "NA12884" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:01"]},
-    "NA12885" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["05:01", "07:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["03:01", "02:01"], "DRB1" : ["03:01", "04:01"]},
-    "NA12886" : {"A" : ["03:01", "01:01"], "B" : ["44:02", "08:01"], "C" : ["07:01", "05:01"], "DQA1" : ["03:03", "05:01"], "DQB1" : ["02:01", "03:01"], "DRB1" : ["03:01", "04:01"]},
-    "NA12887" : {"A" : ["02:01", "01:01"], "B" : ["15:01", "08:01"], "C" : ["03:04", "07:01"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]},
-    "NA12888" : {"A" : ["01:01", "02:01"], "B" : ["08:01", "15:01"], "C" : ["07:01", "03:04"], "DQA1" : ["03:01", "05:01"], "DQB1" : ["03:02", "02:01"], "DRB1" : ["03:01", "04:01"]},
-    "NA12889" : {"A" : ["03:01", "03:01"], "B" : ["07:02", "44:02"], "C" : ["05:01", "07:02"], "DQA1" : ["03:03", "01:02"], "DQB1" : ["03:01", "06:02"], "DRB1" : ["15:01", "04:01"]},
-    "NA12890" : {"A" : ["03:01", "02:01"], "B" : ["44:03", "15:01"], "C" : ["16:01", "03:04"], "DQA1" : ["03:01", "02:01"], "DQB1" : ["03:02", "02:02"], "DRB1" : ["04:03", "07:01"]},
-    "NA12891" : {"A" : ["24:02", "01:01"], "B" : ["08:01", "07:02"], "C" : ["07:02", "07:01"], "DQA1" : ["05:01", "01:02"], "DQB1" : ["06:02", "02:01"], "DRB1" : ["03:01", "15:01"]},
-    "NA12892" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "04:01"], "DQA1" : ["01:01", "01:01"], "DQB1" : ["05:01", "05:01"], "DRB1" : ["01:01", "01:01"]},
-    "NA12893" : {"A" : ["02:01", "11:01"], "B" : ["15:01", "56:01"], "C" : ["01:02", "03:04"], "DQA1" : ["03:01", "01:01"], "DQB1" : ["03:02", "05:01"], "DRB1" : ["01:01", "04:01"]}
-    }
-
-# CEPH pedigree (17 family members)
-pedigree = {
-    "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]},
-    "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]},
-    "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]},
-
-    "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]},
-    "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]},
-    "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]},
-
-    "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    }
-
-
-"""
-"""
-def test_HLA_genotyping(reference_type,
-                        hla_list,
-                        aligners,
-                        query_genomes,
-                        exclude_allele_list,
-                        num_mismatch,
-                        verbose):
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(test_HLA_genotyping))
-    ex_path = os.path.dirname(curr_script)
-
-    if not os.path.exists("illumina/HLA"):
-        print >> sys.stderr, "Error: illumina/HLA data is needed (please send an email to infphilo@gmail.com for getting the data)"
-        sys.exit(1)
-
-    num_test, num_success = 0, 0
-    for genome in sorted(gold_allele_info.keys()):
-        if not genome in query_genomes:
-            continue
-        genes = gold_allele_info[genome]
-        read_fname_1, read_fname_2 = "illumina/HLA/%s.fished_1.fq" % genome, "illumina/HLA/%s.fished_2.fq" % genome
-        if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2):
-            continue
-        print >> sys.stderr, genome        
-        cmd_aligners = ['.'.join(aligners[i]) for i in range(len(aligners))]
-        test_hla_script = os.path.join(ex_path, "hisat2_test_HLA_genotyping.py")
-        for gene in sorted(genes.keys()):
-            if not gene in hla_list:
-                continue
-            alleles = genes[gene]
-            print >> sys.stderr, "\t%s - %s" % (gene, ' / '.join(alleles))            
-            test_hla_cmd = [test_hla_script,
-                            "--reference-type", reference_type,
-                            "--hla-list", gene,
-                            "--aligner-list", ','.join(cmd_aligners),
-                            "--reads", "%s,%s" % (read_fname_1, read_fname_2),
-                            "--best-alleles",
-                            "--exclude-allele-list", ','.join(exclude_allele_list),
-                            "--num-mismatch", str(num_mismatch)]
-
-            if verbose:
-                print >> sys.stderr, ' '.join(test_hla_cmd)
-            
-            proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=open("/dev/null", 'w'))
-            num_test += 2
-            test_alleles = set()
-            for line in proc.stdout:
-                print "\t\t", line,
-                model, allele = line.split()[:2]
-                if model != "SingleModel":
-                    continue
-                allele = allele.split('*')[1]
-                allele = ':'.join(allele.split(':')[:2])
-                test_alleles.add(allele)
-            proc.communicate()
-            for allele in alleles:
-                if allele in test_alleles:
-                    num_success += 1
-
-    print >> sys.stderr, "%d/%d (%.2f%%)" % (num_success, num_test, num_success * 100.0 / num_test)
-
-
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='test HLA genotyping for Platinum Genomes')
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--hla-list",
-                        dest="hla_list",
-                        type=str,
-                        default="A,B,C,DQA1,DQB1,DRB1",
-                        help="A comma-separated list of HLA genes (default: A,B,C,DQA1,DQB1,DRB1)")
-    parser.add_argument("--aligner-list",
-                        dest="aligners",
-                        type=str,
-                        default="hisat2.graph",
-                        help="A comma-separated list of aligners (default: hisat2.graph)")
-    genomes_default = ','.join(gold_allele_info.keys())
-    parser.add_argument("--genome-list",
-                        dest="genome_list",
-                        type=str,
-                        default=genomes_default,
-                        help="A comma-separated list of genomes (default: %s)" % genomes_default)
-    parser.add_argument("--exclude-allele-list",
-                        dest="exclude_allele_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of allleles to be excluded")
-    parser.add_argument("--num-mismatch",
-                        dest="num_mismatch",
-                        type=int,
-                        default=0,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 0)")
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-
-    args = parser.parse_args()
-
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-    args.hla_list = args.hla_list.split(',')
-    if args.aligners == "":
-        print >> sys.stderr, "Error: --aligners must be non-empty."
-        sys.exit(1)    
-    args.aligners = args.aligners.split(',')
-    for i in range(len(args.aligners)):
-        args.aligners[i] = args.aligners[i].split('.')
-    args.genome_list = args.genome_list.split(',')
-    args.exclude_allele_list = args.exclude_allele_list.split(',')
-
-    test_HLA_genotyping(args.reference_type,
-                        args.hla_list,
-                        args.aligners,
-                        args.genome_list,
-                        args.exclude_allele_list,
-                        args.num_mismatch,
-                        args.verbose)
diff --git a/hisatgenotype_scripts/hisatgenotype_convert_codis.py b/hisatgenotype_scripts/hisatgenotype_convert_codis.py
deleted file mode 100755
index 415a42c8..00000000
--- a/hisatgenotype_scripts/hisatgenotype_convert_codis.py
+++ /dev/null
@@ -1,654 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT-genotype.
-#
-# HISAT-genotype is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT-genotype is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import os, sys, subprocess, re
-import inspect, operator
-from copy import deepcopy
-from argparse import ArgumentParser, FileType
-import hisatgenotype_typing_common as typing_common
-try:
-    import openpyxl
-except ImportError:
-    print >> sys.stderr, "Error: please install openpyxl by running 'pip install openpyxl'."
-    sys.exit(1)
-
-
-# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm
-
-orig_CODIS_seq = {
-    "CSF1PO" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_CSF1PO.htm
-    # allele 13: 5:150076172-150076490 - (samtools faidx genome.fa - GRCh38)
-    ["[AGAT]13",
-     "AACCTGAGTCTGCCAAGGACTAGCAGGTTGCTAACCACCCTGTGTCTCAGTTTTCCTACCTGTAAAATGAAGATATTAACAGTAACTGCCTTCATAGATAGAAGATAGATAGATT", # left flanking sequence
-     "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT", # STR
-     "AGGAAGTACTTAGAACAGGGTCTGACACAGGAAATGCTGTCCAAGTGTGCACCAGGAGATAGTATCTGAGAAGGCTCAGTCTGGCACCATGTGGGTTGGGTGGGAACCTGGAGGCTGGAGAATGGGCTGAAGATGGCCAGTGGTGTGTGGAA"], # right flanking sequence
-             
-    "FGA" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_FGA.htm
-    # allele 22: 4:154587696-154587891 -
-    ["[TTTC]3TTTTTTCT[CTTT]14CTCC[TTCC]2",
-     "GCCCCATAGGTTTTGAACTCACAGATTAAACTGTAACCAAAATAAAATTAGGCATATTTACAAGCTAG",
-     "TTTCTTTCTTTCTTTTTTCTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTTTCTCCTTCCTTCC",
-     "TTTCTTCCTTTCTTTTTTGCTGGCAATTACAGACAAATCA"],
-
-    "TH01" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_TH01.htm
-    # allele 7: 11:2170990-2171176 +
-    ["[AATG]7",
-     "GTGGGCTGAAAAGCTCCCGATTATCCAGCCTGGCCCACACAGTCCCCTGTACACAGGGCTTCCGAGTGCAGGTCACAGGGAACACAGACTCCATGGTG",
-     "AATGAATGAATGAATGAATGAATGAATG",
-     "AGGGAAATAAGGGAGGAACAGGCCAATGGGAATCACCCCAGAGCCCAGATACCCTTTGAAT"],
-             
-    "TPOX" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_TPOX.htm
-    # allele 8: 2:1489617-1489848
-    ["[AATG]8",
-     "ACTGGCACAGAACAGGCACTTAGGGAACCCTCACTG",
-     "AATGAATGAATGAATGAATGAATGAATGAATG",
-     "TTTGGGCAAATAAACGCTGACAAGGACAGAAGGGCCTAGCGGGAAGGGAACAGGAGTAAGACCAGCGCACAGCCCGACTTGTGTTCAGAAGACCTGGGATTGGACCTGAGGAGTTCAATTTTGGATGAATCTCTTAATTAACCTGTGGGGTTCCCAGTTCCTCC"],
-             
-    "VWA" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_VWA.htm
-    # allele unknown: 12:5983938-5984087 -
-    ["TCTA[TCTG]5[TCTA]11TCCA TCTA",
-     "CCCTAGTGGATGATAAGAATAATCAGTATGTGACTTGGATTGA",
-     "TCTATCTGTCTGTCTGTCTGTCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCCATCTA",
-     "TCCATCCATCCTATGTATTTATCATCTGTCC"],
-             
-    "D3S1358" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D3S1358.htm
-    # allele unknown: 3:45540713-45540843 +
-    ["TCTATCTG[TCTA]14",
-     "ATGAAATCAACAGAGGCTTGCATGTA",
-     "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA",
-     "TGAGACAGGGTCTTGCTCTGTCACCCAGATTGGACTGCAGT"],
-             
-    "D5S818" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D5S818.htm
-    # allele 11: 5:123775504-123775638 -
-    ["[AGAT]11",
-     "GGTGATTTTCCTCTTTGGTATCCTTATGTAATATTTTGA",
-     "AGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGAT",
-     "AGAGGTATAAATAAGGATACAGATAAAGATACAAATGTTGTAAACTGTGGCT"],
-             
-    "D7S820" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D7S820.htm
-    # allele 13: 7:84160125-84160367 -
-    ["[GATA]13",
-     "ATGTTGGTCAGGCTGACTATGGAGTTATTTTAAGGTTAATATATATAAAGGGTATGATAGAACACTTGTCATAGTTTAGAACGAACTAAC",
-     "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA",
-     "GACAGATTGATAGTTTTTTTTAATCTCACTAAATAGTCTATAGTAAACATTTAATTACCAATATTTGGTGCAATTCTGTCAATGAGGATAAATGTGGAATC"],
-             
-    "D8S1179" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D8S1179.htm
-    # allele 13: 8:124894838-124895018 +
-    ["[TCTA]1[TCTG]1[TCTA]11",
-     "TTTTTGTATTTCATGTGTACATTCGTA",
-     "TCTATCTGTCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA",
-     "TTCCCCACAGTGAAAATAATCTACAGGATAGGTAAATAAATTAAGGCATATTCACGCAATGGGATACGATACAGTGATGAAAATGAACTAATTATAGCTACG"],
-             
-    "D13S317" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D13S317.htm
-    # Perhaps, allele 11: 13:82147921-82148112 +
-    ["[TATC]11A",
-     "ATCACAGAAGTCTGGGATGTGGAGGAGAGTTCATTTCTTTAGTGGGCATCCGTGACTCTCTGGACTCTGACCCATCTAACGCCTATCTGTATTTACAAATACAT",
-     "TATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCA",
-     "ATCAATCATCTATCTATCTTTCTGTCTGTCTTTTTGGGCTGCC"],
-             
-    "D16S539" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D16S539.htm
-    # allele 11: 16:86352518-86352805 +
-    ["[GATA]11",
-     "GGGGGTCTAAGAGCTTGTAAAAAGTGTACAAGTGCCAGATGCTCGTTGTGCACAAATCTAAATGCAGAAAAGCACTGAAAGAAGAATCCAGAAAACCACAGTTCCCATTTTTATATGGGAGCAAACAAAGGCAGATCCCAAGCTCTTCCTCTTCCCTAGATCAATACAGACAGACAGACAGGTG",
-     "GATAGATAGATAGATAGATAGATAGATAGATAGATAGATAGATA",
-     "TCATTGAAAGACAAAACAGAGATGGATGATAGATACATGCTTACAGATGCACACACAAAC"],
-             
-    "D18S51" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D18S51.htm
-    # allele 18: 18:63281611-63281916 +
-    ["[AGAA]18",
-     "GAGCCATGTTCATGCCACTGCACTTCACTCTGAGTGACAAATTGAGACCTTGTCTC",
-     "AGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAAAGAA",
-     "AAAGAGAGAGGAAAGAAAGAGAAAAAGAAAAGAAATAGTAGCAACTGTTATTGTAAGACATCTCCACACACCAGAGAAGTTAATTTTAATTTTAACATGTTAAGAACAGAGAGAAGCCAACATGTCCACCTTAGGCTGACGGTTTGTTTATTTGTGTTGTTGCTGGTAGTCGGGTTTG"],
-             
-    "D21S11" :
-    # http://www.cstl.nist.gov/biotech/strbase/str_D21S11.htm
-    # Perhaps, allele 29: 21:19181945-19182165 +
-    ["[TCTA]4[TCTG]6[TCTA]3TA[TCTA]3TCA[TCTA]2TCCATA[TCTA]11",
-     "GTGAGTCAATTCCCCAAGTGAATTGCCT",
-     "TCTATCTATCTATCTATCTGTCTGTCTGTCTGTCTGTCTGTCTATCTATCTATATCTATCTATCTATCATCTATCTATCCATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTA",
-     "TCGTCTATCTATCCAGTCTATCTACCTCCTATTAGTCTGTCTCTGGAGAACATTGACTAATACAAC"],
-
-    # "AMEL" - http://www.cstl.nist.gov/biotech/strbase/jpg_amel.htm
-    #          X chromosome has 6 bp deletion and Y chromosome doesn't
-    "AMELX" :
-    ["",
-     "TGTTGATTCTTTATCCCAGATGTTTCTCAAGTGG", # chromosome X at 11296898
-     "",
-     ""],
-
-    "AMELY" :
-    ["",
-     "AGAAACCACTTTATTTGGGATGAAGAATCCACC", # chromosome Y at 6869902
-     "",
-     ""]
-}
-
-CODIS_ref_name = {}
-
-
-"""
-"""
-def get_flanking_seqs(seq,
-                      flank_len = 500):
-    def align_seq(seq):
-        aligner_cmd = ["hisat2",
-                       "--score-min", "C,0",
-                       "--no-unal",
-                        "-x", "grch38/genome",
-                        "-c", seq]
-        align_proc = subprocess.Popen(aligner_cmd,
-                                      stdout=subprocess.PIPE,
-                                      stderr=open("/dev/null", 'w'))
-        chr, left, right, strand = "", -1, -1, '+'
-        for line in align_proc.stdout:
-            if line.startswith('@'):
-                continue
-            line = line.strip()
-            cols = line.split()
-            allele_id, flag, chr, left, _, cigar_str = cols[:6]
-            assert cigar_str[-1] == 'M'
-            left = int(left)
-            flag = int(flag)
-            strand = '-' if flag & 0x10 else '+'
-            assert cigar_str == ("%dM" % len(seq))
-            right = left + len(seq)
-            break
-        
-        assert chr != "" and left >= 0 and right > left
-        return chr, left, right, strand
-    
-    chr, left, right, strand = align_seq(seq)    
-    left_flank_seq, right_flank_seq = "", ""
-    if left > 1:
-        extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, max(1, left - flank_len), left - 1)]
-        extract_seq_proc = subprocess.Popen(extract_seq_cmd,
-                                            stdout=subprocess.PIPE,
-                                            stderr=open("/dev/null", 'w'))
-        for line in extract_seq_proc.stdout:
-            if line.startswith('>'):
-                continue
-            line = line.strip()
-            left_flank_seq += line
-    extract_seq_cmd = ["samtools", "faidx", "genome.fa", "%s:%d-%d" % (chr, right, right + flank_len - 1)]
-    extract_seq_proc = subprocess.Popen(extract_seq_cmd,
-                                        stdout=subprocess.PIPE,
-                                        stderr=open("/dev/null", 'w'))
-    for line in extract_seq_proc.stdout:
-        if line.startswith('>'):
-            continue
-        line = line.strip()
-        right_flank_seq += line
-
-    if strand == '-':
-        left_flank_seq, right_flank_seq = typing_common.reverse_complement(right_flank_seq), typing_common.reverse_complement(left_flank_seq)
-
-    chr, _, _, _ = align_seq(left_flank_seq + seq + right_flank_seq)
-    assert chr != ""
-    
-    return left_flank_seq, right_flank_seq
-
-
-
-"""
-"""
-def get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j):
-    if repeat_i == repeat_j:
-        # DK - experimental SW alignment
-        min_diff = sys.maxint
-        for repeat_num_i in repeat_nums_i:
-            for repeat_num_j in repeat_nums_j:
-                min_diff = min(abs(repeat_num_i - repeat_num_j), min_diff)
-        equal_score = -min_diff / 10.0 + (len(repeat_nums_i) + len(repeat_nums_j)) / 100.0
-        equal_score = max(min(0.0 if min_diff == 0 else -0.1, equal_score), -0.9)
-
-        # DK - just for now
-        equal_score = 0
-        
-        return equal_score
-    elif repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1]):
-        return -1
-    else:
-        return -2
-
-    
-"""
-Smith Waterman Algorithm
-"""
-def SW_alignment(allele_i, allele_j):
-    n, m = len(allele_i), len(allele_j)
-    a = [[-(i+j) if i == 0 or j == 0 else 0 for j in range(m + 1)] for i in range(n + 1)]
-
-    # Fill 2D array
-    for i in range(n):
-        repeat_i, repeat_nums_i = allele_i[i]
-        for j in range(m):
-            repeat_j, repeat_nums_j = allele_j[j]
-            equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j)
-            a[i+1][j+1] = max(a[i][j+1] - 1, a[i+1][j] - 1, a[i][j] + equal_score)
-
-    return a, n, m
-
-
-"""
-"""
-def combine_alleles(backbone_allele, add_allele):
-    allele_i, allele_j = backbone_allele, add_allele
-    a, n, m = SW_alignment(allele_i, allele_j)
-
-    # Back tracking
-    new_backbone_allele = []
-    i, j = n - 1, m - 1
-    while i >= 0 or j >= 0:
-        if i < 0:
-            repeat_j, repeat_nums_j = allele_j[j]
-            new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])])
-            j -= 1
-        elif j < 0:
-            repeat_i, repeat_nums_i = allele_i[i]
-            new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])])
-            i -= 1
-        else:
-            repeat_i, repeat_nums_i = allele_i[i]
-            repeat_j, repeat_nums_j = allele_j[j]    
-            equal_score = get_equal_score(repeat_i, repeat_nums_i, repeat_j, repeat_nums_j)
-            if a[i][j+1] - 1 == a[i+1][j+1]:
-                new_backbone_allele.append([repeat_i, repeat_nums_i | set([0])])
-                i -= 1
-            elif a[i+1][j] - 1 == a[i+1][j+1]:
-                new_backbone_allele.append([repeat_j, repeat_nums_j | set([0])])
-                j -= 1
-            else:
-                assert a[i][j] + equal_score == a[i+1][j+1]
-                if repeat_i == repeat_j:
-                    new_backbone_allele.append([repeat_i, repeat_nums_i | repeat_nums_j])
-                else:
-                    assert repeat_nums_i == repeat_nums_j
-                    assert repeat_nums_i == set([1])
-                    new_backbone_allele.append([repeat_i | repeat_j, repeat_nums_i | repeat_nums_j])
-                i -= 1
-                j -= 1
-
-    new_backbone_allele = new_backbone_allele[::-1]
-    return new_backbone_allele
-
-
-"""
-"""
-def msf_alignment(backbone_allele, allele):
-    allele_i, allele_j = backbone_allele, allele
-    a, n, m = SW_alignment(allele_i, allele_j)
-
-    # Back tracking
-    allele_seq, backbone_seq = "", ""
-    i, j = n - 1, m - 1
-    while i >= 0 or j >= 0:
-        assert i >= 0
-        repeats_i, repeat_nums_i = allele_i[i]
-        repeat_i = ""
-        max_repeat = ""
-        for repeat_str in repeats_i:
-            if len(repeat_str) > len(repeat_i):
-                repeat_i = repeat_str
-        repeat_num_i = max(repeat_nums_i)
-        if j < 0:
-            allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq
-            backbone_seq = repeat_i * repeat_num_i + backbone_seq
-            i -= 1
-        else:
-            repeats_j, repeat_nums_j = allele_j[j]
-            assert len(repeats_j) == 1 and len(repeat_nums_j) == 1
-            repeat_j, repeat_num_j = list(repeats_j)[0], list(repeat_nums_j)[0]
-            equal_score = get_equal_score(repeats_i, repeat_nums_i, repeats_j, repeat_nums_j)
-            if a[i][j+1] - 1 == a[i+1][j+1]:
-                allele_seq = '.' * (len(repeat_i) * repeat_num_i) + allele_seq
-                backbone_seq = repeat_i * repeat_num_i + backbone_seq
-                i -= 1
-            else:
-                assert a[i][j] + equal_score == a[i+1][j+1]
-                if repeat_i == repeat_j:
-                    add_seq = repeat_i * repeat_num_j
-                    dot_seq = '.' * (len(repeat_i) * (repeat_num_i - repeat_num_j))
-                    allele_seq = add_seq + dot_seq + allele_seq
-                    add_seq = repeat_i * repeat_num_i
-                    backbone_seq = add_seq + backbone_seq                    
-                else:
-                    assert repeat_nums_i == repeat_nums_j and repeat_nums_i == set([1])
-                    dot_seq = '.' * (len(repeat_i) - len(repeat_j))
-                    allele_seq = repeat_j + dot_seq + allele_seq
-                    backbone_seq = repeat_i + backbone_seq                    
-                i -= 1
-                j -= 1
-
-    return allele_seq, backbone_seq
-
-
-"""
-Extract multiple sequence alignments
-"""
-def extract_msa(base_dname,
-                base_fname,
-                locus_list,
-                min_freq,
-                verbose):    
-    # Download human genome and HISAT2 index
-    HISAT2_fnames = ["grch38",
-                     "genome.fa",
-                     "genome.fa.fai"]
-    if not typing_common.check_files(HISAT2_fnames):
-        typing_common.download_genome_and_index(ex_path)
-
-    # Load allele frequency information
-    allele_freq = {}
-    if min_freq > 0.0:
-        excel = openpyxl.load_workbook("hisatgenotype_db/CODIS/NIST-US1036-AlleleFrequencies.xlsx")
-        sheet = excel.get_sheet_by_name(u'All data, n=1036')
-        for col in range(2, 100):
-            locus_name = sheet.cell(row = 3, column = col).value
-            if not locus_name:
-                break
-            locus_name = locus_name.encode('ascii','ignore')
-            locus_name = locus_name.upper()
-            assert locus_name not in allele_freq
-            allele_freq[locus_name] = {}
-
-            for row in range(4, 101):
-                allele_id = sheet.cell(row = row, column = 1).value
-                allele_id = str(allele_id)
-                freq = sheet.cell(row = row, column = col).value
-                if not freq:
-                    continue
-                allele_freq[locus_name][allele_id] = float(freq)
-        excel.close()
-
-    CODIS_seq = orig_CODIS_seq
-    if len(locus_list) > 0:
-        new_CODIS_seq = {}
-        for locus_name, fields in CODIS_seq.items():
-            if locus_name in locus_list:
-                new_CODIS_seq[locus_name] = fields
-        CODIS_seq = new_CODIS_seq        
-
-    # Add some additional sequences to allele sequences to make them reasonably long for typing and assembly
-    for locus_name, fields in CODIS_seq.items():
-        _, left_seq, repeat_seq, right_seq = fields
-        allele_seq = left_seq + repeat_seq + right_seq
-        left_flank_seq, right_flank_seq = get_flanking_seqs(allele_seq)
-        CODIS_seq[locus_name][1] = left_flank_seq + left_seq
-        CODIS_seq[locus_name][3] = right_seq + right_flank_seq
-
-        print >> sys.stderr, "%s is found on the reference genome (GRCh38)" % locus_name
-    
-    for locus_name in CODIS_seq.keys():
-        alleles = []
-        for line in open("hisatgenotype_db/CODIS/codis.dat"):
-            locus_name2, allele_id, repeat_st = line.strip().split('\t')
-            if locus_name != locus_name2:
-                continue
-            if min_freq > 0.0:
-                assert locus_name in allele_freq
-                if allele_id not in allele_freq[locus_name] or \
-                   allele_freq[locus_name][allele_id] < min_freq:
-                    continue
-                
-            alleles.append([allele_id, repeat_st])
-
-        # From   [TTTC]3TTTTTTCT[CTTT]20CTCC[TTCC]2
-        # To     [['TTTC', [3]], ['TTTTTTCT', [1]], ['CTTT', [20]], ['CTCC', [1]], ['TTCC', [2]]]
-        def read_allele(repeat_st):
-            allele = []
-            s = 0
-            while s < len(repeat_st):
-                ch = repeat_st[s]
-                if ch == ' ':
-                    s += 1
-                    continue
-                assert ch in "[ACGT"
-                if ch == '[':
-                    s += 1
-                    repeat = ""
-                    while s < len(repeat_st):
-                        nt = repeat_st[s]
-                        if nt in "ACGT":
-                            repeat += nt
-                            s += 1
-                        else:
-                            assert nt == ']'
-                            s += 1
-                            break
-                    assert s < len(repeat_st)
-                    num = 0
-                    while s < len(repeat_st):
-                        digit = repeat_st[s]
-                        if digit.isdigit():
-                            num = num * 10 + int(digit)
-                            s += 1
-                        else:
-                            break
-                    assert num > 0
-                    allele.append([set([repeat]), set([num])])
-                else:
-                    repeat = ""
-                    while s < len(repeat_st):
-                        nt = repeat_st[s]
-                        if nt in "ACGT":
-                            repeat += nt
-                            s += 1
-                        else:
-                            assert nt == ' ' or nt == '['
-                            break
-                    allele.append([set([repeat]), set([1])])
-
-            # Sanity check
-            cmp_repeat_st = ""
-            for repeats, repeat_nums in allele:
-                repeat = list(repeats)[0]
-                repeat_num = list(repeat_nums)[0]
-                if repeat_num > 1 or locus_name == "D8S1179":
-                    cmp_repeat_st += "["
-                cmp_repeat_st += repeat
-                if repeat_num > 1 or locus_name == "D8S1179":
-                    cmp_repeat_st += "]%d" % repeat_num
-
-            assert repeat_st.replace(' ', '') == cmp_repeat_st.replace(' ', '')
-            return allele
-
-        alleles = [[allele_id, read_allele(repeat_st)] for allele_id, repeat_st in alleles]
-
-        def to_sequence(repeat_st):
-            sequence = ""
-            for repeats, repeat_nums in repeat_st:
-                repeat = list(repeats)[0]
-                repeat_num = list(repeat_nums)[0]
-                sequence += (repeat * repeat_num)
-            return sequence
-
-        def remove_redundant_alleles(alleles):
-            seq_to_ids = {}
-            new_alleles = []
-            for allele_id, repeat_st in alleles:
-                allele_seq = to_sequence(repeat_st)
-                if allele_seq in seq_to_ids:
-                    print >> sys.stderr, "Warning) %s: %s has the same sequence as %s" % \
-                        (locus_name, allele_id, seq_to_ids[allele_seq])
-                    continue
-                if allele_seq not in seq_to_ids:
-                    seq_to_ids[allele_seq] = [allele_id]
-                else:
-                    seq_to_ids[allele_seq].append(allele_id)         
-                new_alleles.append([allele_id, repeat_st])
-
-            return new_alleles
-
-        alleles = remove_redundant_alleles(alleles)
-
-        allele_seqs = [[allele_id, to_sequence(repeat_st)] for allele_id, repeat_st in alleles]
-
-        ref_allele_st, ref_allele_left, ref_allele, ref_allele_right = CODIS_seq[locus_name]
-        ref_allele_st = read_allele(ref_allele_st)
-        for allele_id, allele_seq in allele_seqs:
-            if ref_allele == allele_seq:
-                CODIS_ref_name[locus_name] = allele_id
-                break
-            
-        # Add GRCh38 allele
-        if locus_name not in CODIS_ref_name:
-            allele_id = "GRCh38"
-            CODIS_ref_name[locus_name] = allele_id
-            allele_seqs = [[allele_id, ref_allele]] + allele_seqs
-            alleles = [[allele_id, ref_allele_st]] + alleles
-
-        print >> sys.stderr, "%s: %d alleles with reference allele as %s" % (locus_name, len(alleles), CODIS_ref_name[locus_name])
-        if verbose:
-            print >> sys.stderr, "\t", ref_allele_left, ref_allele, ref_allele_right
-            for allele_id, allele in alleles:
-                print >> sys.stderr, allele_id, "\t", allele
-
-        # Create a backbone sequence
-        assert len(alleles) > 0
-        backbone_allele = deepcopy(alleles[-1][1])
-        for allele_id, allele_st in reversed(alleles[:-1]):
-            if verbose:
-                print >> sys.stderr
-                print >> sys.stderr, allele_id
-                print >> sys.stderr, "backbone         :", backbone_allele
-                print >> sys.stderr, "allele           :", allele_st
-            backbone_allele = combine_alleles(backbone_allele, allele_st)
-            msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st)
-            if verbose:                
-                print >> sys.stderr, "combined backbone:", backbone_allele
-                print >> sys.stderr, "msf_allele_seq  :", msf_allele_seq
-                print >> sys.stderr, "msf_backbone_seq:", msf_backbone_seq
-                print >> sys.stderr
-
-        allele_dic = {}
-        for allele_id, allele_seq in allele_seqs:
-            allele_dic[allele_id] = allele_seq
-
-        allele_repeat_msf = {}
-        for allele_id, allele_st in alleles:
-            msf_allele_seq, msf_backbone_seq = msf_alignment(backbone_allele, allele_st)
-            allele_repeat_msf[allele_id] = msf_allele_seq
-
-        # Sanity check
-        assert len(allele_dic) == len(allele_repeat_msf)
-        repeat_len = None
-        for allele_id, repeat_msf in allele_repeat_msf.items():
-            if not repeat_len:
-                repeat_len = len(repeat_msf)
-            else:
-                assert repeat_len == len(repeat_msf)
-
-        # Creat full multiple sequence alignment
-        ref_allele_id = CODIS_ref_name[locus_name]
-        allele_msf = {}
-        for allele_id, repeat_msf in allele_repeat_msf.items():
-            allele_msf[allele_id] = ref_allele_left + repeat_msf + ref_allele_right
-
-        # Make sure the length of allele ID is short, less than 20 characters
-        max_allele_id_len = max([len(allele_id) for allele_id in allele_dic.keys()])
-        assert max_allele_id_len < 20
-
-        # Write MSF (multiple sequence alignment file)
-        msf_len = len(ref_allele_left) + len(ref_allele_right) + repeat_len
-        msf_fname = "%s_gen.msf" % locus_name
-        msf_file = open(msf_fname, 'w')
-        for s in range(0, msf_len, 50):
-            for allele_id, msf in allele_msf.items():
-                assert len(msf) == msf_len
-                allele_name = "%s*%s" % (locus_name, allele_id)
-                print >> msf_file, "%20s" % allele_name,
-                for s2 in range(s, min(msf_len, s + 50), 10):
-                    print >> msf_file, " %s" % msf[s2:s2+10],
-                print >> msf_file
-
-            if s + 50 >= msf_len:
-                break
-            print >> msf_file
-        msf_file.close()
-
-        # Write FASTA file
-        fasta_fname = "%s_gen.fasta" % locus_name
-        fasta_file = open(fasta_fname, 'w')
-        for allele_id, allele_seq in allele_seqs:
-            gen_seq = ref_allele_left + allele_seq + ref_allele_right
-            print >> fasta_file, ">%s*%s %d bp" % (locus_name, allele_id, len(gen_seq))
-            for s in range(0, len(gen_seq), 60):
-                print >> fasta_file, gen_seq[s:s+60]
-        fasta_file.close()
-
-
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description="Extract multiple sequence alignments for DNA Fingerprinting loci")
-    parser.add_argument("-b", "--base",
-                        dest="base_fname",
-                        type=str,
-                        default="codis",
-                        help="base filename (default: codis)")
-    parser.add_argument("--locus-list",
-                        dest="locus_list",
-                        type=str,
-                        default="",
-                        help="base filename (default: empty)")
-    parser.add_argument("--min-freq",
-                        dest="min_freq",
-                        type=float,
-                        default=0.0,
-                        help="minimum allele frequency (default: 0.0)")    
-    parser.add_argument("-v", "--verbose",
-                        dest="verbose",
-                        action="store_true",
-                        help="also print some statistics to stderr")
-
-    args = parser.parse_args()
-    if args.base_fname.find('/') != -1:
-        elems = args.base_fname.split('/')
-        base_fname = elems[-1]
-        base_dname = '/'.join(elems[:-1])
-    else:
-        base_fname = args.base_fname
-        base_dname = ""
-    if args.locus_list != "":
-        locus_list = args.locus_list.split(',')
-    else:
-        locus_list = []
-        
-    extract_msa(base_dname,
-                base_fname,
-                locus_list,
-                args.min_freq,
-                args.verbose)
-
diff --git a/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py b/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py
deleted file mode 100755
index c17d86c5..00000000
--- a/hisatgenotype_scripts/hisatgenotype_extract_codis_data.py
+++ /dev/null
@@ -1,166 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import os, sys, subprocess, re
-import inspect, operator
-from argparse import ArgumentParser, FileType
-
-# sequences for DNA fingerprinting loci are available at http://www.cstl.nist.gov/biotech/strbase/seq_ref.htm
-
-CODIS_loci = ["CSF1PO", "FGA", "TH01", "TPOX", "VWA", "D3S1358", "D5S818", "D7S820", "D8S1179", "D13S317", "D16S539", "D18S51", "D21S11"]
-
-
-"""
-## Download variant information from website
-"""
-def get_html(url):
-    download_cmd = ["wget",
-                    "-O", "-",
-                    url]
-    proc = subprocess.Popen(download_cmd,
-                            stdout=subprocess.PIPE,
-                            stderr=open("/dev/null", 'w'))
-
-    output = ""
-    for line in proc.stdout:
-        output += line
-
-    return output
-
-
-"""
-Download CODIS data
-"""
-def download_codis(base_dname,
-                   base_fname,
-                   locus_list,
-                   verbose):    
-    # CODIS database base URL
-    base_url = "http://www.cstl.nist.gov/biotech/strbase"
-    
-    # Refer to Python's regular expression at https://docs.python.org/2/library/re.html
-    #   <td width="16%" align="center"><font size="4">47.2 </font> </td>
-    allele_re = re.compile('>(\d+\.?\d?\"?\'*\(?\d*\.?\d?\"?\'*\)?\*?)</')
-    #   <td width="35%"><font size="2">[TTTC]<sub>4</sub>TTTT TT<span style="mso-spacerun: yes"> </span>[CTTT]<sub>14</sub>[CTGT]<sub>3</sub>[CTTT]<sub>14 </sub>[CTTC]<sub>4</sub>[CTTT]<sub>3</sub>CTCC[TTCC]<sub>4</sub></font> </td>
-    # repeat_re = re.compile('^(\[[ACGT]+\]\d+|[ACGT]+)+$')
-    repeat_re = re.compile('^(\[[ACGT]+\]\d+|\[[ACGT]+\]|[ACGT]+|\s)+$')
-    # Remove extra tags
-    tag_re = re.compile('(<[^>]*>)')
-    nbsp_re = re.compile('&nbsp;')
-    quot_re = re.compile('&quot;')
-    codis_data_file = open(base_fname + ".dat", 'w')
-    for locus_name in CODIS_loci:
-        if len(locus_list) > 0 and locus_name not in locus_list:
-            continue
-        url = "%s/str_%s.htm" % (base_url, locus_name)
-        content = get_html(url).split("\r\n")
-        content = map(lambda x: x.strip(), content)
-        content2 = []
-        for line in content:
-            if line.startswith("<t") or \
-               line.startswith("</tr") or \
-               len(content2) == 0:
-                content2.append(line)
-            else:
-                content2[-1] += line
-                
-        content = content2
-        alleles = []
-        l = 0
-        while l < len(content):
-            line = content[l]
-            if line.startswith("<tr"):
-                l += 1
-                if l < len(content):
-                    line = content[l]
-                    line = re.sub(nbsp_re, '', line)
-                    line = re.sub(quot_re, "''", line)
-                    line = line.replace(' ', '')
-                    allele_match = allele_re.search(line)
-                    if not allele_match:
-                        continue
-                    allele_id = allele_match.group(1)                        
-                    l += 1
-                    repeat_match = None
-                    while l < len(content):
-                        line = content[l]                        
-                        if not line.startswith("<td"):
-                            break
-                        line = re.sub(tag_re, '', line)
-                        line = re.sub(nbsp_re, '', line)
-                        repeat_match = repeat_re.search(line)
-                        if repeat_match:
-                            break
-                        l += 1
-                        
-                    if not repeat_match:
-                        continue
-
-                    repeat_st = line
-                    alleles.append([allele_id, repeat_st])
-            else:
-                l += 1
-
-        for allele_id, repeat_st in alleles:
-            print >> codis_data_file, "%s\t%s\t%s" % (locus_name, allele_id, repeat_st)
-
-    codis_data_file.close()
-
-
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description="Extract multiple sequence alignments for DNA Fingerprinting loci")
-    parser.add_argument("-b", "--base",
-                        dest="base_fname",
-                        type=str,
-                        default="codis",
-                        help="base filename (default: codis)")
-    parser.add_argument("--locus-list",
-                        dest="locus_list",
-                        type=str,
-                        default="",
-                        help="base filename (default: empty)")    
-    parser.add_argument("-v", "--verbose",
-                        dest="verbose",
-                        action="store_true",
-                        help="also print some statistics to stderr")
-
-    args = parser.parse_args()
-    if args.base_fname.find('/') != -1:
-        elems = args.base_fname.split('/')
-        base_fname = elems[-1]
-        base_dname = '/'.join(elems[:-1])
-    else:
-        base_fname = args.base_fname
-        base_dname = ""
-    if args.locus_list != "":
-        locus_list = args.locus_list.split(',')
-    else:
-        locus_list = []
-        
-    download_codis(base_dname,
-                   base_fname,
-                   locus_list,
-                   args.verbose)
-
diff --git a/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py b/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py
deleted file mode 100755
index b0b4d039..00000000
--- a/hisatgenotype_scripts/hisatgenotype_extract_cyp_data.py
+++ /dev/null
@@ -1,1061 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2016, Raymon Cao <rcao5@jhu.edu> and Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT 2.
-#
-# HISAT 2 is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT 2 is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT 2.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import os, sys, subprocess, re
-import inspect, operator
-import glob
-from argparse import ArgumentParser, FileType
-
-
-global gene_names
-gene_names = ['cyp1a1','cyp1a2','cyp1b1','cyp2a6',
-              'cyp2a13','cyp2b6','cyp2c8','cyp2c9',
-              'cyp2c19','cyp2d6','cyp2e1','cyp2f1',
-              'cyp2j2','cyp2r1','cyp2S1','cyp2w1',
-              'cyp3a4','cyp3a5','cyp3a7','cyp3a43',
-              'cyp4a11','cyp4a22','cyp4b1','cyp4f2',
-              'cyp5a1','cyp8a1','cyp19a1','cyp21a2',
-              'cyp26a1']
-
-"""
-Download variant information from website database
-"""
-
-def get_html(url):
-    download_cmd = ["wget",
-                    "-O", "-",
-                    url]
-    proc = subprocess.Popen(download_cmd,
-                            stdout=subprocess.PIPE,
-                            stderr=open("/dev/null", 'w'))
-
-    output = ""
-    for line in proc.stdout:
-        output += line
-
-    return output
-
-
-def download_CYP(verbose):
-    print("Downloading data from:")
-    
-    # CYP database base URL
-    base_url = "http://www.cypalleles.ki.se"
-    
-    # Current script directory
-    curr_script = os.path.realpath(inspect.getsourcefile(download_CYP))
-    ex_path = os.path.dirname(curr_script)
-
-    # Refer to Python's regular expression at https://docs.python.org/2/library/re.html
-    cyp_re = re.compile('http://www.cypalleles.ki.se/cyp\w+.htm')
-    output = get_html(base_url)
-    cyp_urls = cyp_re.findall(output)
-    # Original list had duplicate urls, removes duplicates
-    cyp_urls = set(cyp_urls)
-
-    os.system('mkdir cyp_var_files')
-    for cyp_url in cyp_urls:
-        cyp_gene_name = cyp_url.split('/')[-1]
-        cyp_gene_name = cyp_gene_name.split('.')[0]
-        
-        # Hardcoded for cyp21 database (has inconsistant url naming) 
-        if cyp_gene_name.lower() == "cyp21".lower():
-            cyp_gene_name = cyp_gene_name + "a2" 
-
-        # Changed to match all instances of "cyp"
-        if not re.compile("cyp[\d\w]+", re.IGNORECASE).search(cyp_gene_name):
-            continue
-
-        # Open file to write on
-        cyp_file = open("cyp_var_files/%s.var" % (cyp_gene_name), 'w')
-        
-        print >> sys.stderr, cyp_url, cyp_gene_name
-        print >> cyp_file, cyp_url, cyp_gene_name
-
-        cyp_output = get_html(cyp_url)
-        if cyp_output == "":
-            continue
-
-        listA = cyp_output.split("<tr style=")
-
-        indStart = -1
-        foundStart = False
-        while not foundStart:
-            indStart += 1
-            foundStart = (cyp_gene_name + '*').upper() in listA[indStart].upper()
-            
-        # Look for first occurance of "[cyp_gene_name]*"
-        listA = listA[indStart:]
-
-        # Look for last occurance of "[cyp_gene_name]*"
-        indEnd = 0
-        foundEnd = False
-        while not foundEnd:
-            indEnd -= 1
-            foundEnd = (cyp_gene_name + '*').upper()  in listA[indEnd].upper()
-
-        listA = listA[:(indEnd + 1)]
-        
-        for itemA in listA:
-            tabRow = itemA.split("</td>")
-            for ind in range(len(tabRow)):
-                tabRow[ind] = tabRow[ind].replace("\r\n","")
-
-            allele_name_re = re.compile(cyp_gene_name.upper() + '\*[\w\d]+')
-            varInfo_re = re.compile('-?\d+[ACGT]\&gt;[ACGT]|-?\d+_?-?\d+?del[ACGT]+|-?\d+_?-?\d+?ins[ACGT]+|None')
-
-            alleleName = allele_name_re.findall(tabRow[0])
-            if len(alleleName) > 0:
-                alleleName = alleleName[0]
-
-            # @RaymonFix - some databases have extra table, ignores headers (CYP2A6)
-            # @Daehwan - some databases (e.g. http://www.cypalleles.ki.se/cyp3a4.htm)
-            #            have 2 rows of Nucleotide changes (cDNA and Gene), might need
-            #            to look at all rows for snps
-            #
-            # @RaymonFix - look in 4th column for "Gene" nt changes first, then consider cDNA if applicable; updated re to remove "<>" formating expressions 
-
-            if cyp_url == 'http://www.cypalleles.ki.se/cyp21.htm': # Hardcoded for special format for cyp21a2
-                try:
-                    varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[1]))
-                except IndexError:
-                    continue
-                
-            else:
-                try:
-                    varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[3]))
-                    if len(varInfo) == 0:
-                        varInfo = varInfo_re.findall(re.sub('<[^>]+>', '',tabRow[2]))
-                except IndexError:
-                    continue
-
-            for varInd in range(len(varInfo)):
-                varInfo[varInd] = varInfo[varInd].replace('&gt;','>')
-
-            if 'None' in varInfo:
-                try:
-                    assert len(varInfo) == 1
-                except:
-                    varInfo = filter(lambda a: a != 'None', varInfo)
-                
-        
-            if isinstance(alleleName, basestring):
-                print >> cyp_file, (str(alleleName) + "\t" + ','.join(varInfo))
-            
-        cyp_file.close()
-
-         
-"""
-Make MSF files from variants
-"""
-
-def checkNTloc(fasta_fileName,var_fileName,gene_name):
-    print "\nGene: %s" % gene_name
-    seq = ""
-    for line in open(fasta_fileName,'r'):
-        if line[0] == '>':
-            continue
-        seq += line.strip()
-
-    cyp_var_file = open(var_fileName,'r')
-    cyp_var_dict = makeVarDict(cyp_var_file)
-    cyp_var_file.close()
-
-    print "len:", len(seq)
-    varsPos = set()
-    varsNeg = set()
-
-    for varList in cyp_var_dict.values():
-        for var in varList:
-            if ">" in var: # is SNP
-                posNt = int(var[:-3])
-                ntChange = var[-3:].replace('>','')
-                assert len(ntChange) == 2
-                for nt in ntChange:
-                    assert nt in "ACGT"
-
-                if posNt > 0:
-                    varsPos.add(str(posNt) + '->' + ntChange[0])
-                else:
-                    assert posNt < 0
-                    varsNeg.add(str(posNt) + '->' + ntChange[0])
-                    
-            elif "del" in var: # is deletion
-                posNt = var.split('del')[0].split('_')
-                posNt = [int(p) for p in posNt]
-                ntDel = var.split('del')[1]
-                for nt in ntDel:
-                    assert nt in "ACGT"
-
-                if len(posNt) == 1: # single nt deletion
-                    assert len(ntDel) == 1
-                    if posNt[0] > 0:
-                        varsPos.add(str(posNt[0]) + '->' + ntDel)
-                    else:
-                        assert posNt[0] < 0
-                        varsNeg.add(str(posNt[0]) + '->' + ntDel)
-
-                else: # mutliple nt deletion
-                    assert len(posNt) == 2
-                    try:
-                        assert posNt[1] - posNt[0] + 1 == len(ntDel)
-                    except AssertionError:
-                        print "Incorrect deletion format: %s , skipping variation" % (var)
-                        '''sys.exit(1)'''
-                        continue
-                    ntDelList = list(ntDel)
-                    for i in range(posNt[0],posNt[1] + 1):
-                        if i > 0:
-                            varsPos.add(str(i) + '->' + ntDelList.pop(0))
-                        else:
-                            assert i < 0
-                            varsNeg.add(str(i) + '->' + ntDelList.pop(0))
-                    assert len(ntDelList) == 0
-                    
-            else:
-                assert ("ins" in var) or ("None" in var)
-                continue
-    
-    scorePos = {} # { position offset : number of alignments } for positive positions
-    for i in range(-len(seq), len(seq)):
-        align_score = 0
-        for var in varsPos:
-            pos, base = var.split('->')
-            pos = int(pos)
-            
-            try:
-                seq[pos+i]
-            except IndexError:
-                continue
-            
-            if seq[pos+i] == base:
-                align_score += 1
-
-        scorePos[i] = align_score
-    oSetPos = max(scorePos.iteritems(), key=operator.itemgetter(1))[0]
-    print "Positive postitions offset: %d" % oSetPos
-    print "Score: %d out of %d\n" % (scorePos[oSetPos], len(varsPos))
-    
-
-    print "Checking negative position offset: %d" % (oSetPos + 1)
-    align_score = 0
-    oSetNeg = oSetPos + 1
-    for var in varsNeg:
-        pos, base = var.split('->')
-        pos = int(pos)
-        
-        try:
-            seq[pos + oSetNeg]
-        except IndexError:
-            continue
-        
-        if seq[pos + oSetNeg] == base:
-            align_score += 1
-    print "Score: %d out of %d\n\n" % (align_score, len(varsNeg))
-
-    if len(varsNeg) == 0 and len(varsPos) != 0:
-        return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), 1.0, float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg))
-    elif len(varsNeg) != 0 and len(varsPos) == 0:
-        return oSetPos, oSetNeg, 1.0, float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg))
-    elif len(varsNeg) == 0 and len(varsPos) == 0:
-        return oSetPos, oSetNeg, 1.0, 1.0, 1.0
-    else:
-        assert len(varsNeg) != 0 and len(varsPos) != 0
-        return oSetPos, oSetNeg, float(scorePos[oSetPos])/float(len(varsPos)), float(align_score)/float(len(varsNeg)), float(scorePos[oSetPos] + align_score)/float(len(varsPos) + len(varsNeg))
-        
-
-def create_map(seq):
-    seq_map = {}
-    count = 0
-    for i in range(len(seq)):
-        bp = seq[i]
-        if bp == '.':
-            continue
-        assert bp.upper() in "ACGT"
-        seq_map[count] = i
-        count += 1
-    return seq_map
-
-def splitString(someStr,posList):
-    posList.insert(0,-1)
-    posList.append(len(someStr) - 1)
-    splitStr = []
-    for i in range(len(posList) - 1):
-        left = posList[i] + 1
-        right = posList[i+1] + 1
-        splitStr.append(someStr[left:right])
-
-    return splitStr
-
-def extractSeq(faFile):
-    seq = ""
-    for line in faFile:
-        if line.startswith(">"):
-            continue
-
-        seq += line.strip()
-
-    return seq
-
-def makeVarDict(fname):
-    alleleVarDict = {}
-
-    allLines = [line.strip() for line in fname]
-
-    ref_al_id_present = False
-    for line in allLines[1:]:
-        if 'None' in line:
-            ref_al_id_present = True
-
-    line_num = 0
-    for line in allLines[1:]:
-        line_num += 1
-        assert line.upper().startswith("CYP")
-        alleleName = line.split("\t")[0].upper()
-
-        if (not ref_al_id_present) and line_num == 1:
-            varList = ['None']            
-        else:
-            try:
-                varList = line.split("\t")[1].split(',')
-            except IndexError:
-                continue
-        
-        try:
-            assert not alleleName in alleleVarDict
-            alleleVarDict[alleleName] = set(varList)
-        except:
-            print >> sys.stdout, ("Warning, %s allele is already represented" % alleleName)
-            alleleVarDict[alleleName] = alleleVarDict[alleleName] | set(varList)
-
-    return alleleVarDict
-
-def makeSnp(oldSeq, pos, oldNt, newNt):
-    assert oldSeq[pos] == oldNt
-    newSeq = oldSeq[:pos] + newNt + oldSeq[pos+1:]
-    assert len(newSeq) == len(oldSeq)
-    return newSeq
-
-def makeDel(oldSeq, left, right, toDel):
-    assert right - left + 1 == len(toDel)
-    assert oldSeq[left:right + 1] == toDel
-    newSeq = oldSeq[:left] + '.'*len(toDel) + oldSeq[right + 1:]
-    assert len(newSeq) == len(oldSeq)
-    return newSeq
-    
-def makeIns(oldSeq,left,right,toIns):
-    assert right - left - 1 >= len(toIns)
-    for nt in oldSeq[left + 1:right]:
-      assert nt == '.'
-    remDots = right - left - 1 - len(toIns)
-    newSeq = oldSeq[:left + 1] + toIns + '.'*remDots + oldSeq[right:]
-    assert len(newSeq) == len(oldSeq)
-    return newSeq
-    
-
-def makeMSF(gene_name, oSetPos, oSetNeg):
-    cyp_var_file = open("cyp_var_files/%s.var" % gene_name,'r')
-    cyp_var_dict = makeVarDict(cyp_var_file)
-    cyp_var_file.close()
-
-    if len(cyp_var_dict) < 2:
-        print('\tOnly reference allele included, skipping gene')
-        return
-
-    try:
-        blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name)
-        if len(blast_allele_var) > 0:
-            cyp_var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var)
-    except IOError:
-        print('\t%s blast file was skipped.' % gene_name)
-
-    cyp_faFile = open("cyp_fasta/%s.fasta" % gene_name,'r')
-    cyp_seq = extractSeq(cyp_faFile)
-    cyp_faFile.close()
-    preBackbone_seq = ''
-    
-
-    msfTable = {}
-
-    # Building backbone structure (augment length with insertions)
-    longestIns = {} # { key = position : value = length }
-    for allele,varList in cyp_var_dict.items():
-        for var in varList:
-            if not "ins" in var:
-                continue
-            pos = var.split('ins')[0].split('_')
-            pos = [int(p) for p in pos]
-            ntIns = var.split('ins')[1]
-            correctFormat = len(pos) == 2 and pos[1] - pos[0] == 1
-            if not correctFormat:
-                correctFormat = len(pos) == 1
-            try:
-                assert correctFormat
-            except:
-                print >> sys.stdout, "\tIncorrect format for insertion: variation %s on allele %s" % (var, allele)
-                continue
-
-            # convert to position in string
-            if not 'GRCH38' in allele:
-                if pos[0] > 0:
-                    pos = pos[0] + oSetPos
-                else:
-                    pos = pos[0] + oSetNeg
-            else:
-                pos = pos[0]
-                
-            # Make dictionary of longest insertions
-            if not pos in longestIns:
-                longestIns[pos] = len(ntIns)
-            else:
-                if len(ntIns) > longestIns[pos]:
-                    longestIns[pos] = len(ntIns)
-    
-    posInsList = sorted(longestIns.keys())
-    
-    splitSeq = splitString(cyp_seq,posInsList)
-    posInsList = posInsList[1:-1]
-
-    for i in range(len(posInsList)):
-        splitSeq[i] += '.' * longestIns[posInsList[i]]
-
-    for subseq in splitSeq:
-        try:
-            assert len(subseq) > 0 and not subseq.startswith('.')
-            preBackbone_seq += subseq
-        except:
-            continue
-    # pre-backbone built
-
-
-    map_cyp = create_map(preBackbone_seq) # { Index of bp in original seq : Actual index in string }
-    
-
-    for allele,varList in cyp_var_dict.items():
-        for var in varList:
-            isSnp = False
-            isDel = False
-            isIns = False
-        
-            if ">" in var:
-                isSnp = True
-            elif "del" in var:
-                isDel = True
-            elif "ins" in var:
-                isIns = True
-            else:
-                assert("None" in var)
-                isRef = True
-
-            if isSnp:
-                pos = int(var[:-3])
-                dbPos = pos
-                ntChange = var[-3:].replace('>','')
-                assert len(ntChange) == 2
-                for nt in ntChange:
-                    assert nt in "ACGT"
-
-                if not 'GRCH38' in allele:
-                    if pos > 0:
-                        pos = pos + oSetPos
-                    else:
-                        pos = pos + oSetNeg
-
-                if pos < 0 or pos > len(cyp_seq) - 1:
-                    print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos)
-                    print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
-                    continue
-                    
-                try:
-                    assert(preBackbone_seq[map_cyp[pos]] == ntChange[0]) # nt at pos in seq must match database
-                except:
-                    print >> sys.stdout, "\tWarning: position %d in sequence contains %s, but expected %s from database" % (dbPos, preBackbone_seq[map_cyp[pos]], ntChange[0])
-                    print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
-                    continue
-                
-                # Adding to msf table
-                if not allele in msfTable:
-                    msfTable[allele] = makeSnp(preBackbone_seq, map_cyp[pos], ntChange[0], ntChange[1])
-                else:
-                    msfTable[allele] = makeSnp(msfTable[allele], map_cyp[pos], ntChange[0], ntChange[1])
-                    
-            elif isDel:
-                pos = var.split('del')[0].split('_')
-                pos = [int(p) for p in pos]
-                if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707])  
-                    pos.append(pos[0])
-                assert len(pos) == 2
-                dbPos = pos
-                ntDel = var.split('del')[1]
-                for nt in ntDel:
-                    assert nt in "ACGT"
-
-                if not 'GRCH38' in allele:
-                    for i in range(len(pos)):
-                        if pos[i] > 0:
-                            pos[i] = pos[i] + oSetPos
-                        else:
-                            pos[i] = pos[i] + oSetNeg
-
-                skipDel = False
-                for i in range(len(pos)):
-                    if pos[i] < 0 or pos[i] > len(cyp_seq) - 1:
-                        print >> sys.stdout, "\tWarning: position %d out of bounds" % (dbPos[i])
-                        print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
-                        skipDel = True
-
-                if skipDel:
-                    continue
-                        
-            
-                try:
-                    assert pos[1] - pos[0] + 1 == len(ntDel)
-                except:
-                    print >> sys.stdout, "\tIncorrect deletion data with %s on allele %s. Skipping variation." % (var, allele)
-                    continue
-                            
-                try:
-                    assert preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ] == ntDel
-                except:
-                    print >> sys.stdout, "\tWarning, positions %d to %d in sequence contains %s, but expected %s from database" % \
-                          (dbPos[0], dbPos[1], preBackbone_seq[ map_cyp[pos[0]] : map_cyp[pos[1]] + 1 ], ntDel)
-                    print >> sys.stdout, "\t\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
-                    continue
-
-
-                # Adding to msf table
-                if not allele in msfTable:
-                    msfTable[allele] = makeDel(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntDel)
-                else:
-                    msfTable[allele] = makeDel(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntDel)
-
-                        
-            elif isIns:
-                pos = var.split('ins')[0].split('_')
-                pos = [int(p) for p in pos]
-                if len(pos) == 1:
-                    pos.append(pos[0] + 1)
-                assert len(pos) == 2
-                dbPos = pos
-                try:
-                    assert pos[1] - pos[0] == 1
-                except AssertionError:
-                    print >> sys.stdout, "\tIncorrect insertion data with %s on allele %s. Skipping variation." % (var, allele)
-                    continue 
-                ntIns = var.split('ins')[1]
-                for nt in ntIns:
-                    assert nt in "ACGT"
-
-                if not 'GRCH38' in allele:
-                    for i in range(len(pos)):
-                        if pos[i] > 0:
-                            pos[i] = pos[i] + oSetPos
-                        else:
-                            pos[i] = pos[i] + oSetNeg
-
-                skipIns = False
-                for i in range(len(pos)):
-                    if pos[i] < 0 or pos[i] > len(cyp_seq) - 1:
-                        print >> sys.stdout, "Warning: position %d out of bounds" % (dbPos[i])
-                        print >> sys.stdout, "\tError occured on variation %s on allele %s. Skipping variation." % (var, allele)
-                        skipIns = True
-
-                if skipIns:
-                    continue
-
-
-                # Adding to msf table
-                if not allele in msfTable:
-                    msfTable[allele] = makeIns(preBackbone_seq, map_cyp[pos[0]], map_cyp[pos[1]], ntIns)
-                else:
-                    msfTable[allele] = makeIns(msfTable[allele], map_cyp[pos[0]], map_cyp[pos[1]], ntIns)
-
-
-            else:
-                assert isRef
-                assert not allele in msfTable
-                msfTable[allele] = preBackbone_seq
-
-    # Sanity checking
-    seq_len = 0
-    for allele, msf_seq in msfTable.items():
-        if seq_len == 0:
-            seq_len = len(msf_seq)
-        else:
-            assert seq_len == len(msf_seq)
-    assert seq_len > 0
-
-    # Follow MSF style of IMGT/HLA database
-    msfFile = open('cyp_msf/%s_gen.msf' % gene_name[3:].upper(),'w')
-    for i in range(0, seq_len, 50):
-        for allele, msf_seq in msfTable.items():
-            output = "%12s" % allele[3:].upper()
-            for j in range(i, i+50, 10):
-                if j >= seq_len:
-                    break
-                if j == i:
-                    output += "\t"
-                else:
-                    output += " "
-                output += msf_seq[j:j+10]
-            print >> msfFile, output
-        print >> msfFile
-
-    msfFile.close()
-
-
-def build_msf_files():
-    os.system('mkdir cyp_msf')
-
-    oSetPos = 0
-    oSetNeg = 0
-    oSetScorePos = 0.0
-    oSetScoreNeg = 0.0
-    tot_score = 0.0
-        
-    print('\nBuilding MSF files:')
-    for gene_name in gene_names:
-        oSetPos, oSetNeg, oSetScorePos, oSetScoreNeg, tot_score = checkNTloc("cyp_fasta/%s.fasta" % gene_name,"cyp_var_files/%s.var" % gene_name,gene_name)
-        if not (tot_score >= 0.95):
-            print "\tLess than 95% match, skipping gene."
-            continue
-        
-        makeMSF(gene_name, oSetPos, oSetNeg)
-
-
-'''
-Check MSF files against variants files
-'''
-
-global incorrect_msf_entries
-incorrect_msf_entries = []
-
-def create_inv_map(seq):
-    seq_map = {}
-    count = 0
-    for i in range(len(seq)):
-        bp = seq[i]
-        if bp == '.':
-            continue
-        assert bp.upper() in "ACGT"
-        seq_map[i] = count
-        count += 1
-    return seq_map
-
-def readMSF(msf_fname): # { Allele name : MSF sequence }
-    msf_dict = {}
-    all_lines = [line for line in msf_fname]
-    for line in all_lines:
-        line = line.strip().replace(' ','')
-        if len(line) == 0 : continue
-        allele_name = 'CYP' + line.split('\t')[0]
-        msf_seq = line.split('\t')[1]
-        if not allele_name in msf_dict:
-            msf_dict[allele_name] = msf_seq
-        else:
-            msf_dict[allele_name] = msf_dict[allele_name] + msf_seq
-
-    return msf_dict
-
-def msf_removeIns(ref_seq, al_seq):
-    assert len(ref_seq) == len(al_seq)
-    ins_ind_list = []
-    for i in range(len(ref_seq)):
-        if ref_seq[i] == '.':
-            ins_ind_list.append(i)
-
-    ori_ref_seq = ref_seq.replace('.','')
-    ori_al_seq = list(al_seq)
-
-    for i in ins_ind_list:
-        ori_al_seq[i] = '-'
-
-    ori_al_seq = ''.join(ori_al_seq).replace('-','')
-
-    assert len(ori_ref_seq) == len(ori_al_seq)
-    return ori_ref_seq, ori_al_seq
-
-def msfToVarList(ref_seq, al_seq):
-    var_list = []
-    
-    assert len(ref_seq) == len(al_seq)
-    for bp in ref_seq: assert bp in "ACGT."
-    for bp in al_seq: assert bp in "ACGT."
-    inv_map = create_inv_map(ref_seq)
-    
-    ins_re = re.compile('[ACGT]\.+')
-    ins_subStrPos = [(m.start(0), m.end(0)) for m in re.finditer(ins_re, ref_seq)] # list of duples of indicies of insertions in ref_seq
-    ins_pos_length = [(tup[0], tup[1] - tup[0] - 1) for tup in ins_subStrPos]
-
-    for tup in ins_pos_length:
-        ins_pos, ins_length = tup[0], tup[1]
-        ins_seq = al_seq[ins_pos + 1: ins_pos + ins_length  + 1]
-        ins_seq = ins_seq.replace('.','')
-        if len(ins_seq) == 0:
-            continue
-        ins_str_data = str(inv_map[tup[0]]) + '_' + str(inv_map[tup[0]] + 1) + 'ins' + ins_seq
-        var_list.append(ins_str_data)
-
-    # insertions finished
-    
-    ori_ref_seq, ori_al_seq = msf_removeIns(ref_seq, al_seq)
-
-    for i in range(len(ori_ref_seq)):
-        if ori_al_seq[i] == '.':
-            continue 
-        elif ori_al_seq[i] != ori_ref_seq[i]: # snp
-            var_list.append(str(i) + ori_ref_seq[i] + '>' + ori_al_seq[i])
-
-    del_subStrPos = [(m.start(0), m.end(0)) for m in re.finditer(ins_re, ori_al_seq)] # list of duples of indicies of deletions in ori_al_seq
-    del_pos_length = [(tup[0], tup[1] - tup[0] - 1) for tup in del_subStrPos]
-
-    for tup in del_pos_length:
-        del_pos, del_length = tup[0], tup[1]
-        del_seq = ori_ref_seq[del_pos + 1 : del_pos + del_length + 1]
-        if del_length == 1:
-            assert len(del_seq) == 1
-            del_str_data = str(tup[0] + 1) + 'del' + del_seq
-        else:
-            del_str_data = str(tup[0] + 1) + '_' + str(tup[0] + tup[1]) + 'del' + del_seq
-        var_list.append(del_str_data)
-
-    # deletions finished
-
-    return var_list
-
-def checkMSFfile(gene_name, msf_fname, var_fname, fasta_filename):
-    oSetPos, oSetNeg, oSet_pos_score, oSet_neg_score, tot_score = checkNTloc(fasta_filename, var_fname, gene_name)
-    
-    try:
-        msf_file = open(msf_fname,'r')
-        msf_dict = readMSF(msf_file) # { Allele name : MSF sequence }
-        msf_file.close()
-    except IOError:
-        print("\t%s msf file was skipped.\n" % (gene_name))
-        return
-
-    var_file = open(var_fname,'r')
-    var_dict = makeVarDict(var_file)
-    var_file.close()
-
-    try:
-        blast_allele_var = extract_var_from_blast('cyp_blast_alignment/%s_blast.align' % gene_name)
-        if len(blast_allele_var) > 0:
-            var_dict[gene_name.upper() + '*REFGRCH38P7'] = set(blast_allele_var)
-    except IOError:
-        print('\t%s blast file was skipped.' % gene_name)
-    
-    fa_file = open(fasta_filename,'r')
-    oriSeq = extractSeq(fa_file)
-    fa_file.close()
-
-
-    # Find reference allele
-    ref_allele = ''
-    for allele_name in var_dict.keys():
-        if len(var_dict[allele_name]) == 1 and list(var_dict[allele_name])[0] == "None":
-            assert ref_allele == ''
-            ref_allele = allele_name
-    assert not ref_allele == ''
-
-
-    # Check if ref allele seq in msf matches fasta
-    assert ref_allele in msf_dict
-
-    try:
-        assert msf_dict[ref_allele].replace('.','') == oriSeq
-        print("Sequences match for reference allele %s" % ref_allele)
-    except AssertionError:
-        print("Warning: sequences do not match for reference allele %s" % ref_allele)
-        sys.exit(1)
-
-
-    # Check all alleles are included
-    try:
-        assert set([k.upper() for k in msf_dict.keys()]).issubset(set([k.upper() for k in var_dict.keys()]))
-    except AssertionError:
-        print("Extra alleles in MSF!\n")
-        print(sorted(msf_dict.keys()))
-        print("\n\n")
-        print(sorted(var_dict.keys()))
-        sys.exit(1)
-
-
-    # Convert from database positions to sequence positions (using offset)
-    for allele, var_list in var_dict.items():
-        oSet_var_list = []
-        for var in var_list:
-            if '>' in var: # snp
-                pos = int(var.split('>')[0][:-1])
-                ntSnp = [var.split('>')[0][-1]]
-                ntSnp.append(var.split('>')[1])
-                assert len(ntSnp) == 2
-                if not 'GRCH38' in allele:
-                    if pos > 0:
-                        pos = pos + oSetPos
-                    else:
-                        pos = pos + oSetNeg
-
-                if pos < 0 or pos > len(oriSeq) - 1: # out of bounds
-                    continue
-                if oriSeq[pos] != ntSnp[0]: # mismatch
-                    print('\tMismatch on variation %s' % var)
-                    continue
-
-                oSet_var = str(pos) + ntSnp[0] + '>' + ntSnp[1]
-                oSet_var_list.append(oSet_var)
-
-            elif 'del' in var: # deletion
-                pos = var.split('del')[0].split('_')
-                pos = [int(p) for p in pos]
-                if len(pos) == 1: # Handle single deletion with format for multi deletion with one location (e.g. [1707] -> [1707,1707])  
-                    pos.append(pos[0])
-                assert len(pos) == 2
-                ntDel = var.split('del')[1]
-                for nt in ntDel:
-                    assert nt in "ACGT"
-
-                skipDel = False
-                if not 'GRCH38' in allele:
-                    for i in range(len(pos)):
-                        if pos[i] > 0:
-                            pos[i] = pos[i] + oSetPos
-                        else:
-                            pos[i] = pos[i] + oSetNeg
-                        if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds
-                            skipDel = True
-                if (oriSeq[ pos[0] : pos[1] + 1 ] != ntDel): # mismatch
-                    print('\tMismatch on variation %s' % var)
-                    continue
-
-                if skipDel:
-                    continue
-
-                assert pos[1] - pos[0] + 1 == len(ntDel)
-
-                oSet_var = 'del' + ntDel
-                if pos[0] == pos[1]:
-                    oSet_var = str(pos[0]) + oSet_var
-                else:
-                    oSet_var = str(pos[0]) + '_' + str(pos[1]) + oSet_var
-
-                oSet_var_list.append(oSet_var)                        
-
-            elif 'ins' in var: # insertion
-                pos = var.split('ins')[0].split('_')
-                pos = [int(p) for p in pos]
-                if len(pos) == 1:
-                    pos.append(pos[0] + 1)
-                assert len(pos) == 2
-                try:
-                    assert pos[1] - pos[0] == 1
-                except AssertionError:
-                    print('\tIncorrect insertion format on variation %s' % var)
-                    continue
-                ntIns = var.split('ins')[1]
-                for nt in ntIns:
-                    assert nt in "ACGT"
-
-                skipIns = False
-                if not 'GRCH38' in allele:
-                    for i in range(len(pos)):
-                        if pos[i] > 0:
-                            pos[i] = pos[i] + oSetPos
-                        else:
-                            pos[i] = pos[i] + oSetNeg
-                        if pos[i] < 0 or pos[i] > len(oriSeq) - 1: # out of bounds
-                            skipIns = True
-
-                if skipIns:
-                    continue
-
-                oSet_var = str(pos[0]) + '_' + str(pos[1]) + 'ins' + ntIns
-                oSet_var_list.append(oSet_var)
-
-            else:
-                assert allele == ref_allele
-                assert var == 'None'
-                assert len(oSet_var_list) == 0
-                oSet_var_list.append('None')
-
-        var_dict[allele] = set(oSet_var_list)
-
-    # Check variants created from MSF file against variants list
-    num_correct_alleles = 0
-    for allele, msf_seq in msf_dict.items():
-        if allele == ref_allele:
-            num_correct_alleles += 1
-            continue
-        msf_var_list = msfToVarList(msf_dict[ref_allele], msf_seq)
-        '''print('\t' + str(var_dict[allele] == set(msf_var_list)) + '\t' + str(allele) + '\t' + str(msf_var_list))'''
-
-        try:
-            assert var_dict[allele] == set(msf_var_list)
-            num_correct_alleles += 1
-        except AssertionError:
-            incorrect_msf_entries.append(allele)
-            print('\n')
-            print('\t\tVar File:\t' + str(var_dict[allele]))
-            print('\t\tMSF File:\t' + str(set(msf_var_list)))
-            print('\t\tDifference:\t' + str(var_dict[allele] - set(msf_var_list)) + '\n')
-            '''sys.exit(1)'''
-
-    print("\t%d out of %d alleles have correct msf sequences\n" % (num_correct_alleles, len(msf_dict)))
-
-def check_msf_files():
-    print("\nChecking MSF files:")
-
-    for gene_name in gene_names:
-        checkMSFfile(gene_name, 'cyp_msf/%s_gen.msf' % gene_name[3:].upper(), 'cyp_var_files/%s.var' % gene_name, 'cyp_fasta/%s.fasta' % gene_name)
-
-    print('\n\n%d incorrect msf entries on alleles %s\n' % (len(incorrect_msf_entries), str(incorrect_msf_entries)))
-
-
-"""
-Write allele sequences to fasta for each gene
-"""
-
-def writeGenFasta(gene_name, msf_fname, line_length):
-    try:
-        msf_file = open(msf_fname,'r')
-        msf_seq_dict = readMSF(msf_file)
-        msf_file.close()
-    except IOError:
-        print("\t%s msf file was skipped." % (gene_name))
-        return
-
-    gen_fasta_file = open('gen_fasta/%s_gen.fasta' % gene_name[3:].upper(), 'w')
-    
-    for allele, seq in msf_seq_dict.items():
-        seq = seq.replace('.','')
-        print >> gen_fasta_file, ('>' + allele[3:].upper() + ' ' + str(len(seq)) + ' bp')
-        seq_lines = [seq[i:i+line_length] for i in range(0, len(seq), line_length)]
-        print >> gen_fasta_file, ('\n'.join(seq_lines))
-
-    gen_fasta_file.close()
-    print('%s_gen.fasta completed' % gene_name)
-
-def build_gen_fasta_files():
-    os.system('mkdir gen_fasta')
-
-    print("\nBuilding alleles sequence fasta files:")
-    for gene_name in gene_names:
-        writeGenFasta(gene_name, 'cyp_msf/%s_gen.msf' % gene_name[3:].upper(), 60)
-
-
-"""
-Run script
-"""
-
-def extract_cyp_data():
-    download_CYP(True)
-    build_msf_files()
-    check_msf_files()
-    build_gen_fasta_files()
-
-####################################################################################################
-## Debuging BLASTN alignment ref alleles
-
-def adjust_blast_vars(blast_vars_list,qry_pos):
-    if len(blast_vars_list) == 0:
-        return []
-
-    qry_pos = qry_pos - 1
-    adj_blst_var_list = []
-
-    for var in blast_vars_list:
-        if '>' in var: # SNP
-            old_pos = int(var[:-3])
-            adj_var = str(old_pos + qry_pos) + var[-3:]
-            adj_blst_var_list.append(adj_var)
-        elif 'del' in var: # deletion
-            old_pos = var.split('del')[0].split('_')
-            old_pos = [int(i) for i in old_pos]
-            old_pos = [i + qry_pos for i in old_pos]
-            if len(old_pos) == 1:
-                adj_var = str(old_pos[0]) + 'del' + var.split('del')[1]
-            else:
-                assert len(old_pos) == 2
-                adj_var = str(old_pos[0]) + '_' + str(old_pos[1]) + 'del' + var.split('del')[1]
-            adj_blst_var_list.append(adj_var)
-        else: # insertion
-            assert 'ins' in var
-            old_pos = var.split('ins')[0].split('_')
-            old_pos = [int(i) for i in old_pos]
-            old_pos = [i + qry_pos for i in old_pos]
-            assert len(old_pos) == 2 and (old_pos[1] - old_pos[0] == 1)
-            adj_var = str(old_pos[0]) + '_' + str(old_pos[1]) + 'ins' + var.split('ins')[1]
-            adj_blst_var_list.append(adj_var)
-
-    return adj_blst_var_list
-
-def extract_var_from_blast(cyp_blast_fname):
-    blastn_file = open(cyp_blast_fname,'r')
-    all_lines = [line.strip() for line in blastn_file if not (len(line.strip()) == 0 or line.strip().startswith('|'))]
-    blastn_file.close()
-
-    id_match = [m.group(0) for l in all_lines[0:25] for m in [re.compile('.*(Identities.*).*').search(l)] if m][0]
-    id_match = id_match.split('%')[0].split(' (')[0].split('= ')[1].split('/')
-    id_match = [int(i) for i in id_match]
-
-    # print(id_match)    
-    assert len(id_match) == 2 and id_match[1] - id_match[0] >= 0
-    if id_match[1] - id_match[0] == 0:
-        print('\tPerfect match using blastn')
-        return []
-    
-    
-    start = -1
-    end = -1
-    for i in range(len(all_lines)): # Get rid of headers and footers
-        if all_lines[i].startswith('Score ='):
-            assert start == -1
-            start = i
-
-        if all_lines[i].startswith('Lambda'):
-            assert start != -1 and end == -1
-            end = i
-            break
-
-    all_lines = all_lines[start + 3 : end]
-    # print('\n'.join(all_lines))
-
-    blastn_var_list = []
-    for i in range(0,len(all_lines),2):
-        qry_seq = '\t'.join(all_lines[i].split())
-        qry_seq_pos = int(qry_seq.split('\t')[1])
-        sbj_seq = '\t'.join(all_lines[i + 1].split())
-        qry_seq = qry_seq.split('\t')[2].replace('-','.').upper()
-        sbj_seq = sbj_seq.split('\t')[2].replace('-','.').upper()
-        #print(qry_seq)
-        #print(sbj_seq)
-
-        temp_var_list = msfToVarList(qry_seq, sbj_seq)
-        #print(str(qry_seq_pos) + '\t' + str(temp_var_list) +  '\t' + str(adjust_blast_vars(temp_var_list,qry_seq_pos)))
-        temp_var_list = adjust_blast_vars(temp_var_list,qry_seq_pos)
-        blastn_var_list = blastn_var_list + temp_var_list
-        
-    return blastn_var_list
-
-# extract_var_from_blast('cyp_blast_alignment/cyp2d6_blast.align')
-
-extract_cyp_data()
diff --git a/hisatgenotype_scripts/hisatgenotype_locus_samples.py b/hisatgenotype_scripts/hisatgenotype_locus_samples.py
deleted file mode 100755
index 3de636a0..00000000
--- a/hisatgenotype_scripts/hisatgenotype_locus_samples.py
+++ /dev/null
@@ -1,354 +0,0 @@
-#!/usr/bin/env python
-
-#
-# Copyright 2017, Daehwan Kim <infphilo@gmail.com>
-#
-# This file is part of HISAT-genotype.
-#
-# HISAT-genotype is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# HISAT-genotype is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with HISAT-genotype.  If not, see <http://www.gnu.org/licenses/>.
-#
-
-
-import sys, os, subprocess, re, threading
-import inspect
-import random
-import glob
-from argparse import ArgumentParser, FileType
-import hisatgenotype_typing_common as typing_common
-
-
-# Platinum genomes - CEPH pedigree (17 family members)
-CEPH_pedigree = {
-    "NA12889" : {"gender" : "M", "spouse" : "NA12890", "children" : ["NA12877"]},
-    "NA12890" : {"gender" : "F", "spouse" : "NA12889", "children" : ["NA12877"]},
-    "NA12877" : {"gender" : "M", "father" : "NA12889", "mother" : "NA12890", "spouse" : "NA12878", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]},
-
-    "NA12891" : {"gender" : "M", "spouse" : "NA12892", "children" : ["NA12878"]},
-    "NA12892" : {"gender" : "F", "spouse" : "NA12891", "children" : ["NA12878"]},
-    "NA12878" : {"gender" : "F", "father" : "NA12892", "mother" : "NA12891", "spouse" : "NA12877", "children" : ["NA12879", "NA12880", "NA12881", "NA12882", "NA12883", "NA12884", "NA12885", "NA12886", "NA12887", "NA12888", "NA12893"]},
-
-    "NA12879" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12880" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12881" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12882" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12883" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12884" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12885" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12886" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12887" : {"gender" : "F", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12888" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    "NA12893" : {"gender" : "M", "father" : "NA12877", "mother" : "NA12878"},
-    }
-
-
-
-"""
-"""
-class myThread(threading.Thread):
-    def __init__(self,
-                 lock, 
-                 paths,
-                 reference_type,
-                 region_list,
-                 num_editdist,
-                 max_sample,
-                 assembly,
-                 out_dir,
-                 genotype_results,
-                 verbose):
-        threading.Thread.__init__(self)
-        self.lock = lock
-        self.paths = paths
-        self.reference_type = reference_type
-        self.region_list = region_list
-        self.num_editdist = num_editdist
-        self.max_sample = max_sample
-        self.assembly = assembly
-        self.out_dir = out_dir
-        self.genotype_results = genotype_results
-        self.verbose = verbose
-
-    def run(self):
-        global work_idx
-        while True:
-            self.lock.acquire()
-            my_work_idx = work_idx
-            work_idx += 1
-            self.lock.release()
-            if my_work_idx >= len(self.paths) or \
-               my_work_idx >= self.max_sample:
-                return
-            worker(self.lock,
-                   self.paths[my_work_idx],
-                   self.reference_type,
-                   self.region_list,
-                   self.num_editdist,
-                   self.assembly,
-                   self.out_dir,
-                   self.genotype_results,
-                   self.verbose)
-
-            
-"""
-"""
-work_idx = 0
-def worker(lock,
-           path,
-           reference_type,
-           region_list,
-           num_editdist,
-           assembly,
-           out_dir,
-           genotype_results,
-           verbose):
-    fq_name = path.split('/')[-1]
-    read_dir = '/'.join(path.split('/')[:-1])
-    genome = fq_name.split('.')[0]
-    if not fq_name.endswith("extracted.1.fq.gz"):
-        return
-    read_basename = fq_name[:fq_name.find("extracted.1.fq.gz")]
-    read_fname_1, read_fname_2 = "%s/%sextracted.1.fq.gz" % \
-                                 (read_dir, read_basename), "%s/%sextracted.2.fq.gz" % (read_dir, read_basename)
-
-    if not os.path.exists(read_fname_1) or not os.path.exists(read_fname_2):
-        return
-    lock.acquire()
-    print >> sys.stderr, genome
-    lock.release()
-
-    for family, loci in region_list.items():
-        test_hla_cmd = ["hisatgenotype_locus.py",
-                        "--base", family]
-        if len(loci) > 0:
-            test_hla_cmd += ["--locus", ','.join(loci)]
-        test_hla_cmd += ["--num-editdist", str(num_editdist)]
-        test_hla_cmd += ["-1", read_fname_1, "-2", read_fname_2]
-        if assembly:
-            test_hla_cmd += ["--assembly"]
-            test_hla_cmd += ["--assembly-base"]
-            if out_dir != "":
-                test_hla_cmd += ["%s/%s" % (out_dir, genome)]
-            else:
-                test_hla_cmd += [genome]        
-
-        if verbose:
-            lock.acquire()
-            print >> sys.stderr, ' '.join(test_hla_cmd)
-            lock.release()
-
-        proc = subprocess.Popen(test_hla_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-        test_alleles = set()
-        output_list = []
-        for line in proc.stdout:
-            line = line.strip()
-            if line.find("abundance") == -1:
-                continue
-
-            rank, _, allele, _, abundance = line.split()        
-            output_list.append([allele, float(abundance[:-2])])
-
-    lock.acquire()
-    for allele, abundance in output_list:
-        print >> sys.stdout, "%s\t%s\t%.2f" % (genome, allele, abundance)
-        genotype_results.append([genome, allele, abundance])
-    sys.stdout.flush()
-    lock.release()
-
-
-"""
-"""
-def genotyping(read_dir,
-               reference_type,
-               region_list,
-               num_editdist,
-               nthreads,
-               max_sample,
-               assembly,
-               out_dir,
-               verbose,
-               platinum_check):
-    for database_name in region_list:
-        # Extract variants, backbone sequence, and other sequeces
-        typing_common.extract_database_if_not_exists(database_name,
-                                                     [])            # locus_list
-        # Build HISAT2's graph index
-        typing_common.build_index_if_not_exists(database_name,
-                                                "hisat2",
-                                                "graph",
-                                                1,            # threads
-                                                verbose)
-    
-    if not os.path.exists(read_dir):
-        print >> sys.stderr, "Error: %s does not exist." % read_dir
-        sys.exit(1)
-
-    if out_dir != "" and not os.path.exists(out_dir):
-        os.mkdir(out_dir)        
-
-    # fastq files
-    fq_fnames = glob.glob("%s/*.extracted.1.fq.gz" % read_dir)
-
-    genotype_results = []
-
-    lock = threading.Lock()
-    threads = []
-    for t in range(nthreads):
-        thread = myThread(lock,
-                          fq_fnames,
-                          reference_type,
-                          region_list,
-                          num_editdist,
-                          max_sample,
-                          assembly,
-                          out_dir,
-                          genotype_results,
-                          verbose)
-        thread.start()
-        threads.append(thread)
-
-    for thread in threads:
-        thread.join()
-
-
-    if platinum_check:
-        genotype_dic = {}
-        for genome, allele, abundance in genotype_results:
-            region, _ = allele.split('*')
-            if region not in genotype_dic:
-                genotype_dic[region] = {}
-            if genome not in genotype_dic[region]:
-                genotype_dic[region][genome] = []
-            if len(genotype_dic[region][genome]) >= 2:
-                continue
-            # DK - debugging purposes
-            # if abundance < 0.15 * 100:
-            #    continue
-            genotype_dic[region][genome].append([allele, abundance])
-
-        for region, region_genotype in genotype_dic.items():
-            print >> sys.stderr, region
-            included, total = 0, 0
-            for genome, genome_alleles in region_genotype.items():
-                genome_alleles = set([allele for allele, _ in genome_alleles])
-                if "father" in CEPH_pedigree[genome]:
-                    assert "mother" in CEPH_pedigree[genome]
-                    parents = [CEPH_pedigree[genome]["father"], CEPH_pedigree[genome]["mother"]]
-                else:
-                    parents = []
-                parent_allele_sets = []
-                assert len(parents) in [0, 2]
-                if len(parents) == 2 and \
-                   parents[0] in region_genotype and \
-                   parents[1] in region_genotype:
-                    for parent_allele, _ in region_genotype[parents[0]]:
-                        for parent_allele2, _ in region_genotype[parents[1]]:
-                            parent_allele_sets.append(set([parent_allele, parent_allele2]))
-                print >> sys.stderr, "\t", genome, genome_alleles, parent_allele_sets
-                if len(parent_allele_sets) > 0:
-                    total += 1
-                    if genome_alleles in parent_allele_sets:
-                        included += 1
-            print >> sys.stderr, "\t%d / %d" % (included, total)
-
-
-"""
-"""
-if __name__ == '__main__':
-    parser = ArgumentParser(
-        description='genotyping on many samples')
-    parser.add_argument("--reference-type",
-                        dest="reference_type",
-                        type=str,
-                        default="gene",
-                        help="Reference type: gene, chromosome, and genome (default: gene)")
-    parser.add_argument("--region-list",
-                        dest="region_list",
-                        type=str,
-                        default="",
-                        help="A comma-separated list of regions (default: empty)")
-    parser.add_argument('--read-dir',
-                        dest="read_dir",
-                        type=str,
-                        default="",
-                        help='read directory (e.g. read_input)')
-    parser.add_argument("--num-editdist",
-                        dest="num_editdist",
-                        type=int,
-                        default=2,
-                        help="Maximum number of mismatches per read alignment to be considered (default: 2)")
-    parser.add_argument("-p", "--threads",
-                        dest="threads",
-                        type=int,
-                        default=1,
-                        help="Number of threads")
-    parser.add_argument('--assembly',
-                        dest='assembly',
-                        action='store_true',
-                        help='Perform assembly')
-    parser.add_argument("--max-sample",
-                        dest="max_sample",
-                        type=int,
-                        default=sys.maxint,
-                        help="Number of samples to be analyzed (default: sys.maxint)")
-    parser.add_argument("--out-dir",
-                        dest="out_dir",
-                        type=str,
-                        default="",
-                        help='Output directory (default: (empty))')
-    parser.add_argument('-v', '--verbose',
-                        dest='verbose',
-                        action='store_true',
-                        help='also print some statistics to stderr')
-    parser.add_argument('--platinum-check',
-                        dest='platinum_check',
-                        action='store_true',
-                        help='Check for concordance of platinum genomes')
-
-    args = parser.parse_args()
-
-    if args.read_dir == "":
-        print >> sys.stderr, "Error: please specify --read-dir."
-        sys.exit(1)
-
-    if not args.reference_type in ["gene", "chromosome", "genome"]:
-        print >> sys.stderr, "Error: --reference-type (%s) must be one of gene, chromosome, and genome." % (args.reference_type)
-        sys.exit(1)
-
-    region_list = {}
-    if args.region_list != "":
-        for region in args.region_list.split(','):
-            region = region.split('.')
-            if len(region) < 1 or len(region) > 2:
-                print >> sys.stderr, "Error: --region-list is incorrectly formatted."
-                sys.exit(1)
-                
-            family = region[0].lower()
-            if len(region) == 2:
-                locus_name = region[1].upper()
-            if family not in region_list:
-                region_list[family] = set()
-            if len(region) == 2:
-                region_list[family].add(locus_name)
-
-    genotyping(args.read_dir,
-               args.reference_type,
-               region_list,
-               args.num_editdist,
-               args.threads,
-               args.max_sample,
-               args.assembly,
-               args.out_dir,
-               args.verbose,
-               args.platinum_check)
-
diff --git a/hisatgenotype_scripts/run_extract_CP.sh b/hisatgenotype_scripts/run_extract_CP.sh
deleted file mode 100755
index ceca077e..00000000
--- a/hisatgenotype_scripts/run_extract_CP.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash -l
-#SBATCH --job-name=infphio.HLA.CP.extract.genome
-#SBATCH --nodes=1
-#SBATCH --cpus-per-task=40
-#SBATCH --mem=400G
-#SBATCH --partition=lrgmem
-#SBATCH --time=166:0:0 
-#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel
-
-/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/scripts/extract_reads.py --base-fname genotype_genome --reference-type genome --read-dir /home-1/dkim136@jhu.edu/aszalay1/genomes --out-dir CP_80 -p 40 --max-sample 80 --job-range 0,2
-
diff --git a/hisatgenotype_scripts/run_extract_ILMN.sh b/hisatgenotype_scripts/run_extract_ILMN.sh
deleted file mode 100755
index 3aaf0cbb..00000000
--- a/hisatgenotype_scripts/run_extract_ILMN.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash -l
-#SBATCH --job-name=infphio.HLA.ILMN.extract.genome
-#SBATCH --nodes=1
-#SBATCH --cpus-per-task=17
-#SBATCH --mem=120G
-#SBATCH --partition=shared
-#SBATCH --time=166:0:0 
-#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel
-
-/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/scripts/extract_reads.py --base-fname genotype_genome --reference-type genome --read-dir /home-1/dkim136@jhu.edu/ssalzbe1/users/infphilo/platinum_genomes --out-dir ILMN -p 17
-
diff --git a/hisatgenotype_scripts/run_genotype_build.sh b/hisatgenotype_scripts/run_genotype_build.sh
deleted file mode 100755
index ac2a3363..00000000
--- a/hisatgenotype_scripts/run_genotype_build.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash -l
-#SBATCH --job-name=infphio.genotype
-#SBATCH --nodes=1
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=400G
-#SBATCH --partition=lrgmem
-#SBATCH --time=168:0:0 
-#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel
-
-/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_novel/hisatgenotype_build_genome.py -p 4 --verbose --commonvar genome.fa genotype_genome
diff --git a/hisatgenotype_scripts/run_hisat2_build.sh b/hisatgenotype_scripts/run_hisat2_build.sh
deleted file mode 100755
index 15d25611..00000000
--- a/hisatgenotype_scripts/run_hisat2_build.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash -l
-#SBATCH --job-name=infphio.genotype.hisat2-build
-#SBATCH --nodes=1
-#SBATCH --cpus-per-task=4
-#SBATCH --mem=400G
-#SBATCH --partition=lrgmem
-#SBATCH --time=168:0:0 
-#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/hisat2/evaluation/tests/genotype
-
-/home-1/dkim136@jhu.edu/infphilo/hisat2/hisat2/hisat2-build -p 4 --snp genotype_genome.snp --haplotype genotype_genome.haplotype genotype_genome.fa genotype_genome
diff --git a/hisatgenotype_scripts/run_type_CP.sh b/hisatgenotype_scripts/run_type_CP.sh
deleted file mode 100755
index 4fd54ffd..00000000
--- a/hisatgenotype_scripts/run_type_CP.sh
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/bash -l
-#SBATCH --job-name=infphio.HLA.CP
-#SBATCH --nodes=1
-#SBATCH --cpus-per-task=24
-#SBATCH --mem=64G
-#SBATCH --partition=shared
-#SBATCH --time=12:0:0 
-#SBATCH --workdir=/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_CP_extract_genome_partial
-
-/home-1/dkim136@jhu.edu/infphilo/hisat2/evaluation/tests/HLA_CP_extract_genome_partial/hisat2_test_HLA_genotyping_CP.py CP --num-editdist 2 -p 24 > cp_hla.txt