Skip to content

Commit

Permalink
Allowing user input for UMIs and context (nanoporetech#8)
Browse files Browse the repository at this point in the history
Added option to specify custom UMIs, for both strands
--fwd-umi
--rev-umi
  • Loading branch information
SemiQuant authored Mar 10, 2021
1 parent 4fdbcfc commit 6feca40
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 18 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ These parameters have to be specified to run the pipeline.
| sample_name | String | Name of the output folder |
| input_fastq | Absolute file path | FASTQ file or folder containing FASTQ files |
| reference_fasta | Absolute file path | FASTA file containing the reference genome |
| target_bed | Absolute file path | BED file containing amplicon coordinates and names |
| targets_bed | Absolute file path | BED file containing amplicon coordinates and names |

##### Optional parameters

Expand Down
24 changes: 17 additions & 7 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

def read_bed_names(filename):
names = []
with open(filename) as fh:
Expand Down Expand Up @@ -40,15 +39,19 @@ max_reads_per_cluster = config.get("max_reads_per_cluster", 60)
min_overlap = config.get("min_overlap", 0.9)
balance_strands = config.get("balance_strands", True)
mm = config.get("medaka_model", "r941_min_high_g360")
fwd_context = config.get("fwd_context", "GTATCGTGTAGAGACTGCGTAGG")
rev_context = config.get("rev_context", "AGTGATCGAGTCAGTGCGAGTG")
fwd_umi = config.get("fwd_umi", "TTTVVVVTTVVVVTTVVVVTTVVVVTTT")
rev_umi = config.get("rev_umi", "AAABBBBAABBBBAABBBBAABBBBAAA")
min_length = config.get("min_length", 40)
max_length = config.get("max_length", 60)

########################
########################
########################

target = read_bed_names(target_bed)

min_length = 40
max_length = 60

balance_strands_param = "--balance_strands"
if not balance_strands:
balance_strands_param = ""
Expand Down Expand Up @@ -144,9 +147,13 @@ rule detect_umi_fasta:
"{name}/fasta_umi/{target}_detected_umis.fasta"
params:
errors = allowed_umi_errors,
fwd_context = fwd_context,
rev_context = rev_context,
fwd_umi = fwd_umi,
rev_umi = rev_umi,
shell:
"""
umi_extract --max-error {params.errors} {input}/{wildcards.target}.fastq -o {output} --tsv {output}.tsv
umi_extract --fwd-context {params.fwd_context} --rev-context {params.rev_context} --fwd-umi {params.fwd_umi} --rev-umi {params.rev_umi} --max-error {params.errors} {input}/{wildcards.target}.fastq -o {output} --tsv {output}.tsv
"""

rule detect_umi_consensus_fasta:
Expand All @@ -156,9 +163,13 @@ rule detect_umi_consensus_fasta:
"{name}/fasta_umi/{target}_detected_umis_final.fasta"
params:
errors = allowed_umi_errors,
fwd_context = fwd_context,
rev_context = rev_context,
fwd_umi = fwd_umi,
rev_umi = rev_umi,
shell:
"""
umi_extract --max-error {params.errors} {input} -o {output} --tsv {output}.tsv
umi_extract --fwd-context {params.fwd_context} --rev-context {params.rev_context} --fwd-umi {params.fwd_umi} --rev-umi {params.rev_umi} --max-error {params.errors} {input} -o {output} --tsv {output}.tsv
"""

rule cluster:
Expand Down Expand Up @@ -261,4 +272,3 @@ rule seqkit_bam_acc_tsv:
"""
echo -e "Read\tCluster_size\tRef\tMapQual\tAcc\tReadLen\tRefLen\tRefAln\tRefCov\tReadAln\tReadCov\tStrand\tMeanQual\tLeftClip\tRightClip\tFlags\tIsSec\tIsSup" > {output} && seqkit bam {input} 2>&1 | sed 's/_/\t/' | tail -n +2 >> {output}
"""

16 changes: 16 additions & 0 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,21 @@ balance_strands: True
# Medaka model used to compute consensus reads
medaka_model: "r941_min_high_g360"

# Forward tail of primer (Ftail...UMI...primer)
fwd_context: "GTATCGTGTAGAGACTGCGTAGG"

# Reverse tail of primer (Rtail...UMI...primer)
rev_context: "AGTGATCGAGTCAGTGCGAGTG"

# Forward UMI (Ftail...UMI...primer)
fwd_umi: "TTTVVVVTTVVVVTTVVVVTTVVVVTTT"

# Reverse UMI (Rtail...UMI...primer)
rev_umi: "AAABBBBAABBBBAABBBBAABBBBAAA"

# Minimum combined UMI length
min_length: 40

# Maximum combined UMI length
max_length: 60

40 changes: 30 additions & 10 deletions lib/umi_amplicon_tools/extract_umis.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,18 +88,40 @@ def parse_args(argv):
default="AGTGATCGAGTCAGTGCGAGTG",
help="Reverse upstream sequence",
)
parser.add_argument(
"--fwd-umi",
dest="FWD_UMI",
type=str,
default="TTTVVVVTTVVVVTTVVVVTTVVVVTTT",
help="Forward UMI sequence",
)
parser.add_argument(
"--rev-umi",
dest="REV_UMI",
type=str,
default="AAABBBBAABBBBAABBBBAABBBBAAA",
help="Reverse UMI sequence",
)
parser.add_argument(
"INPUT_FA", type=str, nargs="+", default="/dev/stdin", help="Detected UMIs"
)

args = parser.parse_args(argv)

return args


def align(query, pattern_info, max_ed, normalise=False):
pattern, wildcard, equalities, forward = pattern_info

pattern, forward = pattern_info

# move this somewhere not in a loop
seq = pattern
for c in 'actgACTG':
seq = seq.replace(c, "")
wildcard = set(''.join(seq))

equalities=[("M", "A"), ("M", "C"), ("R", "A"), ("R", "A"), ("W", "A"), ("W", "A"), ("S", "C"), ("S", "C"), ("Y", "C"), ("Y", "C"), ("K", "G"), ("K", "G"), ("V", "A"), ("V", "C"), ("V", "G"), ("H", "A"), ("H", "C"), ("H", "T"), ("D", "A"), ("D", "G"), ("D", "T"), ("B", "C"), ("B", "G"), ("B", "T"), ("N", "G"), ("N", "A"), ("N", "T"), ("N", "C")]

result = edlib.align(
pattern,
query,
Expand All @@ -121,7 +143,7 @@ def align(query, pattern_info, max_ed, normalise=False):
umi = ""
align = edlib.getNiceAlignment(result, pattern, query)
for q, t in zip(align["query_aligned"], align["target_aligned"]):
if q != wildcard:
if q not in wildcard:
continue
if t == "-":
umi += "N"
Expand Down Expand Up @@ -304,17 +326,15 @@ def main(argv=sys.argv[1:]):
output_file = args.OUT
tsv_file = args.TSV
input_files = args.INPUT_FA
umi_fwd = args.FWD_UMI
umi_rev = args.REV_UMI

pattern_fwd = (
"TTTVVVVTTVVVVTTVVVVTTVVVVTTT",
"V",
[("V", "A"), ("V", "G"), ("V", "C")],
umi_fwd,
True,
)
pattern_rev = (
"AAABBBBAABBBBAABBBBAABBBBAAA",
"B",
[("B", "T"), ("B", "G"), ("B", "C")],
umi_rev,
False,
)

Expand Down

0 comments on commit 6feca40

Please sign in to comment.