Merge pull request #44 from nanoporetech/dev

Add primers/models for developer release
nanoporetech · Oct 22, 2020 · c6395e7 · c6395e7
2 parents 3f4970b + 8a8eb4d
commit c6395e7
Show file tree

Hide file tree

Showing 11 changed files with 1,063 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -58,11 +58,13 @@ Issue `make help` to get a list of `make` targets.
 
 ```
 usage: cdna_classifier.py [-h] [-b primers] [-g phmm_file] [-c config_file]
-                          [-q cutoff] [-r report_pdf] [-u unclass_output]
-                          [-w rescue_output] [-S stats_output]
+                          [-k kit] [-q cutoff] [-Q min_qual] [-z min_len]
+                          [-r report_pdf] [-u unclass_output]
+                          [-l len_fail_output] [-w rescue_output]
+                          [-S stats_output] [-K qc_fail_output]
                           [-Y autotune_nr] [-L autotune_samples]
                           [-A scores_output] [-m method] [-x rescue] [-p]
-                          [-t threads] [-B batch_size]
+                          [-t threads] [-B batch_size] [-D read stats]
                           input_fastx output_fastx
 
 Tool to identify, orient and rescue full-length cDNA reads.
@@ -77,11 +79,16 @@ optional arguments:
   -g phmm_file         File with custom profile HMMs (None).
   -c config_file       File to specify primer configurations for each
                        direction (None).
+  -k kit               Use primer sequences from this kit (PCS109).
   -q cutoff            Cutoff parameter (autotuned).
+  -Q min_qual          Minimum mean base quality (7.0).
+  -z min_len           Minimum segment length (50).
   -r report_pdf        Report PDF (cdna_classifier_report.pdf).
   -u unclass_output    Write unclassified reads to this file.
+  -l len_fail_output   Write fragments failing the length filter in this file.
   -w rescue_output     Write rescued reads to this file.
   -S stats_output      Write statistics to this file.
+  -K qc_fail_output    Write reads failing mean quality filter to this file.
   -Y autotune_nr       Approximate number of reads used for tuning the cutoff
                        parameter (10000).
   -L autotune_samples  Number of samples taken when tuning cutoff parameter

diff --git a/pychopper/__init__.py b/pychopper/__init__.py
@@ -2,4 +2,4 @@
 
 __author__ = 'ONT Applications Group'
 __email__ = '[email protected]'
-__version__ = '2.4.0'
+__version__ = '2.5.0'
diff --git a/pychopper/phmm_data/PCS110_primers.hmm b/pychopper/phmm_data/PCS110_primers.hmm
diff --git a/pychopper/phmm_data/PCS110_primers.hmm.h3f b/pychopper/phmm_data/PCS110_primers.hmm.h3f
diff --git a/pychopper/phmm_data/PCS110_primers.hmm.h3i b/pychopper/phmm_data/PCS110_primers.hmm.h3i
diff --git a/pychopper/phmm_data/PCS110_primers.hmm.h3m b/pychopper/phmm_data/PCS110_primers.hmm.h3m
diff --git a/pychopper/phmm_data/PCS110_primers.hmm.h3p b/pychopper/phmm_data/PCS110_primers.hmm.h3p
diff --git a/pychopper/primer_data/PCS110_primers.fas b/pychopper/primer_data/PCS110_primers.fas
@@ -0,0 +1,4 @@
+>VNP
+ACTTGCCTGTCGCTCTATCTTCGAGGAGAGTCCGCCGCCCGCAAGTTT
+>SSP
+TTTCTGTTGGTGCTGATATTGCTTT
diff --git a/scripts/cdna_classifier.py b/scripts/cdna_classifier.py
@@ -28,6 +28,8 @@
     '-g', metavar='phmm_file', type=str, default=None, help="File with custom profile HMMs (None).", required=False)
 parser.add_argument(
     '-c', metavar='config_file', type=str, default=None, help="File to specify primer configurations for each direction (None).")
+parser.add_argument(
+    '-k', metavar='kit', type=str, default="PCS109", help="Use primer sequences from this kit (PCS109).")
 parser.add_argument(
     '-q', metavar='cutoff', type=float, default=None, help="Cutoff parameter (autotuned).")
 parser.add_argument(
@@ -174,7 +176,7 @@ def _detect_anomalies(st, config):
     anom = []
     for tmp in raw_anom:
         pc = tmp[1] * 100.0 / total
-        if pc >= 1:
+        if pc >= 3.0:
             anom.append((tmp[0], tmp[1], pc))
     if len(anom) > 0:
         sys.stderr.write("Detected {} potential artefactual primer configurations:\n".format(len(anom)))
@@ -219,14 +221,23 @@ def _plot_pd_line(df, title, report, alpha=0.7, xrot=0, vline=None):
 def _plot_stats(st, pdf):
     "Generate plots and save to report PDF"
     R = report.Report(pdf)
-    _plot_pd_bars(st.loc[st.Category == "Classification", ].copy(), "Classification of output reads", R, ann=True)
+    rs = st.loc[st.Category == "Classification", ]
+    _plot_pd_bars(rs.copy(), "Classification of output reads", R, ann=True)
+    found, rescue, unusable = float(rs.loc[rs.Name == "Primers_found", ].Value), float(rs.loc[rs.Name == "Rescue", ].Value), float(rs.loc[rs.Name == "Unusable", ].Value)
+    total = found + rescue + unusable
+    found = found / total * 100
+    rescue = rescue / total * 100
+    unusable = unusable / total * 100
+    sys.stderr.write("-----------------------------------\n")
+    sys.stderr.write("Reads with two primers:\t{:.2f}%\nRescued reads:\t\t{:.2f}%\nUnusable reads:\t\t{:.2f}%\n".format(found, rescue, unusable))
+    sys.stderr.write("-----------------------------------\n")
     _plot_pd_bars(st.loc[st.Category == "Strand", ].copy(), "Strand of oriented reads", R, ann=True)
     _plot_pd_bars(st.loc[st.Category == "RescueStrand", ].copy(), "Strand of rescued reads", R, ann=True)
     _plot_pd_bars(st.loc[st.Category == "UnclassHitNr", ].copy(), "Number of hits in unclassified reads", R)
     _plot_pd_bars(st.loc[st.Category == "RescueHitNr", ].copy(), "Number of hits in rescued reads", R)
     _plot_pd_bars(st.loc[st.Category == "RescueSegmentNr", ].copy(), "Number of usable segments per rescued read", R)
-    if args.q is None:
-        _plot_pd_line(st.loc[st.Category == "AutotuneSample", ].copy(), "Usable bases as function of cutoff(q). Best q={:.4f}".format(args.q), R, vline=args.q)
+    if q_bak is None:
+        _plot_pd_line(st.loc[st.Category == "AutotuneSample", ].copy(), "Usable bases as function of cutoff(q). Best q={:.4g}".format(args.q), R, vline=args.q)
     udf = st.loc[st.Category == "Unusable", ].copy()
     udf.Name = np.log10(1.0 + np.array(udf.Name, dtype=float))
     _plot_pd_line(udf, "Log10 length distribution of trimmed away sequences.", R)
@@ -244,17 +255,21 @@ def _plot_stats(st, pdf):
     if args.c is not None:
         CONFIG = open(args.c, "r").readline().strip()
 
+    kits = {"PCS109": {"HMM": os.path.join(os.path.dirname(phmm_data.__file__), "cDNA_SSP_VNP.hmm"), "FAS": os.path.join(os.path.dirname(primer_data.__file__), "cDNA_SSP_VNP.fas"), }, "PCS110": {
+                                           "HMM": os.path.join(os.path.dirname(phmm_data.__file__), "PCS110_primers.hmm"), "FAS": os.path.join(os.path.dirname(primer_data.__file__), "PCS110_primers.fas")}}
+
     if args.g is None:
-        args.g = os.path.join(os.path.dirname(phmm_data.__file__), "cDNA_SSP_VNP.hmm")
+        args.g = kits[args.k]["HMM"]
 
     if args.b is None:
-        args.b = os.path.join(os.path.dirname(primer_data.__file__), "cDNA_SSP_VNP.fas")
+        args.b = kits[args.k]["FAS"]
 
     if args.x is not None and args.x in ('DCS109'):
         if args.x == "DCS109":
             CONFIG = "-:VNP,-VNP"
 
     config = utils.parse_config_string(CONFIG)
+    sys.stderr.write("Using kit: {}\n".format(args.k))
     sys.stderr.write("Configurations to consider: \"{}\"\n".format(CONFIG))
 
     in_fh = sys.stdin
@@ -306,9 +321,10 @@ def backend(x, pool, q=None, mb=None):
     else:
         raise Exception("Invalid backend!")
 
-    # Pick the -q maximizing the number of classified reads using frid search:
+    # Pick the -q maximizing the number of classified reads using grid search:
     nr_records = None
     tune_df = None
+    q_bak = args.q
     if args.q is None:
         nr_cutoffs = args.L
         cutoffs = np.linspace(0.0, 1.0, num=nr_cutoffs)
@@ -356,7 +372,7 @@ def backend(x, pool, q=None, mb=None):
         tune_df["Value"] += [args.q]
         if best_qi == (len(class_reads) - 1):
             sys.stderr.write("Best cuttoff value is at the edge of the search interval! Using tuned value is not safe! Please pick a q value manually and QC your data!\n")
-        sys.stderr.write("Best cutoff (q) value is {:.4f} with {:.0f}% of the reads classified.\n".format(args.q, class_reads[best_qi] * 100 / len(read_sample)))
+        sys.stderr.write("Best cutoff (q) value is {:.4g} with {:.0f}% of the reads classified.\n".format(args.q, class_reads[best_qi] * 100 / len(read_sample)))
 
     if nr_records is not None:
         input_size = nr_records

diff --git a/setup.cfg b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.4.0
+current_version = 2.5.0
 commit = True
 tag = True
 

diff --git a/setup.py b/setup.py
@@ -26,7 +26,7 @@
 
 setup(
     name='pychopper',
-    version='2.4.0',
+    version='2.5.0',
     description="A tool to identify full length cDNA reads.",
     long_description=readme,
     author="ONT Applications Group",