Skip to content
This repository has been archived by the owner on Jul 4, 2022. It is now read-only.

Commit

Permalink
Merge pull request #44 from nanoporetech/dev
Browse files Browse the repository at this point in the history
Add primers/models for developer release
  • Loading branch information
sagrudd authored Oct 22, 2020
2 parents 3f4970b + 8a8eb4d commit c6395e7
Show file tree
Hide file tree
Showing 11 changed files with 1,063 additions and 14 deletions.
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,13 @@ Issue `make help` to get a list of `make` targets.

```
usage: cdna_classifier.py [-h] [-b primers] [-g phmm_file] [-c config_file]
[-q cutoff] [-r report_pdf] [-u unclass_output]
[-w rescue_output] [-S stats_output]
[-k kit] [-q cutoff] [-Q min_qual] [-z min_len]
[-r report_pdf] [-u unclass_output]
[-l len_fail_output] [-w rescue_output]
[-S stats_output] [-K qc_fail_output]
[-Y autotune_nr] [-L autotune_samples]
[-A scores_output] [-m method] [-x rescue] [-p]
[-t threads] [-B batch_size]
[-t threads] [-B batch_size] [-D read stats]
input_fastx output_fastx
Tool to identify, orient and rescue full-length cDNA reads.
Expand All @@ -77,11 +79,16 @@ optional arguments:
-g phmm_file File with custom profile HMMs (None).
-c config_file File to specify primer configurations for each
direction (None).
-k kit Use primer sequences from this kit (PCS109).
-q cutoff Cutoff parameter (autotuned).
-Q min_qual Minimum mean base quality (7.0).
-z min_len Minimum segment length (50).
-r report_pdf Report PDF (cdna_classifier_report.pdf).
-u unclass_output Write unclassified reads to this file.
-l len_fail_output Write fragments failing the length filter in this file.
-w rescue_output Write rescued reads to this file.
-S stats_output Write statistics to this file.
-K qc_fail_output Write reads failing mean quality filter to this file.
-Y autotune_nr Approximate number of reads used for tuning the cutoff
parameter (10000).
-L autotune_samples Number of samples taken when tuning cutoff parameter
Expand Down
2 changes: 1 addition & 1 deletion pychopper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

__author__ = 'ONT Applications Group'
__email__ = '[email protected]'
__version__ = '2.4.0'
__version__ = '2.5.0'
1,022 changes: 1,022 additions & 0 deletions pychopper/phmm_data/PCS110_primers.hmm

Large diffs are not rendered by default.

Binary file added pychopper/phmm_data/PCS110_primers.hmm.h3f
Binary file not shown.
Binary file added pychopper/phmm_data/PCS110_primers.hmm.h3i
Binary file not shown.
Binary file added pychopper/phmm_data/PCS110_primers.hmm.h3m
Binary file not shown.
Binary file added pychopper/phmm_data/PCS110_primers.hmm.h3p
Binary file not shown.
4 changes: 4 additions & 0 deletions pychopper/primer_data/PCS110_primers.fas
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>VNP
ACTTGCCTGTCGCTCTATCTTCGAGGAGAGTCCGCCGCCCGCAAGTTT
>SSP
TTTCTGTTGGTGCTGATATTGCTTT
32 changes: 24 additions & 8 deletions scripts/cdna_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
'-g', metavar='phmm_file', type=str, default=None, help="File with custom profile HMMs (None).", required=False)
parser.add_argument(
'-c', metavar='config_file', type=str, default=None, help="File to specify primer configurations for each direction (None).")
parser.add_argument(
'-k', metavar='kit', type=str, default="PCS109", help="Use primer sequences from this kit (PCS109).")
parser.add_argument(
'-q', metavar='cutoff', type=float, default=None, help="Cutoff parameter (autotuned).")
parser.add_argument(
Expand Down Expand Up @@ -174,7 +176,7 @@ def _detect_anomalies(st, config):
anom = []
for tmp in raw_anom:
pc = tmp[1] * 100.0 / total
if pc >= 1:
if pc >= 3.0:
anom.append((tmp[0], tmp[1], pc))
if len(anom) > 0:
sys.stderr.write("Detected {} potential artefactual primer configurations:\n".format(len(anom)))
Expand Down Expand Up @@ -219,14 +221,23 @@ def _plot_pd_line(df, title, report, alpha=0.7, xrot=0, vline=None):
def _plot_stats(st, pdf):
"Generate plots and save to report PDF"
R = report.Report(pdf)
_plot_pd_bars(st.loc[st.Category == "Classification", ].copy(), "Classification of output reads", R, ann=True)
rs = st.loc[st.Category == "Classification", ]
_plot_pd_bars(rs.copy(), "Classification of output reads", R, ann=True)
found, rescue, unusable = float(rs.loc[rs.Name == "Primers_found", ].Value), float(rs.loc[rs.Name == "Rescue", ].Value), float(rs.loc[rs.Name == "Unusable", ].Value)
total = found + rescue + unusable
found = found / total * 100
rescue = rescue / total * 100
unusable = unusable / total * 100
sys.stderr.write("-----------------------------------\n")
sys.stderr.write("Reads with two primers:\t{:.2f}%\nRescued reads:\t\t{:.2f}%\nUnusable reads:\t\t{:.2f}%\n".format(found, rescue, unusable))
sys.stderr.write("-----------------------------------\n")
_plot_pd_bars(st.loc[st.Category == "Strand", ].copy(), "Strand of oriented reads", R, ann=True)
_plot_pd_bars(st.loc[st.Category == "RescueStrand", ].copy(), "Strand of rescued reads", R, ann=True)
_plot_pd_bars(st.loc[st.Category == "UnclassHitNr", ].copy(), "Number of hits in unclassified reads", R)
_plot_pd_bars(st.loc[st.Category == "RescueHitNr", ].copy(), "Number of hits in rescued reads", R)
_plot_pd_bars(st.loc[st.Category == "RescueSegmentNr", ].copy(), "Number of usable segments per rescued read", R)
if args.q is None:
_plot_pd_line(st.loc[st.Category == "AutotuneSample", ].copy(), "Usable bases as function of cutoff(q). Best q={:.4f}".format(args.q), R, vline=args.q)
if q_bak is None:
_plot_pd_line(st.loc[st.Category == "AutotuneSample", ].copy(), "Usable bases as function of cutoff(q). Best q={:.4g}".format(args.q), R, vline=args.q)
udf = st.loc[st.Category == "Unusable", ].copy()
udf.Name = np.log10(1.0 + np.array(udf.Name, dtype=float))
_plot_pd_line(udf, "Log10 length distribution of trimmed away sequences.", R)
Expand All @@ -244,17 +255,21 @@ def _plot_stats(st, pdf):
if args.c is not None:
CONFIG = open(args.c, "r").readline().strip()

kits = {"PCS109": {"HMM": os.path.join(os.path.dirname(phmm_data.__file__), "cDNA_SSP_VNP.hmm"), "FAS": os.path.join(os.path.dirname(primer_data.__file__), "cDNA_SSP_VNP.fas"), }, "PCS110": {
"HMM": os.path.join(os.path.dirname(phmm_data.__file__), "PCS110_primers.hmm"), "FAS": os.path.join(os.path.dirname(primer_data.__file__), "PCS110_primers.fas")}}

if args.g is None:
args.g = os.path.join(os.path.dirname(phmm_data.__file__), "cDNA_SSP_VNP.hmm")
args.g = kits[args.k]["HMM"]

if args.b is None:
args.b = os.path.join(os.path.dirname(primer_data.__file__), "cDNA_SSP_VNP.fas")
args.b = kits[args.k]["FAS"]

if args.x is not None and args.x in ('DCS109'):
if args.x == "DCS109":
CONFIG = "-:VNP,-VNP"

config = utils.parse_config_string(CONFIG)
sys.stderr.write("Using kit: {}\n".format(args.k))
sys.stderr.write("Configurations to consider: \"{}\"\n".format(CONFIG))

in_fh = sys.stdin
Expand Down Expand Up @@ -306,9 +321,10 @@ def backend(x, pool, q=None, mb=None):
else:
raise Exception("Invalid backend!")

# Pick the -q maximizing the number of classified reads using frid search:
# Pick the -q maximizing the number of classified reads using grid search:
nr_records = None
tune_df = None
q_bak = args.q
if args.q is None:
nr_cutoffs = args.L
cutoffs = np.linspace(0.0, 1.0, num=nr_cutoffs)
Expand Down Expand Up @@ -356,7 +372,7 @@ def backend(x, pool, q=None, mb=None):
tune_df["Value"] += [args.q]
if best_qi == (len(class_reads) - 1):
sys.stderr.write("Best cuttoff value is at the edge of the search interval! Using tuned value is not safe! Please pick a q value manually and QC your data!\n")
sys.stderr.write("Best cutoff (q) value is {:.4f} with {:.0f}% of the reads classified.\n".format(args.q, class_reads[best_qi] * 100 / len(read_sample)))
sys.stderr.write("Best cutoff (q) value is {:.4g} with {:.0f}% of the reads classified.\n".format(args.q, class_reads[best_qi] * 100 / len(read_sample)))

if nr_records is not None:
input_size = nr_records
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 2.4.0
current_version = 2.5.0
commit = True
tag = True

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

setup(
name='pychopper',
version='2.4.0',
version='2.5.0',
description="A tool to identify full length cDNA reads.",
long_description=readme,
author="ONT Applications Group",
Expand Down

0 comments on commit c6395e7

Please sign in to comment.