diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 47fec2af7..ae06734e0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -39,7 +39,7 @@ jobs: id: cache-miniconda uses: actions/cache@v2 env: - CACHE_NUMBER: 0 + CACHE_NUMBER: 1 with: repo-token: ${{ secrets.GITHUB_TOKEN }} path: ~/conda_pkgs_dir diff --git a/Mikado/_transcripts/scoring_configuration.py b/Mikado/_transcripts/scoring_configuration.py index fc866babe..dc75d6f24 100644 --- a/Mikado/_transcripts/scoring_configuration.py +++ b/Mikado/_transcripts/scoring_configuration.py @@ -23,6 +23,24 @@ def compiler(expression): raise InvalidConfiguration("Invalid expression:\n{}".format(expression)) +class Unique(validate.Validator): + message = "{input} is Not unique" + + def __init__(self, error=None): + self.error = error + + def _repr_args(self): + return "" + + def _format_error(self, value): + return self.message.format(input=value) + + def __call__(self, value): + if len(value) != len(set(value)): + raise validate.ValidationError(self._format_error(value)) + return value + + @dataclass class SizeFilter: operator: str = field(metadata={"required": True, "validate": validate.OneOf(["gt", "ge", "lt", "le"])}) @@ -30,6 +48,7 @@ class SizeFilter: metric: Optional[str] = field(metadata={"required": False}, default=None) name: Optional[str] = field(default=None) source: Optional[str] = field(default=None) + default: Optional[float] = field(default=0) @dataclass @@ -39,42 +58,28 @@ class NumBoolEqualityFilter: metric: Optional[str] = field(metadata={"required": False}, default=None) name: Optional[str] = field(default=None) source: Optional[str] = field(default=None) + default: Optional[Union[float, bool]] = field(default=0) @dataclass class InclusionFilter: - value: list = field(metadata={"required": True}) + value: list = field(metadata={"required": True, "validate": [Unique]}) operator: str = field(metadata={"required": True, "validate": validate.OneOf(["in", "not in"])}) metric: Optional[str] = field(metadata={"required": False}, default=None) name: Optional[str] = field(default=None) source: Optional[str] = field(default=None) + default: Optional[List[float]] = field(default=None) @dataclass class RangeFilter: - class Unique(validate.Validator): - message = "{input} is Not unique" - - def __init__(self, error=None): - self.error = error - - def _repr_args(self): - return "" - - def _format_error(self, value): - return self.message.format(input=value) - - def __call__(self, value): - if len(value) != len(set(value)): - raise validate.ValidationError(self._format_error(value)) - return value - value: List[float] = field(metadata={ "required": True, "validate": [validate.Length(min=2, max=2), Unique]}) operator: str = field(metadata={"required": True, "validate": validate.OneOf(["within", "not within"])}) metric: Optional[str] = field(metadata={"required": False}, default=None) name: Optional[str] = field(default=None) + default: Optional[float] = field(default=0) @dataclass diff --git a/Mikado/_transcripts/transcript_base.py b/Mikado/_transcripts/transcript_base.py index bed1cfcfe..2a5616218 100644 --- a/Mikado/_transcripts/transcript_base.py +++ b/Mikado/_transcripts/transcript_base.py @@ -213,7 +213,6 @@ def __init__(self, *args, self.__derived_children = set() self.__original_source = None self.__external_scores = Namespace(default=(0, False)) - self.__internal_orf_transcripts = [] self.__cds_not_maximal = None # Starting settings for everything else @@ -230,7 +229,6 @@ def __init__(self, *args, self.splices = set() self.selected_internal_orf_index = None self.__verified_introns = set() - self.segments = [] self.intron_range = intron_range self.internal_orfs = [] self.blast_hits = [] @@ -296,6 +294,30 @@ def __initialize_with_line(self, transcript_row): else: raise TypeError("Invalid data type: {0}".format(type(transcript_row))) + @staticmethod + def __parse_attributes(attributes): + new_attributes = dict() + booleans = {"True": True, "False": False, "None": None} + for key, val in attributes.items(): + if not isinstance(val, Hashable): + pass + elif val in booleans: + val = booleans[val] + elif isinstance(val, bool): + pass + else: + try: + val = int(val) + except (ValueError, OverflowError): + try: + val = float(val) + except ValueError: + pass + except TypeError: + pass + new_attributes[intern(key)] = val + return new_attributes + def __initialize_with_bed12(self, transcript_row: BED12): """ @@ -333,6 +355,8 @@ def __initialize_with_bed12(self, transcript_row: BED12): cds.append((int(max(exon[0], transcript_row.thick_start)), int(min(exon[1], transcript_row.thick_end)))) self.add_exons(cds, features="CDS") + + self.attributes = self.__parse_attributes(transcript_row.attributes) self.finalize() def __initialize_with_bam(self, transcript_row: pysam.AlignedSegment): @@ -423,26 +447,7 @@ def __initialize_with_gf(self, transcript_row: (GffLine, GtfLine)): self.score = transcript_row.score self.scores = dict() - booleans = {"True": True, "False": False, "None": None} - - for key, val in transcript_row.attributes.items(): - if not isinstance(val, Hashable): - pass - elif val in booleans: - val = booleans[val] - elif isinstance(val, bool): - pass - else: - try: - val = int(val) - except (ValueError, OverflowError): - try: - val = float(val) - except ValueError: - pass - except TypeError: - pass - self.attributes[intern(key)] = val + self.attributes = self.__parse_attributes(transcript_row.attributes) self.blast_hits = [] if transcript_row.is_transcript is False: @@ -794,8 +799,10 @@ def get_internal_orf_beds(self) -> List[BED12]: row.block_sizes = [0] row = BED12(row, seq, coding=False, transcriptomic=True, max_regression=0, start_adjustment=False, + lenient=True, table=self.codon_table) - assert row.invalid is False, ("\n".join([str(row), row.invalid_reason])) + if row.invalid is True: + raise AssertionError("\n".join([str(row), row.invalid_reason])) yield row else: for index, iorf in enumerate(self.internal_orfs): @@ -832,8 +839,10 @@ def get_internal_orf_beds(self) -> List[BED12]: new_row.phase = phase # self.logger.debug(new_row) new_row = BED12(new_row, + lenient=False, sequence=seq, phase=phase, + logger=self.logger, coding=True, transcriptomic=True, max_regression=0, start_adjustment=False, table=self.codon_table) if (cds_len - phase) % 3 != 0 and cds_end not in (self.start, self.end): @@ -889,39 +898,51 @@ def configuration(self): return self.__configuration @property - def frames(self): + def frames(self) -> dict: """This property will return a dictionary with three keys - the three possible frames, 0, 1 and 2 - and within each, a set of the positions that are in that frame. If the transcript does not have """ self.finalize() - frames = {0: set(), 1: set(), 2: set()} - for orf in self.internal_orfs: - if self.strand == "-": - exons = sorted([(_[1][0], _[1][1], _[2]) for _ in orf - if _[0] == "CDS"], key=operator.itemgetter(0, 1), reverse=True) - for start, end, phase in exons: - frame = ((3 - phase) % 3 - 1) % 3 - for pos in range(end, start - 1, -1): - frame = abs((frame + 1) % 3) - frames[frame].add(pos) - else: - exons = sorted([(_[1][0], _[1][1], _[2]) for _ in orf - if _[0] == "CDS"], key=operator.itemgetter(0, 1), reverse=False) - for start, end, phase in exons: - frame = ((3 - phase) % 3 - 1) % 3 # Previous frame before beginning of the feature - for pos in range(start, end + 1): - frame = abs((frame + 1) % 3) - frames[frame].add(pos) - return frames + + @functools.lru_cache() + def calculate_frames(internal_orfs, strand) -> dict: + frames = {0: set(), 1: set(), 2: set()} + for orf in internal_orfs: + if strand == "-": + exons = sorted([(_[1][0], _[1][1], _[2]) for _ in orf + if _[0] == "CDS"], key=operator.itemgetter(0, 1), reverse=True) + for start, end, phase in exons: + frame = ((3 - phase) % 3 - 1) % 3 + for pos in range(end, start - 1, -1): + frame = abs((frame + 1) % 3) + frames[frame].add(pos) + else: + exons = sorted([(_[1][0], _[1][1], _[2]) for _ in orf + if _[0] == "CDS"], key=operator.itemgetter(0, 1), reverse=False) + for start, end, phase in exons: + frame = ((3 - phase) % 3 - 1) % 3 # Previous frame before beginning of the feature + for pos in range(start, end + 1): + frame = abs((frame + 1) % 3) + frames[frame].add(pos) + for frame in frames: + frames[frame] = frozenset(frames[frame]) + return frames + + return calculate_frames(tuple(tuple(_) for _ in self.internal_orfs), self.strand) @property - def framed_codons(self): + def framed_codons(self) -> List: """Return the list of codons as calculated by self.frames.""" - codons = list(zip(*[sorted(self.frames[0]), sorted(self.frames[1]), sorted(self.frames[2])])) - if self.strand == "-": - codons = list(reversed(codons)) + @functools.lru_cache() + def calculate_codons(frames, strand) -> list: + # Reconvert to dictionary. We had to turn into a tuple of items for hashing. + frames = dict(frames) + codons = list(zip(*[sorted(frames[0]), sorted(frames[1]), sorted(frames[2])])) + if strand == "-": + codons = list(reversed(codons)) + return codons - return codons + return calculate_codons(tuple(self.frames.items()), self.strand) @property def _selected_orf_transcript(self): @@ -939,16 +960,14 @@ def _internal_orfs_transcripts(self): Note: this will exclude the UTR part, even when the transcript only has one ORF.""" self.finalize() - if not self.is_coding: - return [] - elif len(self.__internal_orf_transcripts) == len(self.internal_orfs): - return self.__internal_orf_transcripts - else: - for num, orf in enumerate(self.internal_orfs, start=1): + @functools.lru_cache() + def calculate_orf_transcripts(internal_orfs, chrom, strand, tid): + orf_transcripts = [] + for num, orf in enumerate(internal_orfs, start=1): torf = TranscriptBase() - torf.chrom, torf.strand = self.chrom, self.strand - torf.derives_from = self.id - torf.id = "{}.orf{}".format(self.id, num) + torf.chrom, torf.strand = chrom, strand + torf.derives_from = tid + torf.id = "{}.orf{}".format(tid, num) __exons, __phases = [], [] for segment in [_ for _ in orf if _[0] == "CDS"]: __exons.append(segment[1]) @@ -956,9 +975,13 @@ def _internal_orfs_transcripts(self): torf.add_exons(__exons, features="exon", phases=None) torf.add_exons(__exons, features="CDS", phases=__phases) torf.finalize() - self.__internal_orf_transcripts.append(torf) + orf_transcripts.append(torf) + return orf_transcripts - return self.__internal_orf_transcripts + orf_transcripts = calculate_orf_transcripts(tuple(tuple(_) for _ in self.internal_orfs), + self.chrom, self.strand, self.id) + + return orf_transcripts def as_bed12(self) -> BED12: @@ -992,14 +1015,6 @@ def remove_exon(self, exon): if exon in self.exons: self.exons.remove(exon) - if ("exon", exon) in self.segments: - self.segments.remove(("exon", exon)) - tr = set() - for segment in self.segments: - if self.overlap(segment[1], exon) >= 0: - tr.add(segment) - for _ in tr: - self.segments.remove(_) tr = set() for segment in self.combined_cds: if self.overlap(segment, exon) >= 0: @@ -1056,7 +1071,6 @@ def remove_utrs(self): self.start = cds_start self.end = cds_end self.internal_orfs, self.combined_utr = [], [] - self.segments = [] # Need to recalculate it self._cdna_length = None self.finalize() @@ -1089,7 +1103,6 @@ def strip_cds(self, strand_specific=True): self._selected_cds_introns = set() self.selected_internal_orf_index = None self.combined_utr = [] - self.segments = [] self.internal_orfs = [] self.finalize() @@ -1131,7 +1144,6 @@ def unfinalize(self): return self.internal_orfs = [] - self.__internal_orf_transcripts = [] self.combined_utr = [] self._cdna_length = None self.finalized = False @@ -1207,10 +1219,6 @@ def as_dict(self, remove_attributes=False): # Now let'add the things that we calculate with the basic lengths state["calculated"] = self.__get_calculated_stats() state["feature"] = self.feature - state["segments"] = [] - for segment in self.segments: - segment = [segment[0], [segment[1][0], segment[1][1]]] - state["segments"].append(segment) for metric in mmetrics: state[metric] = getattr(self, metric) @@ -1245,7 +1253,6 @@ def load_dict(self, state, trust_orf=False, accept_undefined_multi=False): self.combined_cds = [] self._selected_cds_introns = set() self._combined_cds_introns = set() - self.segments = [] self._cdna_length = None for dump, store in zip( @@ -1260,14 +1267,6 @@ def load_dict(self, state, trust_orf=False, accept_undefined_multi=False): else: store.append(tuple(iv)) - for segment in state["segments"]: - if len(segment) != 2: - raise CorruptIndex("Invalid segment values for {}: {}".format(self.id, segment)) - feature, (start, end) = segment - if not all([isinstance(feature, str), isinstance(start, int), isinstance(end, int)]): - raise CorruptIndex("Invalid segment values for {}: {}".format(self.id, segment)) - self.segments.append((feature, (start, end))) - self.splices = set(state["splices"]) self._trust_orf = trust_orf self._accept_undefined_multi = accept_undefined_multi @@ -1741,10 +1740,7 @@ def original_source(self, value): @property def gene(self): - if "gene_id" not in self.attributes: - self.attributes["gene_id"] = self.parent[0] - - return self.attributes["gene_id"] + return self.attributes.get("gene_id", self.parent[0]) @property def location(self): @@ -1774,7 +1770,7 @@ def score(self, score): if not isinstance(score, (float, int)): try: score = float(score) - except: + except (ValueError, TypeError): raise ValueError( "Invalid value for score: {0}, type {1}".format(score, type(score))) self.__score = score diff --git a/Mikado/_transcripts/transcript_methods/finalizing.py b/Mikado/_transcripts/transcript_methods/finalizing.py index e11958bbd..a5558c647 100644 --- a/Mikado/_transcripts/transcript_methods/finalizing.py +++ b/Mikado/_transcripts/transcript_methods/finalizing.py @@ -502,35 +502,31 @@ def _check_phase_correctness(transcript, strip_faulty_cds=True): :return: Mikado.loci.transcript.Transcript """ - segments, internal_orfs = transcript.segments, transcript.internal_orfs - - if min(len(segments), len(internal_orfs)) == 0: - # transcript.logger.debug("Redefining segments for %s", transcript.id) - segments = [("exon", tuple([e[0], e[1]])) for e in transcript.exons] - # Define CDS - if len(internal_orfs) > 0: - for segment in itertools.chain(internal_orfs): - if segment[0] == "exon": - continue - elif segment[0] == "UTR": - segments.append(("UTR", (segment[1][0], segment[1][1]))) - elif segment[0] == "CDS": - segments.append(("CDS", (segment[1][0], segment[1][1]))) - else: - segments.extend([("CDS", tuple([c[0], c[1]])) for c in transcript.combined_cds]) - # Define UTR segments - segments.extend([("UTR", tuple([u[0], u[1]])) for u in transcript.combined_utr]) - # Mix and sort - segments = sorted(segments, key=operator.itemgetter(1, 0)) - # Add to the store as a single entity - if not internal_orfs and any(_[0] == "CDS" for _ in segments): - internal_orfs = [segments] - else: - transcript.selected_internal_orf_index = None + internal_orfs = transcript.internal_orfs + + segments = [("exon", tuple([e[0], e[1]])) for e in transcript.exons] + # Define CDS + if len(internal_orfs) > 0: + for segment in itertools.chain(internal_orfs): + if segment[0] == "exon": + continue + elif segment[0] == "UTR": + segments.append(("UTR", (segment[1][0], segment[1][1]))) + elif segment[0] == "CDS": + segments.append(("CDS", (segment[1][0], segment[1][1]))) else: - pass + segments.extend([("CDS", tuple([c[0], c[1]])) for c in transcript.combined_cds]) + # Define UTR segments + segments.extend([("UTR", tuple([u[0], u[1]])) for u in transcript.combined_utr]) + # Mix and sort + segments = sorted(segments, key=operator.itemgetter(1, 0)) + # Add to the store as a single entity + if not internal_orfs and any(_[0] == "CDS" for _ in segments): + internal_orfs = [segments] + else: + transcript.selected_internal_orf_index = None - transcript.segments, transcript.internal_orfs = segments, internal_orfs + transcript.internal_orfs = internal_orfs __orfs_to_remove = [] orf_excs = [] @@ -669,7 +665,6 @@ def finalize(transcript): transcript.logger.exception(exc) transcript.combined_cds = [] transcript.combined_utr = [] - transcript.segments = [] transcript.internal_orfs = [] _basic_final_checks(transcript) _check_cdna_vs_utr(transcript) @@ -704,8 +699,6 @@ def finalize(transcript): try: _check_completeness(transcript) - assert all([segment[1] in transcript.exons for segment in transcript.segments if - segment[0] == "exon"]), (transcript.exons, transcript.segments) transcript.logger.debug("Verifying phase correctness for %s", transcript.id) _check_phase_correctness(transcript, strip_faulty_cds=strip_faulty_cds) transcript.logger.debug("Calculating intron correctness for %s", transcript.id) diff --git a/Mikado/_transcripts/transcript_methods/printing.py b/Mikado/_transcripts/transcript_methods/printing.py index c6fb2d23d..4ee688828 100644 --- a/Mikado/_transcripts/transcript_methods/printing.py +++ b/Mikado/_transcripts/transcript_methods/printing.py @@ -377,22 +377,8 @@ def as_bed12(transcript, transcriptomic=False, with_cds=True): except KeyError: raise KeyError((transcript.selected_cds[-1], transcript.phases)) - name = "ID={ID};coding={coding};phase={phase}".format( - ID=transcript.id, - coding=transcript.is_coding, - # Now we have to get the phase of the first CDS exon .. - phase=phase) - else: - name = "ID={ID};coding={coding}".format( - ID=transcript.id, - coding=transcript.is_coding, - # Now we have to get the phase of the first CDS exon .. - ) - - if transcript.alias is not None and transcript.alias != transcript.id: - name += ";alias={}".format(transcript.alias) - - bed12.name = name + bed12.coding = transcript.is_coding + bed12.name = transcript.id bed12.score = transcript.score if transcript.score else 0 bed12.strand = transcript.strand if transcript.is_coding and with_cds is True: @@ -407,6 +393,9 @@ def as_bed12(transcript, transcriptomic=False, with_cds=True): bed12.thick_end = bed12.end bed12.coding = False + for key, val in transcript.attributes.items(): + bed12.attributes[key] = val + bed12.block_count = transcript.exon_num bed12.block_sizes = [exon[1] - exon[0] + 1 for exon in transcript.exons] _introns = np.concatenate([np.array([intron[1] - intron[0] + 1 for intron in sorted(transcript.introns)], @@ -419,6 +408,7 @@ def as_bed12(transcript, transcriptomic=False, with_cds=True): bed12 = bed12.to_transcriptomic(alias=transcript.alias, start_adjustment=False, coding=(transcript.is_coding and with_cds)) bed12.chrom = transcript.id + return bed12 diff --git a/Mikado/configuration/configurator.py b/Mikado/configuration/configurator.py index e41114a2a..c8d01320c 100644 --- a/Mikado/configuration/configurator.py +++ b/Mikado/configuration/configurator.py @@ -185,11 +185,14 @@ def load_and_validate_config(raw_configuration: Union[None, MikadoConfiguration, try: config = DaijinConfiguration.Schema().load(config, partial=external) except marshmallow.exceptions.ValidationError as exc: - logger.critical("The configuration file is invalid. Validation errors:\n%s\n%s\n\n", - pprint.pformat(mikado_exc.messages), - pprint.pformat(exc.messages)) + + exc = marshmallow.exceptions.ValidationError( + "The configuration file is invalid. \n" + f"Validation errors if a Daijin configuration file was expected:\n{exc}\n" + f"Validation errors if a Mikado configuration file was expected:\n{mikado_exc}" + ) logger.critical(exc) - raise + raise exc config.filename = raw_configuration if config.scoring is None: @@ -202,7 +205,7 @@ def load_and_validate_config(raw_configuration: Union[None, MikadoConfiguration, raise except Exception as exc: logger.exception("Loading the configuration file failed with error:\n%s\n\n\n", exc) - raise InvalidConfiguration("The configuration file passed is invalid. Please double check.") + raise InvalidConfiguration(f"The configuration file passed is invalid. Please double check. Exception: {exc}") random.seed(config.seed % (2 ** 32 - 1)) diff --git a/Mikado/loci/abstractlocus.py b/Mikado/loci/abstractlocus.py index cdd5c6362..ecc7a15f8 100644 --- a/Mikado/loci/abstractlocus.py +++ b/Mikado/loci/abstractlocus.py @@ -375,6 +375,10 @@ def evaluate(param: Union[str, int, bool, float], comparison = (param not in conf.value) elif comparison_operator == "within": start, end = sorted([conf.value[0], conf.value[1]]) + try: + float(param) + except (TypeError, ValueError): + raise ValueError(param) comparison = (start <= float(param) <= end) elif comparison_operator == "not within": start, end = sorted([conf.value[0], conf.value[1]]) @@ -1079,6 +1083,20 @@ def _create_metrics_row(self, tid: str, metrics: dict, transcript: Transcript) - value = "NA" row[key] = value + # Add the attributes + for param_dict in [self.configuration.scoring.scoring, self.configuration.scoring.cds_requirements.parameters, + self.configuration.scoring.as_requirements.parameters, + self.configuration.scoring.not_fragmentary.parameters, + self.configuration.scoring.requirements.parameters]: + for attr_name, attr_metric in [(key, metric) for key, metric in param_dict.items() + if key.startswith("attributes")]: + value = transcript.attributes.get(attr_name.replace("attributes.", ""), attr_metric.default) + if isinstance(value, float): + value = round(value, 2) + elif value is None or value == "": + value = "NA" + row[attr_name] = value + return row def print_metrics(self): @@ -1281,16 +1299,21 @@ def _check_not_passing(self, previous_not_passing=(), section_name="requirements evaluated = dict() for key in section.parameters: - if section.parameters[key].name is not None: - name = section.parameters[key].name + if key.startswith("attributes.") and len(key) > len("attributes."): + key_parts = key.split('.', 1) + default = section.parameters[key].default + value = self.transcripts[tid].attributes.get(key_parts[1], default) else: - name = key - try: - value = operator.attrgetter(name)(self.transcripts[tid]) - except AttributeError: - raise AttributeError((section_name, key, section.parameters[key])) - if "external" in key: - value = value[0] + if section.parameters[key].name is not None: + name = section.parameters[key].name + else: + name = key + try: + value = operator.attrgetter(name)(self.transcripts[tid]) + except AttributeError: + raise AttributeError((section_name, key, section.parameters[key])) + if "external" in key: + value = value[0] evaluated[key] = self.evaluate(value, section.parameters[key]) # pylint: disable=eval-used diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py index a97c78529..675ac26aa 100644 --- a/Mikado/loci/locus.py +++ b/Mikado/loci/locus.py @@ -10,6 +10,8 @@ import operator from collections import defaultdict import pysam + +from ..scales.resultstorer import ResultStorer from ..transcripts.pad import pad_transcript from ..transcripts.transcript import Transcript from .abstractlocus import Abstractlocus @@ -580,9 +582,14 @@ def _check_as_requirements(self, transcript: Transcript, is_reference=False) -> evaluated = dict() for key in section.parameters: name = section.parameters[key].name - value = operator.attrgetter(name)(transcript) - if "external" in key: - value = value[0] + if 'attributes' in name: + name_parts = name.split('.') + default = section.parameters[key].default + value = transcript.attributes.get(name_parts[1], default) + else: + value = operator.attrgetter(name)(transcript) + if "external" in key: + value = value[0] evaluated[key] = self.evaluate(value, section.parameters[key]) # pylint: disable=eval-used @@ -611,7 +618,11 @@ def is_putative_fragment(self): evaluated = dict() for key, params in self.configuration.scoring.not_fragmentary.parameters.items(): name = params.name - value = operator.attrgetter(name)(self.primary_transcript) + if name.startswith("attributes"): + default = params.default + value = self.primary_transcript.attributes.get(name.replace("attributes.", ""), default) + else: + value = operator.attrgetter(name)(self.primary_transcript) if "external" in key: value = value[0] try: @@ -640,7 +651,7 @@ def is_putative_fragment(self): assert self.id == current_id return fragment - def other_is_fragment(self, other): + def other_is_fragment(self, other) -> (bool, Union[None, ResultStorer]): """ If the 'other' locus is marked as a potential fragment (see 'is_putative_fragment'), then this function will check whether the other locus is within the distance and with the correct comparison class code to be @@ -657,6 +668,10 @@ def other_is_fragment(self, other): self.logger.debug("Self-comparisons are not allowed!") return False, None + if other.is_putative_fragment() is False: + self.logger.debug(f"{other.id} cannot be a fragment according to the scoring file definition.") + return False, None + if any(other.transcripts[tid].is_reference is True for tid in other.transcripts.keys()): self.logger.debug("Locus %s has a reference transcript, hence it will not be discarded", other.id) return False, None diff --git a/Mikado/parsers/__init__.py b/Mikado/parsers/__init__.py index 6c8f17847..73cc6aa19 100644 --- a/Mikado/parsers/__init__.py +++ b/Mikado/parsers/__init__.py @@ -15,6 +15,7 @@ from . import bed12 from . import blast_utils from . import bam_parser +from .bed12 import BED12 def parser_factory(string, input_format=None): diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index a7312dbd0..d9482f4c6 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -9,7 +9,7 @@ import os from Bio import Seq import Bio.SeqRecord -from .parser import Parser +from .parser import Parser, _attribute_definition from sys import intern import copy from ..exceptions import InvalidParsingFormat @@ -233,6 +233,11 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], :param transcriptomic: boolean flag :type transcriptomic: bool + :param lenient: boolean flag. If set as True, the BED12 object will skip certain checks on the validity of the + CDS (e.g. the CDS length being required to be a multiple of three, after accounting for the + phase). + :type lenient: bool + Constructor method. Each instance will have: @@ -320,6 +325,8 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], self.score = 0 self.strand = None self.rgb = '' + self.stop_codon = self.start_codon = None + self.__has_start = self.__has_stop = False self.__block_sizes = np.zeros(1, dtype=np.int64) self.__block_starts = np.zeros(1, dtype=np.int64) self.__block_count = 1 @@ -339,6 +346,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], self.alias = None self.__logger = create_null_logger() self.logger = logger + self.attributes = dict() self.logger.debug("Set the basic properties for %s", self.chrom) if len(args) == 0: @@ -390,6 +398,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], raise InvalidParsingFormat("I need an ordered array, not {0}".format(type(self._line))) else: self._fields = self._line + print("Line", self._fields) self.__set_values_from_fields() self.__check_validity(transcriptomic, fasta_index, sequence) @@ -404,7 +413,6 @@ def __init__(self, *args: Union[str, list, tuple, GffLine], @property def is_transcript(self): """BED12 files are always transcripts for Mikado.""" - return True @property @@ -495,6 +503,7 @@ def _parse_attributes(self, attributes): """ self.attribute_order = [] + print("Parsing", attributes) infolist = self._attribute_pattern.findall(attributes.rstrip().rstrip(";")) @@ -514,6 +523,8 @@ def _parse_attributes(self, attributes): elif key.lower() == "id": self.name = val else: + print(key.capitalize(), val) + self.attributes[key.capitalize()] = _attribute_definition(val) continue def __set_values_from_fields(self): @@ -546,9 +557,10 @@ def __set_values_from_fields(self): self.block_starts = [int(x) for x in block_starts.split(",") if x] else: self.block_starts = [int(x) for x in block_starts] - self._parse_attributes(self.name) + to_parse = self.name if len(self._fields) == 13: - self._parse_attributes(self._fields[-1]) + to_parse = ";".join([to_parse, self._fields[-1]]) + self._parse_attributes(to_parse) self.has_start_codon = False self.has_stop_codon = False self.start_codon = None @@ -668,7 +680,7 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): self.chrom, self.has_start_codon, self.has_stop_codon, not self.invalid) # Get only a proper multiple of three - if self.__lenient is False: + if self.lenient is False and self.coding is True: if self.strand != "-": orf_sequence = sequence[ (self.thick_start - 1 if not self.phase @@ -685,6 +697,19 @@ def __check_validity(self, transcriptomic, fasta_index, sequence): gap='N') self._internal_stop_codons = str(translated_seq).count("*") + if self._internal_stop_codons == 0 and len(orf_sequence[last_pos:]) > 3: + if orf_sequence[last_pos:][:3] in self.table.stop_codons: + # We need to shift either the thick start or the thick end. Note that last_pos is negative + # so we need to minus it. + if self.strand == "-": + self.thick_start += -last_pos % 3 + self.logger.warning( + f"Shifting the position of the thick start of {self.name} by {-last_pos % 3}") + else: + self.thick_end -= -last_pos % 3 + self.logger.warning( + f"Shifting the position of the thick end of {self.name} by {-last_pos % 3}") + del self.invalid if self.__is_invalid() is True: return @@ -743,7 +768,7 @@ def _adjust_start(self, sequence, orf_sequence): self.chrom, self.invalid, self.invalid_reason) break else: - self.__regression(orf_sequence) + self._regression(orf_sequence) if self.has_start_codon is False: # The validity will be automatically checked @@ -772,7 +797,7 @@ def _adjust_start(self, sequence, orf_sequence): self.logger.debug("%s is not coding after checking. Reason: %s", self.chrom, self.invalid_reason) self.coding = False - def __regression(self, orf_sequence): + def _regression(self, orf_sequence): self.logger.debug( "Starting the regression algorithm to find an internal start for %s (end: %s; thick start/end: %s, %s; phase %s)", self.chrom, self.end, self.thick_start, self.thick_end, self.phase) @@ -834,18 +859,12 @@ def __str__(self): return "#" line = [self.chrom, self.start - 1, self.end] - - if self.transcriptomic is True: - name = "ID={};coding={}".format(self.id, self.coding) - if self.coding: - name += ";phase={}".format(self.phase) - if self.alias is not None and self.alias != self.id: - name += ";alias={}".format(self.alias) - - line.append(name) - else: - line.append(self.name) - + name = "ID={};coding={}".format(self.id, self.coding) + if self.coding: + name += ";phase={}".format(self.phase) + if self.alias is not None and self.alias != self.id: + name += ";alias={}".format(self.alias) + line.append(name) if not self.score: line.append(0) else: @@ -862,6 +881,20 @@ def __str__(self): line.append(self.block_count) line.append(",".join([str(x) for x in self.block_sizes])) line.append(",".join([str(x) for x in self.block_starts])) + attributes = dict((key.lower(), val) for key, val in self.attributes.items() if key.lower() not in + ("geneid", "gene_id", "name", "phase", "coding", "alias", "id")) + if self.parent is not None: + attributes["Parent"] = self.parent[0] + assert "Parent" in attributes + elif "parent" in attributes: + parent = attributes.pop("parent") + if isinstance(parent, (tuple, list)) and len(parent) > 0: + parent = ",".join(parent) + elif isinstance(parent, (tuple, list)) and len(parent) == 0: + parent = None + attributes["Parent"] = parent + if attributes: + line.append(";".join(f"{key}={val}" for key, val in attributes.items() if val is not None)) return "\t".join([str(x) for x in line]) def __eq__(self, other): @@ -1011,6 +1044,10 @@ def invalid(self): def invalid(self): self.__invalid = None + @property + def lenient(self): + return self.__lenient + def __is_invalid(self): if self._internal_stop_codons >= 1: @@ -1045,7 +1082,7 @@ def __is_invalid(self): ) return True - if self.__lenient is True: + if self.lenient is True: pass else: if (self.cds_len - self.phase) % 3 != 0: diff --git a/Mikado/parsers/gfannotation.py b/Mikado/parsers/gfannotation.py index 189811924..5578f31a9 100644 --- a/Mikado/parsers/gfannotation.py +++ b/Mikado/parsers/gfannotation.py @@ -9,6 +9,7 @@ import copy from sys import intern import re +from .parser import _attribute_definition __author__ = 'Luca Venturini' @@ -18,22 +19,6 @@ [intern(_) for _ in ["+", "-", "?", "true", "True", "false", "False"]] -def _attribute_definition(val): - try: - val = float(val) - if val.is_integer(): - return int(val) - return val - except (ValueError, TypeError): - if val.lower() in ("true", "false"): - val = val.capitalize() - if val == "True": - return True - else: - return False - return val - - # This class has exactly how many attributes I need it to have # pylint: disable=too-many-instance-attributes class GFAnnotation(metaclass=abc.ABCMeta): diff --git a/Mikado/parsers/parser.py b/Mikado/parsers/parser.py index 4222af9a4..d729abe6e 100644 --- a/Mikado/parsers/parser.py +++ b/Mikado/parsers/parser.py @@ -117,3 +117,19 @@ def __setstate__(self, state): self.__dict__.update(state) self._handle = self.__get_handle(state["_handle"], position=position) + +def _attribute_definition(val): + try: + val = float(val) + if val.is_integer(): + return int(val) + return val + except (ValueError, TypeError): + if val.lower() in ("true", "false"): + val = val.capitalize() + if val == "True": + return True + else: + return False + return val + diff --git a/Mikado/picking/_loci_serialiser.py b/Mikado/picking/_loci_serialiser.py index 3adf64b3e..3df09c9db 100644 --- a/Mikado/picking/_loci_serialiser.py +++ b/Mikado/picking/_loci_serialiser.py @@ -23,8 +23,8 @@ def serialise_locus(stranded_loci: [Superlocus], if print_subloci is True: batch = [] batch.append(stranded_locus.__str__(level="subloci", print_cds=print_cds)) - for sublocus in stranded_locus.subloci: - assert sublocus.scores_calculated is True, (sublocus.id, sublocus.scores_calculated) + if not all(sublocus.scores_calculated is True for sublocus in stranded_locus.subloci): + raise RuntimeError("The locus we are trying to serialise has not been processed correctly.") sub_metrics_rows = [_ for _ in stranded_locus.print_subloci_metrics() if _ != {} and "tid" in _] sub_scores_rows = [_ for _ in stranded_locus.print_subloci_scores() if _ != {} and "tid" in _] diff --git a/Mikado/picking/loci_processer.py b/Mikado/picking/loci_processer.py index 94b1f110b..dc1b737bc 100644 --- a/Mikado/picking/loci_processer.py +++ b/Mikado/picking/loci_processer.py @@ -349,6 +349,10 @@ def terminate(self): self.__close_handles() super().terminate() + def kill(self) -> None: + self.__close_handles() + super(LociProcesser, self).kill() + def __close_handles(self): """Private method to flush and close all handles.""" if self.engine is not None: @@ -437,10 +441,12 @@ def run(self): self.status_queue.put(accumulator) self.logger.debug("Put all the remaining results into the queue for %s (size %s)", self.name, sys.getsizeof(accumulator)) - self.locus_queue.task_done() + # self.locus_queue.task_done() self.locus_queue.put((counter, None)) + accumulator = [] break - else: + + try: try: transcripts = msgpack.loads(transcripts, raw=False) except TypeError as err: @@ -487,6 +493,8 @@ def run(self): if len(stranded_loci) == 0: self.logger.warning("No loci left for index %d", counter) + accumulator = serialise_locus([], accumulator, counter, print_cds=print_cds, + print_monosubloci=print_monoloci, print_subloci=print_subloci) else: if len(accumulator) >= 100: accumulator = zlib.compress(msgpack.dumps(accumulator)) @@ -496,6 +504,19 @@ def run(self): accumulator = serialise_locus( stranded_loci, accumulator, counter, print_cds=print_cds, print_monosubloci=print_monoloci, print_subloci=print_subloci) + except (InvalidTranscript, InvalidCDS, AssertionError, ValueError, RuntimeError, TypeError) as exc: + self.logger.error(f"{self.name} has encountered an error, details:\n{exc}") + current = len(accumulator) + accumulator = serialise_locus([], accumulator, counter, print_cds=print_cds, + print_monosubloci=print_monoloci, print_subloci=print_subloci) + assert len(accumulator) > current + finally: self.locus_queue.task_done() + if accumulator: + assert isinstance(accumulator, list) + self.logger.error(accumulator) + accumulator = zlib.compress(msgpack.dumps(accumulator)) + self.status_queue.put(accumulator) + self.logger.info(f"{self.name} has finished") return diff --git a/Mikado/picking/picker.py b/Mikado/picking/picker.py index 16b8911b9..5fe448b47 100644 --- a/Mikado/picking/picker.py +++ b/Mikado/picking/picker.py @@ -36,7 +36,7 @@ from ..loci.superlocus import Superlocus from ..configuration.configurator import load_and_validate_config from ..utilities import dbutils -from ..exceptions import UnsortedInput, InvalidConfiguration, InvalidTranscript +from ..exceptions import UnsortedInput, InvalidConfiguration, InvalidTranscript, InvalidCDS from .loci_processer import analyse_locus, LociProcesser, merge_loci from ._locus_single_printer import print_locus import multiprocessing.managers @@ -460,6 +460,18 @@ def __get_output_files(self): requested_external.update({param for param in self.configuration.scoring.scoring.keys() if param.startswith("external")}) + attribute_metrics = set() + attribute_metrics.update({param for param in self.configuration.scoring.requirements.parameters.keys() + if param.startswith("attributes")}) + attribute_metrics.update({param for param in self.configuration.scoring.not_fragmentary.parameters.keys() + if param.startswith("attributes")}) + attribute_metrics.update({param for param in self.configuration.scoring.cds_requirements.parameters.keys() + if param.startswith("attributes")}) + attribute_metrics.update({param for param in self.configuration.scoring.cds_requirements.parameters.keys() + if param.startswith("attributes")}) + attribute_metrics.update({param for param in self.configuration.scoring.scoring.keys() + if param.startswith("attributes")}) + # Check that the external scores are all present. If they are not, raise a warning. if requested_external - set(available_external_metrics): self.logger.error( @@ -486,6 +498,7 @@ def __get_output_files(self): metrics.extend(available_external_metrics) else: metrics.extend(requested_external) + metrics.extend(attribute_metrics) metrics = Superlocus.available_metrics[:5] + sorted(metrics) session.close() engine.dispose() @@ -689,8 +702,18 @@ def __submit_multi_threading(self): self.logger.error("We have not retrieved all loci!") raise ValueError results = msgpack.loads(zlib.decompress(results)) + self.logger.info(f"Length of results to analyse: {len(results)}") for result in results: - counter, chrom, num_genes, loci, subloci, monoloci = result + try: + counter, chrom, num_genes, loci, subloci, monoloci = result + except (InvalidTranscript, InvalidCDS, TypeError, RuntimeError, ValueError, AssertionError) as exc: + self.logger.critical( + f"Mikado has encountered a fatal crash. Invalid result:\n{result}\n" + "Please inspect the logs and report the bug at " + "https://github.com/EI-CoreBioinformatics/mikado/issues. Exception:\n" + f"{exc}") + [_.kill() for _ in working_processes] + sys.exit(1) mapper["done"].add(counter) if counter in mapper["results"]: self.logger.fatal("%d double index found!", counter) @@ -708,7 +731,9 @@ def __submit_multi_threading(self): mapper[chrom]["done"].add(counter) if mapper[chrom]["done"] == mapper[chrom]["submit"]: self.logger.info("Finished with chromosome %s", chrom) + status_queue.task_done() + self.logger.info("Joining children processes") [_.join() for _ in working_processes] self.logger.info("Joined children processes; starting to merge partial files") diff --git a/Mikado/subprograms/prepare.py b/Mikado/subprograms/prepare.py index 156243e3a..c582d545a 100644 --- a/Mikado/subprograms/prepare.py +++ b/Mikado/subprograms/prepare.py @@ -123,6 +123,9 @@ def parse_prepare_options(args, mikado_config) -> Union[DaijinConfiguration, Mik mikado_config.prepare.strip_faulty_cds = True if getattr(args, "strip_faulty_cds", None) else \ mikado_config.prepare.strip_faulty_cds mikado_config.prepare.strip_cds = True if getattr(args, "strip_cds", None) else mikado_config.prepare.strip_cds + if len(mikado_config.prepare.files.strip_cds) == 0: + mikado_config.prepare.files.strip_cds = [mikado_config.prepare.strip_cds] * len(mikado_config.prepare.files.gff) + mikado_config.serialise.codon_table = str(args.codon_table) if ( getattr(args, "codon_table", None) not in (None, False, True)) else mikado_config.serialise.codon_table @@ -228,7 +231,7 @@ def positive(string): 1- add the "transcript" feature 2- sort by coordinates 3- check the strand""") - parser.add_argument("--fasta", "--reference", dest="reference", + parser.add_argument("--fasta", "--reference", "--genome", dest="reference", type=argparse.FileType(), help="Genome FASTA file. Required.") verbosity = parser.add_mutually_exclusive_group() verbosity.add_argument("--verbose", default=None, dest="log_level", action="store_const", const="DEBUG") diff --git a/Mikado/subprograms/util/trim.py b/Mikado/subprograms/util/trim.py index d5c01e496..4733dad2a 100644 --- a/Mikado/subprograms/util/trim.py +++ b/Mikado/subprograms/util/trim.py @@ -134,7 +134,6 @@ def trim_end(transcript, cds_end, max_length=0): transcript.end = newlast[1] assert all([(exon[1] <= newlast[1] for exon in transcript.exons)]) - assert all([(segment[1][1] <= newlast[1] for segment in transcript.segments)]) return transcript diff --git a/Mikado/tests/locus_test.py b/Mikado/tests/locus_test.py index 052c6d620..ed9224f48 100644 --- a/Mikado/tests/locus_test.py +++ b/Mikado/tests/locus_test.py @@ -2271,7 +2271,7 @@ def test_only_CDS_overlap(self): with self.subTest(min_overlap=min_overlap): cds_overlap = 0 for frame in range(3): - cds_overlap += len(set.intersection( + cds_overlap += len(frozenset.intersection( self.t1.frames[frame], t2.frames[frame] )) diff --git a/Mikado/tests/test_bed12.py b/Mikado/tests/test_bed12.py index e86160d51..f507f46a1 100644 --- a/Mikado/tests/test_bed12.py +++ b/Mikado/tests/test_bed12.py @@ -437,7 +437,7 @@ def test_tran_to_bed12_neg(self): t.add_exons([(71, 100), (201, end)], features="CDS", phases=[0, phase]) t.finalize() r = t.as_bed12() - self.assertEqual(r.name, "ID={};coding={};phase={}".format(t.id, True, phase)) + self.assertEqual(r.name, t.id) self.assertEqual(r.phase, phase, (end, phase)) self.assertEqual(r.thick_end, end) self.assertFalse(r.invalid) diff --git a/Mikado/tests/test_external_scores.py b/Mikado/tests/test_external_scores.py index a9dadc79d..d1b30e6ae 100644 --- a/Mikado/tests/test_external_scores.py +++ b/Mikado/tests/test_external_scores.py @@ -2,7 +2,7 @@ import unittest from .. import create_default_logger -from .._transcripts.scoring_configuration import ScoringFile, MinMaxScore +from .._transcripts.scoring_configuration import ScoringFile, MinMaxScore, SizeFilter, RangeFilter from ..loci import Transcript, Superlocus import pkg_resources import os @@ -35,6 +35,18 @@ def setUp(self): self.transcript2 = self.transcript.copy() self.transcript2.id = "ENST00000560637" # self.assertIn("scoring", self.conf) + + # This is for the is_fragment test + self.other_transcript = Transcript() + self.other_transcript.start = 48053001 + self.other_transcript.end = 48053301 + + exons = [(48053001, 48053301)] + + self.other_transcript.strand = "+" + self.other_transcript.add_exons(exons) + self.other_transcript.id = "ENST00000560646" + self.other_transcript.parent = "ENSG00000137873" def test_copying(self): self.transcript.external_scores.update({"test": 0, "test1": 1}) @@ -185,3 +197,167 @@ def test_attribute_use_raw_percentage(self): sup.get_metrics() self.assertIn("attributes.tpm", sup._metrics[self.transcript.id]) self.assertEqual(sup._metrics[self.transcript.id]["attributes.tpm"], 0.1) + sup.define_loci() + + def test_attribute_use_as_alternative_splicing(self): + from Mikado.loci import Locus + checked_conf = self.conf.copy() + self.transcript.attributes['cov'] = 10 + loci = Locus(self.transcript, configuration=checked_conf) + loci.configuration.scoring.as_requirements.parameters['attributes.cov'] = SizeFilter(operator='ge', value=3, + metric=None, + name='attributes.cov') + loci.configuration.scoring.as_requirements.expression = [ + loci.configuration.scoring.as_requirements.expression[0] + ' and attributes.cov'] + loci.configuration.scoring.as_requirements._check_my_requirements() + self.assertTrue(loci._check_as_requirements(self.transcript)) + + def test_attribute_use_as_alternative_splicing_fail_filter(self): + from Mikado.loci import Locus + checked_conf = self.conf.copy() + self.transcript.attributes['cov'] = 10 + loci = Locus(self.transcript, configuration=checked_conf) + loci.configuration.scoring.as_requirements.parameters['attributes.cov'] = SizeFilter(operator='ge', value=15, + metric=None, + name='attributes.cov') + loci.configuration.scoring.as_requirements.expression = [ + loci.configuration.scoring.as_requirements.expression[0] + ' and attributes.cov'] + loci.configuration.scoring.as_requirements._check_my_requirements() + self.assertFalse(loci._check_as_requirements(self.transcript)) + + def test_attributes_use_as_cds_requirements(self): + from Mikado.loci import Locus + checked_conf = self.conf.copy() + checked_conf.scoring.cds_requirements.expression = ["attributes.something"] + checked_conf.scoring.cds_requirements.parameters = {"attributes.something": SizeFilter(value=1, operator="ge")} + checked_conf.scoring.cds_requirements._check_my_requirements() + self.transcript.attributes['something'] = 2 + loci = Locus(self.transcript, configuration=checked_conf) + not_passing = loci._check_not_passing(section_name="cds_requirements") + self.assertSetEqual(not_passing, set()) + + def test_attributes_use_as_fragment_filter(self): + from Mikado.loci import Locus + checked_conf = self.conf.copy() + checked_conf.pick.fragments.max_distance = checked_conf.pick.clustering.flank = 5000 + checked_conf.scoring.not_fragmentary.expression = ["attributes.something"] + checked_conf.scoring.not_fragmentary.parameters = {"attributes.something": SizeFilter(value=1, operator="ge")} + checked_conf.scoring.not_fragmentary._check_my_requirements() + self.transcript.attributes['something'] = 2 + logger = create_default_logger("test_attributes_use_as_fragment_filter", level="DEBUG") + loci = Locus(self.transcript, configuration=checked_conf, logger=logger) + self.other_transcript.attributes["something"] = 0.5 + fragment = Locus(self.other_transcript, configuration=checked_conf, logger=logger) + self.assertEqual(fragment.other_is_fragment(loci), (False, None)) + self.assertTrue(loci.other_is_fragment(fragment)[0]) + del self.other_transcript.attributes["something"] + fragment = Locus(self.other_transcript, configuration=checked_conf, logger=logger) + self.assertEqual(fragment.other_is_fragment(loci), (False, None)) + self.assertTrue(loci.other_is_fragment(fragment)[0]) + + def test_attributes_range_use_as_fragment_filter(self): + from Mikado.loci import Locus + checked_conf = self.conf.copy() + checked_conf.pick.fragments.max_distance = checked_conf.pick.clustering.flank = 5000 + checked_conf.scoring.not_fragmentary.expression = ["attributes.something"] + checked_conf.scoring.not_fragmentary.parameters = {"attributes.something": RangeFilter(value=[10, 50], + operator="within")} + checked_conf.scoring.not_fragmentary._check_my_requirements() + self.transcript.attributes['something'] = 11 + logger = create_default_logger("test_attributes_range_as_fragment_filter", level="DEBUG") + loci = Locus(self.transcript, configuration=checked_conf, logger=logger) + self.other_transcript.attributes["something"] = 0.5 + fragment = Locus(self.other_transcript, configuration=checked_conf, logger=logger) + self.assertEqual(fragment.other_is_fragment(loci), (False, None)) + self.assertTrue(loci.other_is_fragment(fragment)[0]) + del self.other_transcript.attributes["something"] + fragment = Locus(self.other_transcript, configuration=checked_conf, logger=logger) + self.assertEqual(fragment.other_is_fragment(loci), (False, None)) + self.assertTrue(loci.other_is_fragment(fragment)[0]) + # Now change the default. + checked_conf.scoring.not_fragmentary.parameters = {"attributes.something": RangeFilter(value=[10, 50], + operator="within", + default=20)} + checked_conf.scoring.not_fragmentary._check_my_requirements() + not_fragment = Locus(self.other_transcript, configuration=checked_conf, logger=logger) + self.assertNotIn("something", not_fragment.transcripts[self.other_transcript.id].attributes) + # Not a fragment any more, the default of 15 is within the range. + self.assertFalse(loci.other_is_fragment(fragment)[0]) + + def test_print_metrics_with_attributes_from_scoring(self): + from Mikado.loci import Locus, Sublocus + checked_conf = self.conf.copy() + checked_conf.scoring.scoring = {"attributes.something": MinMaxScore(default=0, rescaling="max", filter=None)} + checked_conf.scoring.cds_requirements.expression = ["cdna_length"] + checked_conf.scoring.cds_requirements.parameters = {"cdna_length": SizeFilter(operator="ge", value=1)} + checked_conf.scoring.requirements.expression = ["cdna_length"] + checked_conf.scoring.requirements.parameters = {"cdna_length": SizeFilter(operator="ge", value=1)} + checked_conf.scoring.check(checked_conf.pick.orf_loading.minimal_orf_length) + for lclass in [Locus, Sublocus]: + logger = create_default_logger(f"test_print_metrics_with_attributes_{lclass.name}", level="DEBUG") + locus = lclass(self.transcript, configuration=checked_conf, logger=logger) + self.assertIn(self.transcript.id, locus.transcripts) + rows = list(locus.print_metrics()) + self.assertEqual(len(rows), 1) + row = rows[0] + self.assertNotIn("something", locus.transcripts[self.transcript.id].attributes) + self.assertIn("attributes.something", row.keys()) + self.assertEqual(row["attributes.something"], checked_conf.scoring.scoring["attributes.something"].default) + self.transcript.attributes["something"] = 5 + locus = Locus(self.transcript, configuration=checked_conf) + rows = list(locus.print_metrics()) + self.assertEqual(len(rows), 1) + row = rows[0] + self.assertIn("something", locus.transcripts[self.transcript.id].attributes) + self.assertIn("attributes.something", row.keys()) + self.assertEqual(row["attributes.something"], 5) + del self.transcript.attributes["something"] + + def test_print_metrics_with_attributes_from_requirements(self): + from Mikado.loci import Locus, Sublocus + checked_conf = self.conf.copy() + checked_conf.scoring.scoring = {"attributes.something": MinMaxScore(default=0, rescaling="max", filter=None)} + checked_conf.scoring.cds_requirements.expression = ["attributes.cds"] + checked_conf.scoring.cds_requirements.parameters = {"attributes.cds": SizeFilter(operator="ge", value=1, + default=1)} + checked_conf.scoring.requirements.expression = ["attributes.req"] + checked_conf.scoring.requirements.parameters = {"attributes.req": SizeFilter(operator="ge", value=1, + default=1)} + checked_conf.scoring.as_requirements.expression = ["attributes.as"] + checked_conf.scoring.as_requirements.parameters = {"attributes.as": SizeFilter(operator="ge", value=1, + default=1)} + checked_conf.scoring.not_fragmentary.expression = ["attributes.frag"] + checked_conf.scoring.not_fragmentary.parameters = {"attributes.frag": SizeFilter(operator="ge", value=1, + default=1)} + sections = {"something": checked_conf.scoring.scoring, + "req": checked_conf.scoring.requirements.parameters, + "as": checked_conf.scoring.as_requirements.parameters, + "cds": checked_conf.scoring.cds_requirements.parameters, + "frag": checked_conf.scoring.not_fragmentary.parameters} + + checked_conf.scoring.check(checked_conf.pick.orf_loading.minimal_orf_length) + for lclass in [Locus, Sublocus]: + logger = create_default_logger(f"test_print_metrics_with_attributes_{lclass.name}", level="DEBUG") + locus = lclass(self.transcript, configuration=checked_conf, logger=logger) + self.assertIn(self.transcript.id, locus.transcripts) + rows = list(locus.print_metrics()) + self.assertEqual(len(rows), 1) + row = rows[0] + for key, section in sections.items(): + self.assertNotIn(key, locus.transcripts[self.transcript.id].attributes) + self.assertIn(f"attributes.{key}", row.keys()) + self.assertEqual(row[f"attributes.{key}"], + section[f"attributes.{key}"].default) + for key in sections: + self.transcript.attributes[key] = 5 + locus = Locus(self.transcript, configuration=checked_conf) + rows = list(locus.print_metrics()) + self.assertEqual(len(rows), 1) + row = rows[0] + for key, section in sections.items(): + self.assertIn(key, locus.transcripts[self.transcript.id].attributes) + self.assertIn(f"attributes.{key}", row.keys()) + self.assertEqual(row["attributes.something"], 5) + # Reset the object + for key in sections: + del self.transcript.attributes[key] diff --git a/Mikado/tests/test_modifications.py b/Mikado/tests/test_modifications.py index 3dba896b4..e12170f3f 100644 --- a/Mikado/tests/test_modifications.py +++ b/Mikado/tests/test_modifications.py @@ -1,3 +1,5 @@ +import os + from Mikado._transcripts.scoring_configuration import SizeFilter from Mikado.scales.assignment import Assigner import itertools @@ -5,7 +7,7 @@ from Mikado.parsers.GFF import GffLine from Mikado.loci import Transcript from ..loci.locus import Locus, pad_transcript -from ..parsers.bed12 import BED12 +from ..parsers.bed12 import BED12, Bed12Parser from ..subprograms.util.trim import trim_coding, trim_noncoding import unittest import pysam @@ -272,14 +274,15 @@ def test_removal_after_padding(self): locus.add_transcript_to_locus(t2_1, check_in_locus=False) locus.add_transcript_to_locus(t2_2, check_in_locus=False) self.assertTrue(locus.primary_transcript_id == t1.id) - locus.logger.setLevel("DEBUG") + locus.logger.setLevel("INFO") locus.finalize_alternative_splicing(_scores={t1.id: 20, t2_1.id: 15, t2_2.id: 10}) self.assertIn(t1.id, locus.transcripts) if t2_1.id in locus.transcripts: for tid1, tid2 in itertools.combinations(locus.transcripts.keys(), 2): res, _ = Assigner.compare(locus[tid1], locus[tid2]) print(tid1, tid2, res.ccode) - self.assertNotIn(t2_1.id, locus.transcripts) + self.assertNotIn(t2_1.id, locus.transcripts.keys(), + [(key, val.start, val.end) for key, val in locus.transcripts.items()]) self.assertIn(t2_2.id, locus.transcripts, "\n".join(tr.format("bed12") for tr in locus)) self.assertTrue(locus[t2_2.id].attributes["padded"]) @@ -347,6 +350,22 @@ def test_add_two_partials(self): (locus[ref.id].start, ref.start, template2.start, template1.start)) ) + @unittest.skip + def test_failed_expansion(self): + logger = create_default_logger("test_failed_expansion", level="WARNING") + raw = [Transcript(line, logger=logger) for line in Bed12Parser( + open(pkg_resources.resource_filename("Mikado.tests", os.path.join("test_pick_pad", "fail.bed12"))))] + transcripts = dict((_.id, _) for _ in raw) + [_.finalize() for _ in transcripts.values()] + template = transcripts["template"] # 4535908 4540293 + candidate = transcripts["candidate"] # 4536444 4540027 + backup = candidate.copy() + fai = pysam.FastaFile(pkg_resources.resource_filename("Mikado.tests", os.path.join("test_pick_pad", + "failing_seq.fa.gz"))) + logger.setLevel("DEBUG") + candidate.logger.setLevel("DEBUG") + pad_transcript(candidate, backup, start_transcript=template, end_transcript=template, fai=fai, logger=logger) + if __name__ == "__main__": unittest.main() diff --git a/Mikado/tests/test_transcript_checker.py b/Mikado/tests/test_transcript_checker.py index 7ba6de9fc..400d5efd6 100644 --- a/Mikado/tests/test_transcript_checker.py +++ b/Mikado/tests/test_transcript_checker.py @@ -526,6 +526,57 @@ def test_codon_finder_negative_3(self): self.assertTrue(tc.has_start_codon, tc.cdna) self.assertTrue(tc.has_stop_codon, tc.cdna) + def test_codon_finder_negative_strip_cds(self): + gtf_lines = """Chr5 TAIR10 mRNA 5335 5769 . - . transcript_id "AT5G01015.1"; gene_id "AT5G01015"; +Chr5 TAIR10 CDS 5697 5769 . - 0 transcript_id "AT5G01015.1"; gene_id "AT5G01015";; +Chr5 TAIR10 exon 5697 5769 . - . transcript_id "AT5G01015.1"; gene_id "AT5G01015"; +Chr5 TAIR10 CDS 5335 5576 . - 1 transcript_id "AT5G01015.1"; gene_id "AT5G01015"; +Chr5 TAIR10 exon 5335 5576 . - . transcript_id "AT5G01015.1"; gene_id "AT5G01015";""" + + gtf_lines = [GtfLine(_) for _ in gtf_lines.split("\n")] + t = Transcript(gtf_lines[0]) + t.add_exons(gtf_lines[1:]) + t.finalize() + + seq = str(self.genome[t.chrom][t.start - 1:t.end]) + # Basically insert an internal stop codon. This will make the ORF tests fail, leading to the ORF being stripped + seq = seq[:72] + Bio.Seq.reverse_complement("TAG") + seq[75:] + correct_seq = "".join("""ATGGAGTCTAGCTTGCATAGTGTGATTTTCTTAGGTTTGCTTGCGACGATTCTGGTTACG +ACCAATGGCCAAGGAGACGGGACGGGGCTAAATGCAGAAGAAATGTGGCCAGTGGAGGTG +GGGATGGAGTATAGAGTATGGAGGAGAAAGCTGATGACGCCATTGGAGCTGTGCTTGGAG +TGCAAATGCTGCTCCTCCACCACTTGTGCCACCATGCCTTGCTGTTTCGGCATCAATTGC +TAGCTTCCCAACAAGCCATTTGGCGTTTGTGCCTTTGTTCCCAAGTCATGCCATTGTAAT +TCTTGCTCCATTTGA""".split("\n")) + logger = create_default_logger("test_codon_finder_negative_3", level="WARNING") + + with self.assertRaises(InvalidTranscript): + for lenient in (False, True): + tc = TranscriptChecker(t, seq, logger=logger, lenient=lenient, strip_faulty_cds=False) + tc.finalize() + tc.check_orf() + + for lenient in (False, True): + tc = TranscriptChecker(t, seq, logger=logger, lenient=lenient, strip_faulty_cds=True) + tc.finalize() + correct_length = (5576 - 5335 + 1) + (5769 - 5697 + 1) + self.assertEqual(correct_length, len(correct_seq), (correct_length, len(correct_seq))) + self.assertEqual(tc.cdna_length, correct_length, (correct_length, tc.cdna_length)) + self.assertEqual(len(tc.cdna), tc.cdna_length) + self.assertEqual(correct_seq, tc.cdna) + + tc.check_orf() + self.assertFalse(tc.is_coding) + tc_orfs = tc.find_overlapping_cds(tc.get_internal_orf_beds()) + self.assertEqual(1, len(tc_orfs)) + self.assertFalse(tc_orfs[0].has_stop_codon, (tc_orfs[0], tc_orfs[0].stop_codon)) + self.assertFalse(tc_orfs[0].has_start_codon, (tc_orfs[0], tc_orfs[0].start_codon)) + + self.assertFalse(tc.is_coding) + self.assertNotIn("has_stop_codon", tc.attributes) + self.assertNotIn("has_start_codon", tc.attributes) + self.assertFalse(tc.has_start_codon, tc.cdna) + self.assertFalse(tc.has_stop_codon, tc.cdna) + if __name__ == '__main__': unittest.main() diff --git a/Mikado/tests/test_transcript_methods.py b/Mikado/tests/test_transcript_methods.py index 540f701e1..8bfd42255 100644 --- a/Mikado/tests/test_transcript_methods.py +++ b/Mikado/tests/test_transcript_methods.py @@ -454,6 +454,8 @@ def setUp(self): transcript.name = transcript.id self.transcripts[transcript.id] = transcript + for tid, transcript in self.transcripts.items(): + self.assertFalse(any(key.lower() == "id=id" for key in transcript.attributes), (tid, transcript.attributes)) self.assertEqual(len(self.transcripts), 4) def test_bed12(self): diff --git a/Mikado/tests/test_transcript_negative.py b/Mikado/tests/test_transcript_negative.py index 53e58bdb1..241a69dfe 100644 --- a/Mikado/tests/test_transcript_negative.py +++ b/Mikado/tests/test_transcript_negative.py @@ -117,10 +117,12 @@ def test_print(self): str(self.tr), diff) - g_bed12 = "Chr1 5927 8737 ID=AT1G01020.1;coding=True;phase=0 0 - 6914 8666 0 10 336,633,76,67,86,74,46,90,48,167 0,509,1229,1456,1636,1834,2014,2308,2489,2643" + g_bed12 = "Chr1 5927 8737 ID=AT1G01020.1;coding=True;phase=0 0 - 6914 8666 0"\ + " 10 336,633,76,67,86,74,46,90,48,167 0,509,1229,1456,1636,1834,2014,2308,2489,2643\tParent=AT1G01020" self.assertEqual(g_bed12, self.tr.format("bed12", transcriptomic=False)) - t_bed12 = "AT1G01020.1 0 1623 ID=AT1G01020.1;coding=True;phase=0 0 + 71 809 0 10 167,48,90,46,74,86,67,76,633,336 0,167,215,305,351,425,511,578,654,1287" + t_bed12 = "AT1G01020.1 0 1623 ID=AT1G01020.1;coding=True;phase=0 0 + 71 809 0 10 "\ + "167,48,90,46,74,86,67,76,633,336 0,167,215,305,351,425,511,578,654,1287" self.assertEqual(t_bed12, self.tr.format("bed12", transcriptomic=True)) def test_empty(self): diff --git a/Mikado/transcripts/pad.py b/Mikado/transcripts/pad.py index 88d87d830..da648ba1e 100644 --- a/Mikado/transcripts/pad.py +++ b/Mikado/transcripts/pad.py @@ -47,22 +47,30 @@ def pad_transcript(transcript: Transcript, assert strand == transcript.strand upstream, up_exons, new_first_exon, up_remove = _enlarge_start(transcript, backup, start_transcript) - downstream, up_exons, down_exons, down_remove = _enlarge_end(transcript, - backup, end_transcript, up_exons, new_first_exon) + downstream, up_exons, down_exons, new_exon, down_remove = _enlarge_end(transcript, + backup, end_transcript, up_exons, new_first_exon) - first_exon, last_exon = transcript.exons[0], transcript.exons[-1] + # first_exon, last_exon = transcript.exons[0], transcript.exons[-1] assert upstream >= 0 and downstream >= 0 - if up_remove is True: - # Remove the first exon - transcript.remove_exon(first_exon) - if down_remove is True: - if not (up_remove is True and first_exon == last_exon): - transcript.remove_exon(last_exon) + if transcript.monoexonic is False: + if up_remove is True: + # Remove the first exon + up_exons = sorted(up_exons) + transcript.exons[0] = up_exons[-1] + up_exons.remove(up_exons[-1]) + # transcript.remove_exon(first_exon) + if down_remove is True: # (up_remove is False or first_exon != last_exon): + down_exons = sorted(down_exons) + transcript.exons[-1] = down_exons[-1] + down_exons.remove(down_exons[-1]) + # transcript.remove_exon(last_exon) + elif new_exon is not None: + transcript.exons[0] = new_exon new_exons = up_exons + down_exons - if not new_exons: + if not any([len(new_exons) > 0, up_remove, down_remove]): logger.debug("%s does not need to be expanded, exiting", transcript.id) return backup @@ -211,6 +219,7 @@ def _enlarge_end(transcript: Transcript, downstream = 0 down_exons = [] to_remove = False + new_exon = new_first_exon[:] if new_first_exon is not None else None if end_transcript: transcript.end = end_transcript.end @@ -218,7 +227,7 @@ def _enlarge_end(transcript: Transcript, [_ for _ in end_transcript.find_downstream(transcript.exons[-1][0], transcript.exons[-1][1]) if _.value == "exon"]) intersecting_downstream = sorted(end_transcript.search( - transcript.exons[-1][0], transcript.exons[-1][1])) + transcript.exons[-1][0] + 1, transcript.exons[-1][1])) if not intersecting_downstream: raise KeyError("No exon or intron found to be intersecting with %s vs %s, this is a mistake", transcript.id, end_transcript.id) @@ -275,7 +284,7 @@ def _enlarge_end(transcript: Transcript, downstream += sum(_[1] - _[0] + 1 for _ in downstream_exons) down_exons.extend([(_[0], _[1]) for _ in downstream_exons]) - return downstream, up_exons, down_exons, to_remove + return downstream, up_exons, down_exons, new_exon, to_remove def check_expanded(transcript, backup, start_transcript, end_transcript, fai, upstream, downstream, logger) -> str: diff --git a/Mikado/transcripts/reference_gene.py b/Mikado/transcripts/reference_gene.py index 122b91416..d77cb278d 100644 --- a/Mikado/transcripts/reference_gene.py +++ b/Mikado/transcripts/reference_gene.py @@ -354,10 +354,10 @@ def load_dict(self, state: dict, exclude_utr=False, protein_coding=False, trust_ transcript.load_dict(tvalues, trust_orf=trust_orf) transcript.finalize() if protein_coding is True and transcript.is_coding is False: - self.logger.debug("{0} is non coding ({1}, {2})".format( + self.logger.debug("{0} is non coding ({1})".format( transcript.id, transcript.combined_cds, - transcript.segments)) + )) continue if exclude_utr is True: has_utrs = (transcript.utr_length > 0) diff --git a/Mikado/transcripts/transcriptchecker.py b/Mikado/transcripts/transcriptchecker.py index b3835b782..cdc9403d7 100644 --- a/Mikado/transcripts/transcriptchecker.py +++ b/Mikado/transcripts/transcriptchecker.py @@ -50,8 +50,9 @@ def __init__(self, gffline, seq, :param strand_specific: flag. If set, transcripts will not have their strand changed. :type strand_specific: bool - :param lenient: boolean flag. If set, incorrect transcripts will be - flagged rather than discarded. + :param lenient: boolean flag. If set, incorrect transcripts will be flagged rather than discarded. + Please note that this flag will *not* be passed on to the BED12 constructor when checking the + ORFs; Mikado prepare requires correct CDSs, if one has to be present at all. :type lenient: bool """ self.__strand_specific = False @@ -324,19 +325,20 @@ def check_orf(self): self.finalize() else: self.logger.warning("Transcript %s has an invalid CDS and must therefore be discarded.", self.id) + raise InvalidTranscript("Transcript %s has an invalid CDS and must therefore be discarded.") if self.is_coding is False: return try: - orfs = list(self.get_internal_orf_beds()) - assert len(orfs) >= 1 - except AssertionError as exc: # Invalid ORFs found + orfs = self.orfs + if len(orfs) < 1: + raise AssertionError("No ORFs remaining.") + except AssertionError as exc: if self.strip_faulty_cds is False: - raise InvalidTranscript("Invalid ORF(s) for {}. Discarding it. Error: {}".format(self.id, exc)) + raise InvalidTranscript(f"Invalid ORF(s) for {self.id}. Discarding it. Error: {exc}") else: - self.logger.warning("Invalid ORF(s) for %s. Stripping it of its CDS. Error: %s", - self.id, exc) + self.logger.warning(f"Invalid ORF(s) for {self.id}. Stripping it of its CDS. Error: {exc}") self.strip_cds() return @@ -361,7 +363,9 @@ def check_orf(self): self.has_stop_codon = orf.has_stop_codon self.attributes["has_start_codon"] = orf.has_start_codon self.attributes["has_stop_codon"] = orf.has_stop_codon - return + + self.strip_cds() + self.load_orfs(orfs) @property def cdna(self) -> str: diff --git a/README.md b/README.md index 330bfba45..e47db5f9d 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![GitHub Downloads](https://img.shields.io/github/downloads/EI-CoreBioinformatics/mikado/total.svg?style=social&logo=github&label=download)](https://github.com/EI-CoreBioinformatics/mikado/releases) [![Release](https://img.shields.io/github/release/EI-CoreBioinformatics/mikado.svg)](https://github.com/EI-CoreBioinformatics/mikado/releases) [![PyPI](https://img.shields.io/pypi/v/mikado.svg?style=flat)](https://pypi.python.org/pypi/mikado) -[![Build Status](https://travis-ci.org/lucventurini/mikado.svg?branch=master)](https://travis-ci.org/lucventurini/mikado) +[![Build Status](https://github.com/EI-CoreBioinformatics/mikado/workflows/Mikado/badge.svg)](https://github.com/EI-CoreBioinformatics/mikado/actions/workflows/python-package.yml) # Mikado - pick your transcript: a pipeline to determine and select the best RNA-Seq prediction @@ -69,10 +69,8 @@ If you use Mikado in your work, please consider to cite: > Venturini L., Caim S., Kaithakottil G., Mapleson D.L., Swarbreck D. Leveraging multiple transcriptome assembly methods for improved gene structure annotation. GigaScience, Volume 7, Issue 8, 1 August 2018, giy093, [doi:10.1093/gigascience/giy093](https://doi.org/10.1093/gigascience/giy093) - If you also use Portcullis to provide reliable junctions to Mikado, either independently or as part of the Daijin pipeline, please consider to cite: - > Mapleson D.L., Venturini L., Kaithakottil G., Swarbreck D. Efficient and accurate detection of splice junctions from RNAseq with Portcullis. GigaScience, Volume 7, Issue 12, 12 December 2018, giy131, [doi:10.1093/gigascience/giy131](https://doi.org/10.1093/gigascience/giy131) [Portcullis]: https://github.com/maplesond/portcullis diff --git a/docs/Algorithms.rst b/docs/Algorithms.rst index 60f699c62..9be0a91d3 100644 --- a/docs/Algorithms.rst +++ b/docs/Algorithms.rst @@ -122,13 +122,11 @@ If any BLAST hit *spans* the two ORFs, then the model will be considered as a no These modes can be controlled directly from the :ref:`pick command line `, or during the :ref:`initial configuration stage `. -.. _scoring_files: +.. _scoring_algorithm: Transcript measurements and scoring ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _scoring_algorithm: - In order to determine the best transcript for each locus, Mikado measures each available candidate according to various different :ref:`metrics ` and assigning a specific score for each of those. Similarly to `RAMPART `_ [Rampart]_, Mikado will assign a score to each transcript for each metric by assessing it relatively to the other transcripts in the locus. The particular feature rescaling equation used for a given metric depends on the type of feature it represents: * metrics where higher values represent better transcript assemblies ("maximum"). @@ -148,525 +146,11 @@ Finally, the scores for each metric will be summed up to produce a final score f :math:`s_{t} = \sum_{m \forall m \in M} s_{mt}`. Not all the available metrics will be necessarily used for scoring; the choice of which to employ and how to score and weight each of them is left to the experimenter, although Mikado provides some pre-configured scoring files. -Values that are guaranteed to be between 0 and 1 (e.g. a percentage value) can be used directly as scores, by setting the *use_raw* parameter as true for them (see below). - -.. important:: The scoring algorithm is dependent on the other transcripts in the locus, so each score should not be taken as an *absolute* measure of the reliability of a transcript, but rather as a measure of its **relative goodness compared with the alternatives**. Shifting a transcript from one locus to another can have dramatic effects on the scoring of a transcript, even while the underlying metric values remain unchanged. This is why the score assigned to each transcript changes throughout the Mikado run, as transcripts are moved to subloci, monoloci and finally loci. - -Scoring files -~~~~~~~~~~~~~ - -Mikado employs user-defined configuration files to define the desirable features in genes. These files are in either YAML or JSON format (default YAML) and are composed of four sections: - - #. a *requirements* section, specifying the minimum requirements that a transcript must satisfy to be considered as valid. **Any transcript failing these requirements will be scored at 0 and purged.** - #. a *not_fragmentary* section, specifying the minimum requirements that the primary transcript of a locus has to satisfy in order for the locus **not** to be considered as a putative fragment. - #. an *as_requirements* section, which specifies the minimum requirements for transcripts for them to be considered as possible valid alternative splicing events. - #. a *scoring* section, specifying which features Mikado should look for in transcripts, and how each of them will be weighted. - -Conditions are specified using a strict set of :ref:`available operators ` and the values they have to consider. - -.. important:: Although at the moment Mikado does not offer any method to establish machine-learning based scoring configurations, it is a topic we plan to investigate in the future. Mikado already supports `Random Forest Regressors as scorers through Scikit-learn `_, but we have yet to devise a proper way to create such regressors. - -We provide a guide on how to write your own scoring files in a :ref:`separate tutorial `. - -.. _operators: - -Operators ---------- - -Mikado allows the following operators to express a relationship inside the scoring files: - -* *eq*: equal to (:math:`=`). Valid for comparisons with numbers, boolean values, and strings. -* *ne*: different from (:math:`\neq`). Valid for comparisons with numbers, boolean values, and strings. -* *lt*: less than (:math:`<`). Valid for comparisons with numbers. -* *gt*: greater than (:math:`>`). Valid for comparisons with numbers. -* *le*: less or equal than (:math:`\le`). Valid for comparisons with numbers. -* *ge*: greater or equal than (:math:`\ge`). Valid for comparisons with numbers. -* *in*: member of (:math:`\in`). Valid for comparisons with arrays or sets. -* *not in*: not member of (:math:`\notin`). Valid for comparisons with arrays or sets. -* *within*: value comprised in the range of the two values, inclusive. -* *not within*: value *not* comprised in the range of the two values, inclusive. - -Mikado will fail if an operator not present on this list is specified, or if the operator is assigned to compare against the wrong data type (eg. *eq* with an array). - -.. _requirements-section: - -The "requirements", "as_requirements" and "not_fragmentary" sections --------------------------------------------------------------------- - -These sections specifies the minimum requirements for a transcript at various stages. - -* A transcript failing to pass the *requirements* check will be discarded outright (if "purge" is selected) or given a score of 0 otherwise. -* If a transcript has not been selected as the primary transcript of a locus, it has to pass the *as_requirements* check to be considered as a valid alternative splicing event. -* Finally, after loci have been defined, the primary transcripts of loci that do not pass the *not_fragmentary* section mark their loci to be compared against neighbouring loci which have passed this same check. - -**It is strongly advised to use lenient parameters in the requirements section**, as failing to do so might result in discarding whole loci. Typically, transcripts filtered at this step should be obvious fragments, eg monoexonic transcripts produced by RNA-Seq with a total length lower than the *library* fragment length. -This section is composed by two parts: - -* *parameters*: a list of the metrics to be considered. Each metric can be considered multiple times, by suffixing it with a "." construct (eg cdna_length.*mono* vs. cdna_length.*multi* to distinguish two uses of the cdna_length metric - once for monoexonic and once for multiexonic transcripts). Any parameter which is not a :ref:`valid metric name `, after removal of the suffix, **will cause an error**. Parameters have to specify the following: - - * a *value* that the metric has to be compared against - * an *operator* that specifies the target operation. See :ref:`the operators section `. - -* *expression*: a string array that will be compiled into a valid boolean expression. All the metrics present in the expression string **must be present in the parameters section**. If an unrecognized metric is present, Mikado will crash immediately, complaining that the scoring file is invalid. Apart from brackets, Mikado accepts only the following boolean operators to chain the metrics: - - * *and* - * *or* - * *not* - * *xor* - -.. hint:: if no *expression* is specified, Mikado will construct one by chaining all the provided parameters with and *and* operator. Most of the time, this would result in an unexpected behaviour - ie Mikado assigning a score of 0 to most transcripts. It is **strongly advised** to explicitly provide a valid expression. - -As an example, the following snippet replicates a typical requirements section found in a scoring file: - -.. code-block:: yaml - - requirements: - expression: [((exon_num.multi and cdna_length.multi and max_intron_length and min_intron_length), or, - (exon_num.mono and cdna_length.mono))] - parameters: - cdna_length.mono: {operator: gt, value: 50} - cdna_length.multi: {operator: ge, value: 100} - exon_num.mono: {operator: eq, value: 1} - exon_num.multi: {operator: gt, value: 1} - max_intron_length: {operator: le, value: 20000} - min_intron_length: {operator: ge, value: 5} - -In the parameters section, we ask for the following: - - * *exon_num.mono*: monoexonic transcripts must have one exon ("eq") - * *exon_num.multi*: multiexonic transcripts must have more than one exon ("gt") - * *cdna_length.mono*: monoexonic transcripts must have a length greater than 50 bps (the ".mono" suffix is arbitrary, as long as it is unique for all calls of *cdna_length*) - * *cdna_length.multi*: multiexonic transcripts must have a length greater than or equal to 100 bps (the ".multi" suffix is arbitrary, as long as it is unique for all calls of *cdna_length*) - * *max_intron_length*: multiexonic transcripts should not have any intron longer than 200,000 bps. - * *min_intron_length*: multiexonic transcripts should not have any intron smaller than 5 bps. - -The *expression* field will be compiled into the following expression:: - - (exon_num > 1 and cdna_length >= 100 and max_intron_length <= 200000 and min_intron_length >= 5) or (exon_num == 1 and cdna_length > 50) - - -Any transcript for which the expression evaluates to ``false`` will be assigned a score of 0 outright and discarded, unless the user has chosen to disable the purging of such transcripts. - -.. _scoring-section: - -The scoring section -------------------- - -This section specifies which metrics will be used by Mikado to score the transcripts. Each metric to be used is specified as a subsection of the configuration, and will have the following attributes: - -* *rescaling*: the only compulsory attribute. It specifies the kind of scoring that will be applied to the metric, and it has to be one of "max", "min", or "target". See :ref:`the explanation on the scoring algorithm ` for details. -* *value*: compulsory if the chosen rescaling algorithm is "target". This should be either a number or a boolean value. -* *multiplier*: the weight assigned to the metric in terms of scoring. This parameter is optional; if absent, as it is in the majority of cases, Mikado will consider the multiplier to equal to 1. This is the :math:`w_{m}` element in the :ref:`equations above `. -* *filter*: It is possible to specify a filter which the metric has to fulfill to be considered for scoring, eg, "cdna_length >= 200". If the transcript fails to pass this filter, the score *for this metric only* will be set to 0. A "filter" subsection has to specify the following: - - * *operator*: the operator to apply for the boolean expression. See the :ref:`relative section `. - * *value*: value that will be used to assess the metric. - -.. hint:: the purpose of the *filter* section is to allow for fine-tuning of the scoring mechanism; ie it allows to penalise transcripts with undesirable qualities (eg a possible retained intron) without discarding them outright. As such, it is a less harsh version of the :ref:`requirements section ` and it is the preferred way of specifying which transcript features Mikado should be wary of. - -For example, this is a snippet of a scoring section: - -.. code-block:: yaml +Values that are guaranteed to be between 0 and 1 (e.g. a percentage value) can be used directly as scores, by setting the *use_raw* parameter as true for them. - scoring: - blast_score: {rescaling: max} - cds_not_maximal: {rescaling: min} - combined_cds_fraction: {rescaling: target, value: 0.8, multiplier: 2} - five_utr_length: - filter: {operator: le, value: 2500} - rescaling: target - value: 100 - end_distance_from_junction: - filter: {operator: lt, value: 55} - rescaling: min - non_verified_introns_num: - rescaling: max - multiplier: -10 - filter: - operator: gt - value: 1 - metric: exons_num - - -Using this snippet as a guide, Mikado will score transcripts in each locus as follows: - -* Assign a full score (one point, as no multiplier is specified) to transcripts which have the greatest *blast_score* -* Assign a full score (one point, as no multiplier is specified) to transcripts which have the lowest amount of CDS bases in secondary ORFs (*cds_not_maximal*) -* Assign a full score (**two points**, as a multiplier of 2 is specified) to transcripts that have a total amount of CDS bps approximating 80% of the transcript cDNA length (*combined_cds_fraction*) -* Assign a full score (one point, as no multiplier is specified) to transcripts that have a 5' UTR whose length is nearest to 100 bps (*five_utr_length*); if the 5' UTR is longer than 2,500 bps, this score will be 0 (see the filter section) -* Assign a full score (one point, as no multiplier is specified) to transcripts which have the lowest distance between the CDS end and the most downstream exon-exon junction (*end_distance_from_junction*). If such a distance is greater than 55 bps, assign a score of 0, as it is a probable target for NMD (see the filter section). -* Assign a maximum penalty (**minus 10 points**, as a **negative** multiplier is specified) to the transcript with the highest number of non-verified introns in the locus. - - * Again, we are using a "filter" section to define which transcripts will be exempted from this scoring (in this case, a penalty) - * However, please note that we are using the keyword **metric** in this section. This tells Mikado to check a *different* metric for evaluating the filter. Nominally, in this case we are excluding from the penalty any *monoexonic* transcript. This makes sense as a monoexonic transcript will never have an intron to be confirmed to start with. - -.. note:: The possibility of using different metrics for the "filter" section is present from Mikado 1.3 onwards. - -.. _Metrics: - -Metrics -~~~~~~~ - -These are all the metrics available to quantify transcripts. The documentation for this section has been generated using the :ref:`metrics utility `. - -Metrics belong to one of the following categories: - -* **Descriptive**: these metrics merely provide a description of the transcript (eg its ID) and are not used for scoring. - -* **cDNA**: these metrics refer to basic features of any transcript such as its number of exons, its cDNA length, etc. - -* **Intron**: these metrics refer to features related to the number of introns and their lengths. - -* **CDS**: these metrics refer to features related to the CDS assigned to the transcript. - -* **UTR**: these metrics refer to features related to the UTR of the transcript. In the case in which a transcript has been assigned multiple ORFs, unless otherwise stated the UTR metrics will be derived only considering the *selected* ORF, not the combination of all of them. - -* **Locus**: these metrics refer to features of the transcript in relationship to all other transcripts in its locus, eg how many of the introns present in the locus are present in the transcript. These metrics are calculated by Mikado during the picking phase, and as such their value can vary during the different stages as the transcripts are shifted to different groups. - -* **External**: these metrics are derived from accessory data that is recovered for the transcript during the run time. Examples include data regarding the number of introns confirmed by external programs such as PortCullis, or the BLAST score of the best hits. - -.. hint:: Starting from version 1 beta8, Mikado allows to use externally defined metrics for the transcripts. These can be accessed using the keyword "external." within the configuration file. See the :ref:`relevant section ` for details. - -.. important:: Starting from Mikado 1 beta 8, it is possible to use metrics with values between 0 and 1 directly as scores, without rescaling. This feature is available only for metrics whose values naturally lie between 0 and 1, or that are boolean in nature. - -.. topic:: Available metrics - -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| Metric name | Description | Category | Data type | Usable raw | -+=====================================+===========================================================+=============+=============+==============+ -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| tid | ID of the transcript - cannot be an undefined value. | Descriptive | str | False | -| | Alias of id. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| parent | Name of the parent feature of the transcript. | Descriptive | str | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| score | Numerical value which summarizes the reliability of the | Descriptive | str | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| external_scores | **SPECIAL** this Namespace contains all the information | External | Namespace | True | -| | regarding external scores for the transcript. If an | | | | -| | absent property is not defined in the Namespace, Mikado | | | | -| | will set a default value of 0 into the Namespace and | | | | -| | return it. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| best_bits | Metric that returns the best BitS associated with the | External | float | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| blast_identity | This metric will return the alignment identity for the | External | float | True | -| | best BLAST hit according to the evalue. If no BLAST hits | | | | -| | are available for the sequence, it will return 0. | | | | -| | :return: :return: | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| blast_query_coverage | This metric will return the **query** coverage for the | External | float | True | -| | best BLAST hit according to the evalue. If no BLAST hits | | | | -| | are available for the sequence, it will return 0. | | | | -| | :return: | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| blast_score | Interchangeable alias for testing different blast-related | External | float | False | -| | scores. Current: best bit score. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| blast_target_coverage | This metric will return the **target** coverage for the | External | float | True | -| | best BLAST hit according to the evalue. If no BLAST hits | | | | -| | are available for the sequence, it will return 0. | | | | -| | :return: :return: | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| canonical_intron_proportion | This metric returns the proportion of canonical introns | Intron | float | True | -| | of the transcript on its total number of introns. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| cdna_length | This property returns the length of the transcript. | cDNA | int | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| cds_not_maximal | This property returns the length of the CDS excluded from | CDS | int | False | -| | the selected ORF. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| cds_not_maximal_fraction | This property returns the fraction of bases not in the | CDS | float | True | -| | selected ORF compared to the total number of CDS bases in | | | | -| | the cDNA. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_cds_fraction | This property return the percentage of the CDS part of | CDS | float | True | -| | the transcript vs. the cDNA length | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_cds_intron_fraction | This property returns the fraction of CDS introns of the | Locus | | True | -| | transcript vs. the total number of CDS introns in the | | | | -| | Locus. If the transcript is by itself, it returns 1. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_cds_length | This property return the length of the CDS part of the | CDS | int | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_cds_locus_fraction | This metric returns the fraction of CDS bases of the | Locus | float | True | -| | transcript vs. the total of CDS bases in the locus. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_cds_num | This property returns the number of non-overlapping CDS | CDS | int | False | -| | segments in the transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_cds_num_fraction | This property returns the fraction of non-overlapping CDS | CDS | float | True | -| | segments in the transcript vs. the total number of exons | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_utr_fraction | This property returns the fraction of the cDNA which is | UTR | float | True | -| | not coding according to any ORF. Complement of | | | | -| | combined_cds_fraction | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| combined_utr_length | This property return the length of the UTR part of the | UTR | int | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| end_distance_from_junction | This metric returns the cDNA distance between the stop | CDS | int | False | -| | codon and the last junction in the transcript. In many | | | | -| | eukaryotes, this distance cannot exceed 50-55 bps | | | | -| | otherwise the transcript becomes a target of NMD. If the | | | | -| | transcript is not coding or there is no junction | | | | -| | downstream of the stop codon, the metric returns 0. This | | | | -| | metric considers the combined CDS end. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| end_distance_from_tes | This property returns the distance of the end of the | CDS | int | False | -| | combined CDS from the transcript end site. If no CDS is | | | | -| | defined, it defaults to 0. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| exon_fraction | This property returns the fraction of exons of the | Locus | float | True | -| | transcript which are contained in the sublocus. If the | | | | -| | transcript is by itself, it returns 1. Set from outside. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| exon_num | This property returns the number of exons of the | cDNA | int | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| five_utr_length | Returns the length of the 5' UTR of the selected ORF. | UTR | | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| five_utr_num | This property returns the number of 5' UTR segments for | UTR | int | False | -| | the selected ORF. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| five_utr_num_complete | This property returns the number of 5' UTR segments for | UTR | int | False | -| | the selected ORF, considering only those which are | | | | -| | complete exons. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| has_start_codon | Boolean. True if the selected ORF has a start codon. | CDS | bool | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| has_stop_codon | Boolean. True if the selected ORF has a stop codon. | CDS | bool | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| highest_cds_exon_number | This property returns the maximum number of CDS segments | CDS | int | False | -| | among the ORFs; this number can refer to an ORF | | | | -| | *DIFFERENT* from the maximal ORF. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| highest_cds_exons_num | Returns the number of CDS segments in the selected ORF | CDS | int | False | -| | (irrespective of the number of exons involved) | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| intron_fraction | This property returns the fraction of introns of the | Locus | float | True | -| | transcript vs. the total number of introns in the Locus. | | | | -| | If the transcript is by itself, it returns 1. Set from | | | | -| | outside. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| is_complete | Boolean. True if the selected ORF has both start and end. | CDS | bool | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| max_exon_length | This metric will return the length of the biggest exon in | cDNA | int | False | -| | the transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| max_intron_length | This property returns the greatest intron length for the | Intron | int | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| min_exon_length | This metric will return the length of the biggest exon in | | | False | -| | the transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| min_intron_length | This property returns the smallest intron length for the | Intron | int | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| non_verified_introns_num | This metric returns the number of introns of the | External | int | False | -| | transcript which are not validated by external data. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| num_introns_greater_than_max | This metric returns the number of introns greater than | Intron | int | False | -| | the maximum acceptable intron size indicated in the | | | | -| | constructor. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| num_introns_smaller_than_min | This metric returns the number of introns smaller than | Intron | int | False | -| | the mininum acceptable intron size indicated in the | | | | -| | constructor. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| number_internal_orfs | This property returns the number of ORFs inside a | CDS | int | False | -| | transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| only_non_canonical_splicing | This metric will return True if the canonical_number is 0 | Intron | bool | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| proportion_verified_introns | This metric returns, as a fraction, how many of the | External | float | True | -| | transcript introns are validated by external data. | | | | -| | Monoexonic transcripts are set to 1. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| proportion_verified_introns_inlocus | This metric returns, as a fraction, how many of the | Locus | float | True | -| | verified introns inside the Locus are contained inside | | | | -| | the transcript. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| retained_fraction | This property returns the fraction of the cDNA which is | Locus | float | True | -| | contained in retained introns. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| retained_intron_num | This property records the number of introns in the | Locus | int | False | -| | transcripts which are marked as being retained. See the | | | | -| | corresponding method in the sublocus class. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_exons_fraction | Returns the fraction of CDS segments in the selected ORF | CDS | float | True | -| | (irrespective of the number of exons involved) | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_fraction | This property calculates the fraction of the selected CDS | CDS | float | True | -| | vs. the cDNA length. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_intron_fraction | This property returns the fraction of CDS introns of the | CDS | float | True | -| | selected ORF of the transcript vs. the total number of | | | | -| | CDS introns in the Locus (considering only the selected | | | | -| | ORF). If the transcript is by itself, it should return 1. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_length | This property calculates the length of the CDS selected | CDS | int | False | -| | as best inside the cDNA. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_locus_fraction | This metric returns the fraction of CDS bases of the | Locus | float | True | -| | transcript vs. the total of CDS bases in the locus. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_num | This property calculates the number of CDS exons for the | CDS | int | False | -| | selected ORF | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_cds_number_fraction | This property returns the proportion of best possible CDS | CDS | float | False | -| | segments vs. the number of exons. See | | | | -| | selected_cds_number. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_end_distance_from_junction | This metric returns the distance between the stop codon | CDS | int | False | -| | and the last junction of the transcript. In many | | | | -| | eukaryotes, this distance cannot exceed 50-55 bps, | | | | -| | otherwise the transcript becomes a target of NMD. If the | | | | -| | transcript is not coding or there is no junction | | | | -| | downstream of the stop codon, the metric returns 0. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_end_distance_from_tes | This property returns the distance of the end of the best | CDS | int | False | -| | CDS from the transcript end site. If no CDS is defined, | | | | -| | it defaults to 0. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| selected_start_distance_from_tss | This property returns the distance of the start of the | CDS | int | False | -| | best CDS from the transcript start site. If no CDS is | | | | -| | defined, it defaults to 0. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| snowy_blast_score | Metric that indicates how good a hit is compared to the | External | float | False | -| | competition, in terms of BLAST similarities. As in | | | | -| | SnowyOwl, the score for each hit is calculated by taking | | | | -| | the coverage of the target and dividing it by (2 * | | | | -| | len(self.blast_hits)). IMPORTANT: when splitting | | | | -| | transcripts by ORF, a blast hit is added to the new | | | | -| | transcript only if it is contained within the new | | | | -| | transcript. This WILL screw up a bit the homology score. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| source_score | This metric returns a score that is assigned to the | External | int | False | -| | transcript in virtue of its origin. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| start_distance_from_tss | This property returns the distance of the start of the | CDS | int | False | -| | combined CDS from the transcript start site. If no CDS is | | | | -| | defined, it defaults to 0. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| suspicious_splicing | This metric will return True if the transcript either has | Intron | bool | False | -| | canonical introns on both strands (probably a chimeric | | | | -| | artifact between two neighbouring loci, or if it has no | | | | -| | canonical splicing event but it would if it were assigned | | | | -| | to the opposite strand (probably a strand misassignment | | | | -| | on the part of the assembler/predictor). | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| three_utr_length | Returns the length of the 5' UTR of the selected ORF. | | int | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| three_utr_num | This property returns the number of 3' UTR segments | UTR | int | False | -| | (referred to the selected ORF). | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| three_utr_num_complete | This property returns the number of 3' UTR segments for | UTR | int | False | -| | the selected ORF, considering only those which are | | | | -| | complete exons. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| utr_fraction | This property calculates the length of the UTR of the | UTR | float | True | -| | selected ORF vs. the cDNA length. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| utr_length | Returns the sum of the 5'+3' UTR lengths | UTR | int | False | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| utr_num | Returns the number of UTR segments (referred to the | UTR | int | False | -| | selected ORF). | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| utr_num_complete | Returns the number of UTR segments which are complete | UTR | int | False | -| | exons (referred to the selected ORF). | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ -| verified_introns_num | This metric returns the number of introns of the | External | int | False | -| | transcript which are validated by external data. | | | | -+-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ - - -.. _external-metrics: - -External metrics -~~~~~~~~~~~~~~~~ - -Starting from version 1 beta 8, Mikado allows to load external metrics into the database, to be used for evaluating transcripts. Metrics of this kind **must** have a value comprised between 0 and 1. -The file can be provided either by specifying it in the :ref:`coonfiguration file `, under "serialise/files/external_scores", or on the command line with the "--external-scores" parameters to mikado :ref:`serialise `. -The external scores file should have the following format: - -+--------------------------+------------------+------------------+-------+-----------------+ -| TID | Metric_one | Metric_two | ... | Metric_N | -+==========================+==================+==================+=======+=================+ -| Transcript_one | value | value | | value | -+--------------------------+------------------+------------------+-------+-----------------+ -| Transcript_two | value | value | | value | -+--------------------------+------------------+------------------+-------+-----------------+ -| ... | ... | ... | | ... | -+--------------------------+------------------+------------------+-------+-----------------+ -| Transcript_N | value | value | | value | -+--------------------------+------------------+------------------+-------+-----------------+ - - -Please note the following: - -* the header is mandatory. -* the metric names at the head of the table should **not** contain any space or spcecial characters, apart from the underscore (_) -* the header provides the name of the metric as will be seen by Mikado. As such, it is advised to choose sensible and informative names (e.g. "fraction_covered") rather than uninformative ones (e.g. the "metric_one" from above) -* Column names **must be unique**. -* The transcript names present in the first column **must** be present in the FASTA file. -* The table should be tab-separated. -* Values can be of any numerical or boolean type. However, only values that are determined **at serialisation** to be comprised within 0 and 1 (inclusive) can be used as raw values. - -A proper way of generating and using external scores would, therefore, be the following: - -* Run Mikado prepare on the input dataset. -* Run all necessary supplementary analyses (ORF calling and/or homology analysis through DIAMOND or BLAST). -* Run supplementary analyses to assess the transcripts, e.g. expression analysis. Normalise results so that they can be expressed with values between 0 and 1. - - * Please note that boolean results (e.g. presence or absence) can be expressed with 0 and 1 intead of "False" and "True". Customarily, in Python 0 stands for False and 1 for True, but you can choose to switch the order if you so desire. -* Aggregate all results in a text table, like the one above, tab separated. -* Call mikado serialise, specifying the location of this table either through the configuration file or on the command line invocation. - -Given the open ended nature of the external scores, the Daijin pipeline currently does not offer any system to generate these scores. This might change in the future. - -Adding external scores to the scoring file ------------------------------------------- - -Once the external metrics have been properly loaded, it is necessary to tell Mikado how to use them. This requires :ref:`modifying the scoring file itself `. The header that we used in the table above does provide the names of the metrics as they will be seen by Mikado. - -Let us say that we have performed an expression analysis on our transcripts, and we have created and loaded the following three metrics: - -* "fraction_covered", ie the percentage of the transcript covered by at least X reads (where X is decided by the experimenter) -* "samples_expressed", ie the percentage of samples where the transcript was expressed over a certain threshold (e.g. 1 TPM) -* "has_coverage_gaps", ie a boolean metrics that indicates whether there are windows *within* the transcript that lowly or not at all covered (e.g. a 100bp stretch with no coverage between two highly covered regions, indicating a possilble intron retention or chimera). For this example, a value of "0" indicates that there no coverage gaps (ie. it is *False* that there are gaps), "1" otherwise (it is *True* that there are coverage gaps). - -We can now use these metrics as normal, by invoking them as "external." followed by the name of the metrics: e.g., "external.fraction_covered". -So for example, if we wished to prioritise transcripts that are expressed in the highest number of samples and are completely covered by RNASeq data upon reads realignment, under "scoring", we can add the following: - -.. code-block:: yaml - - scoring: - # [ ... other metrics ... ] - - external.samples_expressed: {rescaling: max} - - external.fraction_covered: {rescaling: max} - -And if we wanted to consider any primary transcript with coverage gaps as a potential fragment, under the "fragmentary" section we could do so: - -.. code-block:: yaml - - not_fragmentary: - expression: - # other metrics .. - - and (external.has_coverage_gaps) - # Finished expression - parameters: - # other metrics ... - external.has_coverage_gaps: {operator: eq, value: 0} # Please note, again, that here "0" means "no coverage gaps detected". - # other metrics ... - -As external metrics allow Mikado to accept any arbitrary metric for each transcript, they allow the program to assess transcripts in any way the experimenter desires. However, currently we do not provide any way of automating the process. - -.. note:: also for external metrics, it is necessary to add a suffix to them if they are invoked more than once in an expression (see the :ref:`tutorial `). An invocation of e.g. "external.samples_expressed.mono" and "external.samples_expressed.multi", to distinguish between monoexonic and multiexonic transcripts, would be perfectly valid and actually *required* by Mikado. Notice the double use of the dot (".") as separator. Its usage as such is the reason that it cannot be present in the name of the metric itself (so, for example, "has.coverage.gaps" would be an invalid metric name). +Details on the structure of scoring files can be found :ref:`in a dedicated section `; we also provide a tutorial on :ref:`how to create your own scoring file `. +.. important:: The scoring algorithm is dependent on the other transcripts in the locus, so each score should not be taken as an *absolute* measure of the reliability of a transcript, but rather as a measure of its **relative goodness compared with the alternatives**. Shifting a transcript from one locus to another can have dramatic effects on the scoring of a transcript, even while most or all of the underlying metric values remain unchanged. This is why the score assigned to each transcript changes throughout the Mikado run, as transcripts are moved to subloci, monoloci and finally loci. .. _padding: @@ -679,7 +163,8 @@ of missing exons from neighbouring data. The procedure is similar to the one emp 1. A transcript can function as **template** for a candidate if: - the candidate's terminal exon falls within an **exon** of the template - - the extension would enlarge the candidate by at most *"ts_distance"* basepairs (not including introns), default **1000** bps + - the extension would enlarge the candidate by at most *"ts_distance"* basepairs (not including introns), default + **2000** bps - the extension would add at most *"ts_max_splices"* splice sites to the candidate, default **2**. 2. A graph of possible extensions is built for both the 5' end and the 3' end of the locus. Transcripts are then divided in extension groups, starting with the outmost (ie the potential **template** for the group). Links that would cause chains diff --git a/docs/Library/Mikado.daijin.rst b/docs/Library/Mikado.daijin.rst index e4f32dc1e..f6feb53cc 100644 --- a/docs/Library/Mikado.daijin.rst +++ b/docs/Library/Mikado.daijin.rst @@ -1,6 +1,15 @@ Mikado.daijin package ===================== +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + Mikado.daijin.assemble + Mikado.daijin.mikado + Module contents --------------- diff --git a/docs/Library/Mikado.parsers.rst b/docs/Library/Mikado.parsers.rst index 40fa0f14d..c3cba9bc2 100644 --- a/docs/Library/Mikado.parsers.rst +++ b/docs/Library/Mikado.parsers.rst @@ -13,6 +13,7 @@ Submodules Mikado.parsers.bed12 Mikado.parsers.blast_utils Mikado.parsers.gfannotation + Mikado.parsers.parser Module contents --------------- diff --git a/docs/Library/Mikado.scales.rst b/docs/Library/Mikado.scales.rst index e3aefd2ee..919c6a4aa 100644 --- a/docs/Library/Mikado.scales.rst +++ b/docs/Library/Mikado.scales.rst @@ -22,6 +22,8 @@ Submodules Mikado.scales.class_codes Mikado.scales.compare Mikado.scales.contrast + Mikado.scales.contrast + Mikado.scales.contrast Mikado.scales.resultstorer Module contents diff --git a/docs/Library/Mikado.serializers.blast_serializer.rst b/docs/Library/Mikado.serializers.blast_serializer.rst index 5bcec5af8..10d241c4f 100644 --- a/docs/Library/Mikado.serializers.blast_serializer.rst +++ b/docs/Library/Mikado.serializers.blast_serializer.rst @@ -7,9 +7,13 @@ Submodules .. toctree:: :maxdepth: 4 + Mikado.serializers.blast_serializer.aln_string_parser + Mikado.serializers.blast_serializer.aln_string_parser Mikado.serializers.blast_serializer.aln_string_parser Mikado.serializers.blast_serializer.blast_serialiser Mikado.serializers.blast_serializer.btop_parser + Mikado.serializers.blast_serializer.btop_parser + Mikado.serializers.blast_serializer.btop_parser Mikado.serializers.blast_serializer.hit Mikado.serializers.blast_serializer.hsp Mikado.serializers.blast_serializer.query diff --git a/docs/Library/Mikado.subprograms.util.rst b/docs/Library/Mikado.subprograms.util.rst index 0759b3b1a..6f7faccd9 100644 --- a/docs/Library/Mikado.subprograms.util.rst +++ b/docs/Library/Mikado.subprograms.util.rst @@ -9,6 +9,7 @@ Submodules Mikado.subprograms.util.awk_gtf Mikado.subprograms.util.class_codes + Mikado.subprograms.util.collapse_compare_stats Mikado.subprograms.util.convert Mikado.subprograms.util.grep Mikado.subprograms.util.metrics diff --git a/docs/Library/Mikado.transcripts.rst b/docs/Library/Mikado.transcripts.rst index 604f4a730..8407dd2d6 100644 --- a/docs/Library/Mikado.transcripts.rst +++ b/docs/Library/Mikado.transcripts.rst @@ -15,6 +15,7 @@ Submodules .. toctree:: :maxdepth: 4 + Mikado.transcripts.pad Mikado.transcripts.reference_gene Mikado.transcripts.transcript Mikado.transcripts.transcriptchecker diff --git a/docs/Library/Mikado.utilities.rst b/docs/Library/Mikado.utilities.rst index 8ea346127..6c9ac6c49 100644 --- a/docs/Library/Mikado.utilities.rst +++ b/docs/Library/Mikado.utilities.rst @@ -9,11 +9,17 @@ Submodules Mikado.utilities.dbutils Mikado.utilities.f1 + Mikado.utilities.f1 + Mikado.utilities.f1 Mikado.utilities.file_type Mikado.utilities.intervaltree + Mikado.utilities.intervaltree + Mikado.utilities.intervaltree Mikado.utilities.log_utils Mikado.utilities.namespace Mikado.utilities.overlap + Mikado.utilities.overlap + Mikado.utilities.overlap Module contents --------------- diff --git a/docs/References.rst b/docs/References.rst index 97b8c6141..1ccc8e5be 100644 --- a/docs/References.rst +++ b/docs/References.rst @@ -44,4 +44,10 @@ References .. [PYinterval] https://github.com/chaimleib/intervaltree .. [BXPython] https://bitbucket.org/james_taylor/bx-python/overview .. [Snakeviz] https://jiffyclub.github.io/snakeviz/ -.. [PASA] **Improving the Arabidopsis genome annotation using maximal transcript alignment assemblies** Haas, B.J., Delcher, A.L., Mount, S.M., Wortman, J.R., Smith Jr, R.K., Jr., Hannick, L.I., Maiti, R., Ronning, C.M., Rusch, D.B., Town, C.D. et al. *Nucleic Acids Res*, 2003, 31, 5654-5666. doi:10.1093/nar/gkg770 \ No newline at end of file +.. [PASA] **Improving the Arabidopsis genome annotation using maximal transcript alignment assemblies** Haas, B.J., Delcher, A.L., Mount, S.M., Wortman, J.R., Smith Jr, R.K., Jr., Hannick, L.I., Maiti, R., Ronning, C.M., Rusch, D.B., Town, C.D. et al. *Nucleic Acids Res*, 2003, 31, 5654-5666. doi:10.1093/nar/gkg770 +.. [GffRead] **GFF Utilities: GffRead and GffCompare** Pertea, G. and Pertea, M. *F1000*, 2020, 9, ISCB Comm J-304. doi:10.12688/f1000research.23297.2 +.. [Prodigal] **Prodigal: prokaryotic gene recognition and translation initiation site identification.** Hyatt, D., + Chen, GL., LoCascio, P.F. et al. BMC Bioinformatics 11, 119 (2010). doi:10.1186/1471-2105-11-119 +.. [TransDecoder] https://github.com/TransDecoder/TransDecoder/wiki +.. [Pandas] **pandas-dev/pandas: Pandas**, The pandas development team. Zenodo, March 2021. doi:10.5281/zenodo.4572994 +.. [PySAM] https://github.com/pysam-developers/pysam \ No newline at end of file diff --git a/docs/Scoring_files.rst b/docs/Scoring_files.rst new file mode 100644 index 000000000..27236c66d --- /dev/null +++ b/docs/Scoring_files.rst @@ -0,0 +1,640 @@ +.. _scoring_files: + +Scoring files +============= + +Mikado employs user-defined configuration files to define the desirable features in genes. These files are in TOML, YAML or JSON format (default YAML) and are composed of five sections: + + #. a *requirements* section, specifying the minimum requirements that a transcript must satisfy to be considered as valid. **Any transcript failing these requirements will be scored at 0 and purged.** + #. a *cds_requirements* section, specifying minimal conditions for a transcript to retain its CDS. If a transcript fails this check, it will be stripped of its coding component. If this section is not provided, the default will be to copy the *requirements* section above (in practice disabling it). + #. a *not_fragmentary* section, specifying the minimum requirements that the primary transcript of a locus has to satisfy in order for the locus **not** to be considered as a putative fragment. + #. an *as_requirements* section, which specifies the minimum requirements for transcripts for them to be considered as possible valid alternative splicing events. + #. a *scoring* section, specifying which features Mikado should look for in transcripts, and how each of them will be weighted. + +Conditions are specified using a strict set of :ref:`available operators ` and the values they have to consider. + +.. important:: Although at the moment Mikado does not offer any method to establish machine-learning based scoring configurations, it is a topic we plan to investigate in the future. Mikado already supports `Random Forest Regressors as scorers through Scikit-learn `_, but we have yet to devise a proper way to create such regressors. + +We provide a guide on how to write your own scoring files in a :ref:`separate tutorial `. + +.. _operators: + +Operators +~~~~~~~~~ + +Mikado allows the following operators to express a relationship inside the scoring files: + +* *eq*: equal to (:math:`=`). Valid for comparisons with numbers, boolean values, and strings. +* *ne*: different from (:math:`\neq`). Valid for comparisons with numbers, boolean values, and strings. +* *lt*: less than (:math:`<`). Valid for comparisons with numbers. +* *gt*: greater than (:math:`>`). Valid for comparisons with numbers. +* *le*: less or equal than (:math:`\le`). Valid for comparisons with numbers. +* *ge*: greater or equal than (:math:`\ge`). Valid for comparisons with numbers. +* *in*: member of (:math:`\in`). Valid for comparisons with arrays or sets. +* *not in*: not member of (:math:`\notin`). Valid for comparisons with arrays or sets. +* *within*: value comprised in the range of the two values, inclusive. +* *not within*: value *not* comprised in the range of the two values, inclusive. + +Mikado will fail if an operator not present on this list is specified, or if the operator is assigned to compare against the wrong data type (eg. *eq* with an array). + +.. _requirements-section: + +The "requirements", "cds_requirements", "as_requirements" and "not_fragmentary" sections +---------------------------------------------------------------------------------------- + +These sections specifies the minimum requirements for a transcript at various stages. + +* A transcript failing to pass the *requirements* check will be discarded outright (if "purge" is selected) or given a + score of 0 otherwise. +* After passing the *rquirements* section, if the transcript is coding, Mikado will check whether its CDS passes the + requirements specified in *cds_requirements*. If the check fails, the transcripts will be **stripped of its CDS**, and + will only be considered further as a non-coding transcript. *This check can be used to properly consider transcripts + that have a suspicious coding structure - e.g. a single coding exon in a transcript with five or more exons, or a low + Coding Potential score coming from a third-party tool*. +* If a transcript has not been selected as the primary transcript of a locus, it has to pass the *as_requirements* check + to be considered as a valid alternative splicing event. +* Finally, after loci have been defined, the primary transcript of each locus will be checked against the + *not_fragmentary* expression. Any locus failing this check will be marked as a potential fragment and, if in the + vicinity of other loci, might be purged out of the final output or clearly marked as a fragment (depending on whether + the *purge* switch is set to true or false, respectively). + +**It is strongly advised to use lenient parameters in the first requirements section**, as failing to do so might result +in discarding whole loci. Typically, transcripts filtered at this step should be obvious artefacts, eg monoexonic +transcripts produced by RNA-Seq with a total length lower than the *library* fragment length. + +Each of these sections follows the same template, and they are composed by two parts: + +* *parameters*: a list of the metrics to be considered. Each metric can be considered multiple times, by suffixing + it with a "." construct (eg cdna_length.*mono* vs. cdna_length.*multi* to distinguish two uses of the cdna_length + metric - once for monoexonic and once for multiexonic transcripts). Any parameter which is not a :ref:`valid metric + name `, after removal of the suffix, **will cause an error**. Parameters have to specify the following: + + * a *value* that the metric has to be compared against + * an *operator* that specifies the target operation. See :ref:`the operators section `. + +* *expression*: a string *array* that will be compiled into a valid boolean expression. All the metrics present in the + expression string **must be present in the parameters section**. If an unrecognized metric is present, Mikado will + crash immediately, complaining that the scoring file is invalid. Apart from brackets, Mikado accepts only the + following boolean operators to chain the metrics: + + * *and* + * *or* + * *not* + * *xor* + +.. hint:: if no *expression* is specified, Mikado will construct one by chaining all the provided parameters with and + *and* operator. Most of the time, this would result in an unexpected behaviour - ie Mikado assigning a score of 0 to + most transcripts. It is **strongly advised** to explicitly provide a valid expression. + +As an example, the following snippet replicates a typical requirements section found in a scoring file: + +.. code-block:: yaml + + requirements: + expression: [((exon_num.multi and cdna_length.multi and max_intron_length and min_intron_length), or, + (exon_num.mono and cdna_length.mono))] + parameters: + cdna_length.mono: {operator: gt, value: 50} + cdna_length.multi: {operator: ge, value: 100} + exon_num.mono: {operator: eq, value: 1} + exon_num.multi: {operator: gt, value: 1} + max_intron_length: {operator: le, value: 20000} + min_intron_length: {operator: ge, value: 5} + +In the parameters section, we ask for the following: + + * *exon_num.mono*: monoexonic transcripts must have one exon ("eq") + * *exon_num.multi*: multiexonic transcripts must have more than one exon ("gt") + * *cdna_length.mono*: monoexonic transcripts must have a length greater than 50 bps (the ".mono" suffix is + arbitrary, as long as it is unique for all calls of *cdna_length*) + * *cdna_length.multi*: multiexonic transcripts must have a length greater than or equal to 100 bps (the ".multi" + suffix is arbitrary, as long as it is unique for all calls of *cdna_length*) + * *max_intron_length*: multiexonic transcripts should not have any intron longer than 200,000 bps. + * *min_intron_length*: multiexonic transcripts should not have any intron smaller than 5 bps. + +The *expression* field will be compiled into the following expression:: + + (exon_num > 1 and cdna_length >= 100 and max_intron_length <= 200000 and min_intron_length >= 5) or (exon_num == 1 and cdna_length > 50) + + +Any transcript for which the expression evaluates to ``false`` will be assigned a score of 0 outright and discarded, +unless the user has chosen to disable the purging of such transcripts. + +.. _scoring-section: + +The scoring section +~~~~~~~~~~~~~~~~~~~ + +This section specifies which metrics will be used by Mikado to score the transcripts. Each metric to be used is +specified as a subsection of the configuration, and will have the following attributes: + +* *rescaling*: the only compulsory attribute. It specifies the kind of scoring that will be applied to the metric, and + it has to be one of "max", "min", or "target". See :ref:`the explanation on the scoring algorithm ` + for details. +* *value*: compulsory if the chosen rescaling algorithm is "target". This should be either a number or a boolean value. +* *multiplier*: the weight assigned to the metric in terms of scoring. This parameter is optional; if absent, as it is + in the majority of cases, Mikado will consider the multiplier to equal to 1. This is the :math:`w_{m}` element in the + :ref:`equations above `. +* *filter*: It is possible to specify a filter which the metric has to fulfill to be considered for scoring, eg, + "cdna_length >= 200". If the transcript fails to pass this filter, the score *for this metric only* will be set to 0. + **The filter can apply to a different metric**, so it is possible to e.g. assign a score of 0 for, say, "combined_cds" + to any transcript whose "cdna_length" is, say, below 150 bps. + A "filter" subsection has to specify the following: + + * *operator*: the operator to apply for the boolean expression. See the :ref:`relative section `. + * *value*: value that will be used to assess the metric. + * *metric*: *optional*. The metric that this filter refers to. If omitted, this will be identical to the metric + under examination. + +.. hint:: the purpose of the *filter* section is to allow for fine-tuning of the scoring mechanism; ie it allows to + penalise transcripts with undesirable qualities (eg a possible retained intron) without discarding them + outright. As such, it is a less harsh version of the :ref:`requirements section ` and + it is the preferred way of specifying which transcript features Mikado should be wary of. + +For example, this is a snippet of a scoring section: + +.. code-block:: yaml + + scoring: + blast_score: {rescaling: max} + cds_not_maximal: {rescaling: min} + combined_cds_fraction: {rescaling: target, value: 0.8, multiplier: 2} + five_utr_length: + filter: {operator: le, value: 2500} + rescaling: target + value: 100 + end_distance_from_junction: + filter: {operator: lt, value: 55} + rescaling: min + non_verified_introns_num: + rescaling: max + multiplier: -10 + filter: + operator: gt + value: 1 + metric: exons_num + + +Using this snippet as a guide, Mikado will score transcripts in each locus as follows: + +* Assign a full score (one point, as no multiplier is specified) to transcripts which have the greatest *blast_score* +* Assign a full score (one point, as no multiplier is specified) to transcripts which have the lowest amount of CDS + bases in secondary ORFs (*cds_not_maximal*) +* Assign a full score (**two points**, as a multiplier of 2 is specified) to transcripts that have a total amount of CDS + bps approximating 80% of the transcript cDNA length (*combined_cds_fraction*) +* Assign a full score (one point, as no multiplier is specified) to transcripts that have a 5' UTR whose length is + nearest to 100 bps (*five_utr_length*); if the 5' UTR is longer than 2,500 bps, this score will be 0 + (see the filter section) +* Assign a full score (one point, as no multiplier is specified) to transcripts which have the lowest distance between + the CDS end and the most downstream exon-exon junction (*end_distance_from_junction*). If such a distance is greater + than 55 bps, assign a score of 0, as it is a probable target for NMD (see the filter section). +* Assign a maximum penalty (**minus 10 points**, as a **negative** multiplier is specified) to the transcript with the + highest number of non-verified introns in the locus. + + * Again, we are using a "filter" section to define which transcripts will be exempted from this scoring + (in this case, a penalty) + * However, please note that we are using the keyword **metric** in this section. This tells Mikado to check a + *different* metric for evaluating the filter. Nominally, in this case we are excluding from the penalty any + *monoexonic* transcript. This makes sense as a monoexonic transcript will never have an intron to be confirmed to + start with. + +.. note:: The possibility of using different metrics for the "filter" section is present from Mikado 1.3 onwards. + +.. _Metrics: + +Metrics +~~~~~~~ + +These are all the metrics available to quantify transcripts. The documentation for this section has been generated using +the :ref:`metrics utility `. + +Metrics belong to one of the following categories: + +* **Descriptive**: these metrics merely provide a description of the transcript (eg its ID) and are not used for scoring. + +* **cDNA**: these metrics refer to basic features of any transcript such as its number of exons, its cDNA length, etc. + +* **Intron**: these metrics refer to features related to the number of introns and their lengths. + +* **CDS**: these metrics refer to features related to the CDS assigned to the transcript. + +* **UTR**: these metrics refer to features related to the UTR of the transcript. In the case in which a transcript has + been assigned multiple ORFs, unless otherwise stated the UTR metrics will be derived only considering the *selected* + ORF, not the combination of all of them. + +* **Locus**: these metrics refer to features of the transcript in relationship to all other transcripts in its locus, eg + how many of the introns present in the locus are present in the transcript. These metrics are calculated by Mikado during the picking phase, and as such their value can vary during the different stages as the transcripts are shifted to different groups. + +* **External**: these metrics are derived from accessory data that is recovered for the transcript during the run time. + Examples include data regarding the number of introns confirmed by external programs such as Portcullis, or the BLAST + score of the best hits. + +* **Attributes**: these metrics are extracted at runtime from attributes present in the input files. An example of this + could be the TPM or FPKM values assigned to transcripts by rna expression analysis software. + +.. hint:: Starting from version 1 beta8, Mikado allows to use externally defined metrics for the transcripts. These can + be accessed using the keyword "external." within the *configuration* file. See the + :ref:`relevant section ` for details. + +.. hint:: Starting from version 2, Mikado allows to use attribute defined metrics for the transcripts. These can be + accessed using the keyword "attributes." within the *scoring* file. See the + :ref:`relevant section ` for details. + +.. important:: Starting from Mikado 1 beta 8, it is possible to use metrics with values between 0 and 1 directly as + scores, without rescaling. From Mikado 2, it is also possible to treat values that are between 0 and 100 + in the same way; Mikado will convert them automatically to be between 0 and 1, internally. + It is also possible to instruct Mikado, during scoring, to use certain values as percentages by adding the + :code:`percentage = True` for the scoring parameter. + +.. topic:: Available metrics + ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| Metric name | Description | Category | Data type | Usable raw | ++=====================================+===========================================================+=============+=============+==============+ ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| tid | ID of the transcript - cannot be an undefined value. | Descriptive | str | False | +| | Alias of id. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| parent | Name of the parent feature of the transcript. | Descriptive | str | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| score | Numerical value which summarizes the reliability of the | Descriptive | str | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| external_scores | **SPECIAL** this Namespace contains all the information | External | Namespace | True | +| | regarding external scores for the transcript. If an | | | | +| | absent property is not defined in the Namespace, Mikado | | | | +| | will set a default value of 0 into the Namespace and | | | | +| | return it. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| alias | This property returns the alias of the transcript, if | Descriptive | str | False | +| | present, else its ID | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| best_bits | Metric that returns the best BitS associated with the | External | float | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| blast_identity | This metric will return the alignment identity for the | External | float | True | +| | best BLAST hit according to the evalue. If no BLAST hits | | | | +| | are available for the sequence, it will return 0. | | | | +| | :return: :return: | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| blast_query_coverage | This metric will return the **query** coverage for the | External | float | True | +| | best BLAST hit according to the evalue. If no BLAST hits | | | | +| | are available for the sequence, it will return 0. | | | | +| | :return: | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| blast_score | Interchangeable alias for testing different blast-related | External | float | False | +| | scores. Current: best bit score. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| blast_target_coverage | This metric will return the **target** coverage for the | External | float | True | +| | best BLAST hit according to the evalue. If no BLAST hits | | | | +| | are available for the sequence, it will return 0. | | | | +| | :return: :return: | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| canonical_intron_proportion | This metric returns the proportion of canonical introns | Intron | float | True | +| | of the transcript on its total number of introns. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| cdna_length | This property returns the length of the transcript. | cDNA | int | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| cds_disrupted_by_ri | This property describes whether the CDS is interrupted | Locus | bool | True | +| | within a retained intron. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| cds_not_maximal | This property returns the length of the CDS excluded from | CDS | int | False | +| | the selected ORF. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| cds_not_maximal_fraction | This property returns the fraction of bases not in the | CDS | float | True | +| | selected ORF compared to the total number of CDS bases in | | | | +| | the cDNA. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_cds_fraction | This property return the percentage of the CDS part of | CDS | float | True | +| | the transcript vs. the cDNA length | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_cds_intron_fraction | This property returns the fraction of CDS introns of the | Locus | float | True | +| | transcript vs. the total number of CDS introns in the | | | | +| | Locus. If the transcript is by itself, it returns 1. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_cds_length | This property return the length of the CDS part of the | CDS | int | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_cds_locus_fraction | This metric returns the fraction of CDS bases of the | Locus | float | True | +| | transcript vs. the total of CDS bases in the locus. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_cds_num | This property returns the number of non-overlapping CDS | CDS | int | False | +| | segments in the transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_cds_num_fraction | This property returns the fraction of non-overlapping CDS | CDS | float | True | +| | segments in the transcript vs. the total number of exons | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_utr_fraction | This property returns the fraction of the cDNA which is | UTR | float | True | +| | not coding according to any ORF. Complement of | | | | +| | combined_cds_fraction | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| combined_utr_length | This property return the length of the UTR part of the | UTR | int | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| end_distance_from_junction | This metric returns the cDNA distance between the stop | CDS | int | False | +| | codon and the last junction in the transcript. In many | | | | +| | eukaryotes, this distance cannot exceed 50-55 bps | | | | +| | otherwise the transcript becomes a target of NMD. If the | | | | +| | transcript is not coding or there is no junction | | | | +| | downstream of the stop codon, the metric returns 0. This | | | | +| | metric considers the combined CDS end. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| end_distance_from_tes | This property returns the distance of the end of the | CDS | int | False | +| | combined CDS from the transcript end site. If no CDS is | | | | +| | defined, it defaults to 0. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| exon_fraction | This property returns the fraction of exons of the | Locus | float | True | +| | transcript which are contained in the sublocus. If the | | | | +| | transcript is by itself, it returns 1. Set from outside. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| exon_num | This property returns the number of exons of the | cDNA | int | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| five_utr_length | Returns the length of the 5' UTR of the selected ORF. | UTR | float | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| five_utr_num | This property returns the number of 5' UTR segments for | UTR | int | False | +| | the selected ORF. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| five_utr_num_complete | This property returns the number of 5' UTR segments for | UTR | int | False | +| | the selected ORF, considering only those which are | | | | +| | complete exons. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| has_start_codon | Boolean. True if the selected ORF has a start codon. | CDS | bool | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| has_stop_codon | Boolean. True if the selected ORF has a stop codon. | CDS | bool | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| highest_cds_exon_number | This property returns the maximum number of CDS segments | CDS | int | False | +| | among the ORFs; this number can refer to an ORF | | | | +| | *DIFFERENT* from the maximal ORF. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| highest_cds_exons_num | Returns the number of CDS segments in the selected ORF | CDS | int | False | +| | (irrespective of the number of exons involved) | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| intron_fraction | This property returns the fraction of introns of the | Locus | float | True | +| | transcript vs. the total number of introns in the Locus. | | | | +| | If the transcript is by itself, it returns 1. Set from | | | | +| | outside. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| is_complete | Boolean. True if the selected ORF has both start and end. | CDS | bool | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| is_reference | Checks whether the transcript has been marked as | External | bool | False | +| | reference by Mikado prepare | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| max_exon_length | This metric will return the length of the biggest exon in | cDNA | int | False | +| | the transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| max_intron_length | This property returns the greatest intron length for the | Intron | int | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| min_exon_length | This metric will return the length of the biggest exon in | cDNA | int | False | +| | the transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| min_intron_length | This property returns the smallest intron length for the | Intron | int | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| non_verified_introns_num | This metric returns the number of introns of the | External | int | False | +| | transcript which are not validated by external data. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| num_introns_greater_than_max | This metric returns the number of introns greater than | Intron | int | False | +| | the maximum acceptable intron size indicated in the | | | | +| | constructor. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| num_introns_smaller_than_min | This metric returns the number of introns smaller than | Intron | int | False | +| | the mininum acceptable intron size indicated in the | | | | +| | constructor. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| number_internal_orfs | This property returns the number of ORFs inside a | CDS | int | False | +| | transcript. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| only_non_canonical_splicing | This metric will return True if the canonical_number is 0 | Intron | bool | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| original_source | This property returns the original source assigned to the | Descriptive | str | False | +| | transcript (before Mikado assigns its own final source | | | | +| | value). | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| proportion_verified_introns | This metric returns, as a fraction, how many of the | External | float | True | +| | transcript introns are validated by external data. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| proportion_verified_introns_inlocus | This metric returns, as a fraction, how many of the | Locus | float | True | +| | verified introns inside the Locus are contained inside | | | | +| | the transcript. In loci without *any* verified introns, | | | | +| | this metric will be set to 1. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| retained_fraction | This property returns the fraction of the cDNA which is | Locus | float | True | +| | contained in retained introns. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| retained_intron_num | This property records the number of introns in the | Locus | int | False | +| | transcripts which are marked as being retained. See the | | | | +| | corresponding method in the sublocus class. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_exons_fraction | Returns the fraction of CDS segments in the selected ORF | CDS | float | True | +| | (irrespective of the number of exons involved) | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_fraction | This property calculates the fraction of the selected CDS | CDS | float | True | +| | vs. the cDNA length. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_intron_fraction | This property returns the fraction of CDS introns of the | CDS | float | True | +| | selected ORF of the transcript vs. the total number of | | | | +| | CDS introns in the Locus (considering only the selected | | | | +| | ORF). If the transcript is by itself, it should return 1. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_length | This property calculates the length of the CDS selected | CDS | int | False | +| | as best inside the cDNA. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_locus_fraction | This metric returns the fraction of CDS bases of the | Locus | float | True | +| | transcript vs. the total of CDS bases in the locus. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_num | This property calculates the number of CDS exons for the | CDS | int | False | +| | selected ORF | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_cds_number_fraction | This property returns the proportion of best possible CDS | CDS | float | False | +| | segments vs. the number of exons. See | | | | +| | selected_cds_number. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_end_distance_from_junction | This metric returns the distance between the stop codon | CDS | int | False | +| | and the last junction of the transcript. In many | | | | +| | eukaryotes, this distance cannot exceed 50-55 bps, | | | | +| | otherwise the transcript becomes a target of NMD. If the | | | | +| | transcript is not coding or there is no junction | | | | +| | downstream of the stop codon, the metric returns 0. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_end_distance_from_tes | This property returns the distance of the end of the best | CDS | int | False | +| | CDS from the transcript end site. If no CDS is defined, | | | | +| | it defaults to 0. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| selected_start_distance_from_tss | This property returns the distance of the start of the | CDS | int | False | +| | best CDS from the transcript start site. If no CDS is | | | | +| | defined, it defaults to 0. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| snowy_blast_score | Metric that indicates how good a hit is compared to the | External | float | False | +| | competition, in terms of BLAST similarities. As in | | | | +| | SnowyOwl, the score for each hit is calculated by taking | | | | +| | the coverage of the target and dividing it by (2 * | | | | +| | len(self.blast_hits)). IMPORTANT: when splitting | | | | +| | transcripts by ORF, a blast hit is added to the new | | | | +| | transcript only if it is contained within the new | | | | +| | transcript. This WILL screw up a bit the homology score. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| source_score | This metric returns a score that is assigned to the | External | float | False | +| | transcript in virtue of its origin. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| start_distance_from_tss | This property returns the distance of the start of the | CDS | int | False | +| | combined CDS from the transcript start site. If no CDS is | | | | +| | defined, it defaults to 0. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| suspicious_splicing | This metric will return True if the transcript either has | Intron | bool | False | +| | canonical introns on both strands (probably a chimeric | | | | +| | artifact between two neighbouring loci, or if it has no | | | | +| | canonical splicing event but it would if it were assigned | | | | +| | to the opposite strand (probably a strand misassignment | | | | +| | on the part of the assembler/predictor). | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| three_utr_length | Returns the length of the 5' UTR of the selected ORF. | | int | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| three_utr_num | This property returns the number of 3' UTR segments | UTR | int | False | +| | (referred to the selected ORF). | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| three_utr_num_complete | This property returns the number of 3' UTR segments for | UTR | int | False | +| | the selected ORF, considering only those which are | | | | +| | complete exons. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| utr_fraction | This property calculates the length of the UTR of the | UTR | float | True | +| | selected ORF vs. the cDNA length. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| utr_length | Returns the sum of the 5'+3' UTR lengths | UTR | int | False | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| utr_num | Returns the number of UTR segments (referred to the | UTR | int | False | +| | selected ORF). | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| utr_num_complete | Returns the number of UTR segments which are complete | UTR | int | False | +| | exons (referred to the selected ORF). | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ +| verified_introns_num | This metric returns the number of introns of the | External | int | False | +| | transcript which are validated by external data. | | | | ++-------------------------------------+-----------------------------------------------------------+-------------+-------------+--------------+ + + +.. _external-metrics: + +External metrics +~~~~~~~~~~~~~~~~ + +Starting from version 1 beta 8, Mikado allows to load external metrics into the database, to be used for evaluating transcripts. Metrics of this kind **must** have a value comprised between 0 and 1. +The file can be provided either by specifying it in the :ref:`coonfiguration file `, under "serialise/files/external_scores", or on the command line with the "--external-scores" parameters to mikado :ref:`serialise `. +The external scores file should have the following format: + ++--------------------------+------------------+------------------+-------+-----------------+ +| TID | Metric_one | Metric_two | ... | Metric_N | ++==========================+==================+==================+=======+=================+ +| Transcript_one | value | value | | value | ++--------------------------+------------------+------------------+-------+-----------------+ +| Transcript_two | value | value | | value | ++--------------------------+------------------+------------------+-------+-----------------+ +| ... | ... | ... | | ... | ++--------------------------+------------------+------------------+-------+-----------------+ +| Transcript_N | value | value | | value | ++--------------------------+------------------+------------------+-------+-----------------+ + + +Please note the following: + +* the header is mandatory. +* the metric names at the head of the table should **not** contain any space or spcecial characters, apart from the underscore (_) +* the header provides the name of the metric as will be seen by Mikado. As such, it is advised to choose sensible and informative names (e.g. "fraction_covered") rather than uninformative ones (e.g. the "metric_one" from above) +* Column names **must be unique**. +* The transcript names present in the first column **must** be present in the FASTA file. +* The table should be tab-separated. +* Values can be of any numerical or boolean type. However, only values that are determined **at serialisation** to be comprised within 0 and 1 (inclusive), or between 0 and 100 (ie *percentages*) can be used as raw values. + +A proper way of generating and using external scores would, therefore, be the following: + +* Run Mikado prepare on the input dataset. +* Run all necessary supplementary analyses (ORF calling and/or homology analysis through DIAMOND or BLAST). +* Run supplementary analyses to assess the transcripts, e.g. expression analysis. Normalise results so that they can be expressed with values between 0 and 1. + + * Please note that boolean results (e.g. presence or absence) can be expressed with 0 and 1 intead of "False" and "True". Customarily, in Python 0 stands for False and 1 for True, but you can choose to switch the order if you so desire. +* Aggregate all results in a text table, like the one above, tab separated. +* Call mikado serialise, specifying the location of this table either through the configuration file or on the command line invocation. + +Given the open ended nature of the external scores, the Daijin pipeline currently does not offer any system to generate these scores. This might change in the future. + +.. note:: our laboratory has implemented a novel pipeline, Minos (:ref:`https://github.com/EI-CoreBioinformatics/minos `) for integrating multiple sources of evidence using Mikado. Internally, Minos makes use of the external metrics described here. We recommend having a look at the pipeline for inspiration. + +Adding external scores to the scoring file +------------------------------------------ + +Once the external metrics have been properly loaded, it is necessary to tell Mikado how to use them. This requires :ref:`modifying the scoring file itself `. The header that we used in the table above does provide the names of the metrics as they will be seen by Mikado. + +Let us say that we have performed an expression analysis on our transcripts, and we have created and loaded the following three metrics: + +* "fraction_covered", ie the percentage of the transcript covered by at least X reads (where X is decided by the experimenter) +* "samples_expressed", ie the percentage of samples where the transcript was expressed over a certain threshold (e.g. 1 TPM) +* "has_coverage_gaps", ie a boolean metrics that indicates whether there are windows *within* the transcript that lowly or not at all covered (e.g. a 100bp stretch with no coverage between two highly covered regions, indicating a possilble intron retention or chimera). For this example, a value of "0" indicates that there no coverage gaps (ie. it is *False* that there are gaps), "1" otherwise (it is *True* that there are coverage gaps). + +We can now use these metrics as normal, by invoking them as "external." followed by the name of the metrics: e.g., "external.fraction_covered". +So for example, if we wished to prioritise transcripts that are expressed in the highest number of samples and are completely covered by RNASeq data upon reads realignment, under "scoring", we can add the following: + +.. code-block:: yaml + + scoring: + # [ ... other metrics ... ] + - external.samples_expressed: {rescaling: max} + - external.fraction_covered: {rescaling: max} + +And if we wanted to consider any primary transcript with coverage gaps as a potential fragment, under the "fragmentary" section we could do so: + +.. code-block:: yaml + + not_fragmentary: + expression: + # other metrics .. + - and (external.has_coverage_gaps) + # Finished expression + parameters: + # other metrics ... + external.has_coverage_gaps: {operator: eq, value: 0} # Please note, again, that here "0" means "no coverage gaps detected". + # other metrics ... + +As external metrics allow Mikado to accept any arbitrary metric for each transcript, they allow the program to assess transcripts in any way the experimenter desires. However, currently we do not provide any way of automating the process. + +.. note:: also for external metrics, it is necessary to add a suffix to them if they are invoked more than once in an expression (see the :ref:`tutorial `). An invocation of e.g. "external.samples_expressed.mono" and "external.samples_expressed.multi", to distinguish between monoexonic and multiexonic transcripts, would be perfectly valid and actually *required* by Mikado. Notice the double use of the dot (".") as separator. Its usage as such is the reason that it cannot be present in the name of the metric itself (so, for example, "has.coverage.gaps" would be an invalid metric name). + +.. _attributes-metrics: + +Attributes metrics +~~~~~~~~~~~~~~~~~~ +Starting from version 2, Mikado allows the usage of metrics defined in the attributes of the input files, these metrics +behave as the rest of the metrics but they are gathered at runtime from the input datasets. It is important to note that +these metrics must be equivalent in all the inputs and are by default initialised to "0" when a transcript does not have +an attribute defining the metric. The default initialisation value can be overridden in the scoring file. + +Attribute metrics along with the required **rescaling** parameter, can define an *rtype* parameter as one of (float, int +or bool) which will be used to cast the value of the attribute internally. Currently the types available are: integers +(int), reals (float) and booleans (bool). The *rtype* will define how the value in the attribute will be represented or +treated internally in mikado, i.e an attribute has a value of "3.2" but the *rtype* is defined as int, this value will be +casted from a real number to an integer which usually in python just truncates to "3" for any number between 3 and 4. +If the type was treated as a 'float' the internal value in mikado for this attribute would be "3.2", finally were this +parameter's *rtype* be defined as bool it's value internally in mikado would be 'True' which is the case for any number +not equal to "0". Finally, a *percentage* boolean which indicates that the values are in the 0-100 range and enables a +transformation to the 0-1 range so that these can be used as 'raw' scores +(see the :ref:`scoring algorithm section <_scoring_algorithm>`). + +An example for the usage of these metrics could be:: + + Chr5 Cufflinks transcript 26581218 26583874 1000 - . gene_id "cufflinks_star_at.23551";transcript_id "cufflinks_star_at.23551.1";exon_number "1";FPKM "0.4343609420";conf_hi "0.577851";frac "0.751684";cov "11.982854";conf_lo "0.293994";percentage_score "42.42" + Chr5 Cufflinks exon 26581218 26581528 . - . gene_id "cufflinks_star_at.23551";transcript_id "cufflinks_star_at.23551.1"; + Chr5 Cufflinks exon 26583335 26583874 . - . gene_id "cufflinks_star_at.23551";transcript_id "cufflinks_star_at.23551.1"; + + +If the scoring file defines: + +.. code-block:: yaml + + scoring: + # [ ... other metrics ... ] + - attributes.FPKM: {rescaling: max} + - attributes.frac: {rescaling: max, use_raw: true} + - attributes.percentage_score: {rescaling: max, use_raw: true, percentage: true} + - attributes.cov: {rescaling: max, use_raw: true, multiplier: 1, rtype: int, default: 1} + +The same scoring rules defined previously will apply to metrics obtained from the transcript's attributes. \ No newline at end of file diff --git a/docs/Tutorial/Adapting.rst b/docs/Tutorial/Adapting.rst index 1de0d8fab..f0fb73b09 100644 --- a/docs/Tutorial/Adapting.rst +++ b/docs/Tutorial/Adapting.rst @@ -249,7 +249,6 @@ Looking at the scoring section of the file, we do not need to apply anything par :class: toggle .. code-block:: yaml - :linenos: scoring: snowy_blast_score: {rescaling: max} diff --git a/docs/Tutorial/Daijin_tutorial.rst b/docs/Tutorial/Daijin_tutorial.rst index 5f952e199..07c804105 100644 --- a/docs/Tutorial/Daijin_tutorial.rst +++ b/docs/Tutorial/Daijin_tutorial.rst @@ -16,10 +16,7 @@ Tutorial for Daijin This tutorial will guide you through the task of configuring and running the whole Daijin pipeline on a *Drosophila melanogaster* dataset comprising two different samples, using one aligner (HISAT) and two assemblers (Stringtie and CLASS2) as chosen methods. A modern desktop computer with a multicore processor and 4GB of RAM or more should suffice to execute the pipeline within two hours. -.. warning:: Please note that **development of Daijin Assemble has now been discontinued**. - Daijin will be superseded by a different pipeline manager, which is currently in the works. We will continue actively maintening the - "mikado" part of the pipeline, which is dedicated to run the steps between a set of input transcript assemblies and/or cDNA alignments until - the final Mikado output. +.. warning:: Please note that **development of Daijin Assemble has now been discontinued**. Daijin will be superseded by a different pipeline manager, which is currently in the works. We will continue the active maintenance the "mikado" part of the pipeline, which is dedicated to run the steps between a set of input transcript assemblies and/or cDNA alignments until the final Mikado output. Overview ~~~~~~~~ @@ -243,11 +240,11 @@ Step 2: running the assemble part Now that we have created a proper configuration file, it is time to launch Daijin assemble and inspect the results. Issue the command:: - daijin assemble --cores daijin.yaml + daijin assemble --cores daijin.toml After checking that the configuration file is valid, Daijin will start the alignment and assembly of the dataset. On a normal desktop computer, this should take less than 2 hours. Before launching the pipeline, you can obtain a graphical representation of the steps with:: - daijin assemble --dag daijin.yaml | dot -Tsvg > assemble.svg + daijin assemble --dag daijin.toml | dot -Tsvg > assemble.svg .. figure:: assemble_pipeline.png :scale: 50% @@ -255,7 +252,7 @@ After checking that the configuration file is valid, Daijin will start the align You can also ask Daijin to display the steps to be executed, inclusive of their command lines, by issuing the following command:: - daijin assemble --dryrun daijin.yaml + daijin assemble --dryrun daijin.toml When Daijin is finished, have a look inside the folder Dmelanogaster/3-assemblies/output/; you will find the following GTF files: diff --git a/docs/Tutorial/Scoring_tutorial.rst b/docs/Tutorial/Scoring_tutorial.rst index 5dfdf8191..b741a7c27 100644 --- a/docs/Tutorial/Scoring_tutorial.rst +++ b/docs/Tutorial/Scoring_tutorial.rst @@ -15,16 +15,16 @@ expose a general way to create your own configuration file. The purpose of scoring files: introductory note ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Mikado is an annotation pipeline, and as such, it does not have, at its core, the mission to represent every possible expressed transcript. Some sequenced transcripts may originate from transcriptional errors, exist only transiently and may have no functional role. When performing a sequencing experiment, we create a “snapshot” of the transient transcriptional state of the cell: inclusive of these erroneous events, along with immature transcripts and artifacts arising from misassemblies, fragmentation, genomic inclusion, etc. +Mikado is an annotation pipeline, and as such, it does not have, at its core, the mission to represent every possible expressed transcript. Some sequenced transcripts may originate from transcriptional errors, exist only transiently or may have no functional role. When performing a sequencing experiment, we create a “snapshot” of the transient transcriptional state of the cell: inclusive of erroneous events, along with immature transcripts and artifacts arising from misassemblies, fragmentation during library construction, genomic DNA contamination, etc. -In Mikado, we make a choice regarding what transcripts we want to retain in our annotation (as is the case with any genome annotation). For example, as you will see in this tutorial, the standard configuration penalises transcripts with retained CDS introns, and transcripts that are NMD targets. This choice does not mean that we think that these transcripts are artifacts; rather, it signifies that we prioritise those transcripts that are more likely to represent functionally active genes. Annotators will differ on the point: for example, the human reference annotation retains and marks these events, rather than discarding them. +In Mikado, we make a choice regarding what transcripts we want to retain in our annotation (as is the case with any genome annotation). For example, as you will see in this tutorial, the standard configuration penalises transcripts with retained CDS introns, and transcripts that are NMD targets. This choice does not mean that we think that these transcripts are artifacts; rather, it signifies that we prioritise those transcripts that are more likely to represent functionally active genes. Annotators will differ on this point: for example, the human reference annotation retains and marks these events, rather than discarding them. Mikado allows the experimenter to make these choices simply and transparently. Our pre-defined configuration files strive to replicate the choices made by annotators over the years, and thus allow to replicate - as much as possible - the reference annotations of various species starting from partial sequencing data. However, if as an experimenter you are interested in a more stringent approach - say, only coding transcripts with a complete ORF and a UTR - or you would like instead to perform only a minimal cleaning up - say, just discarding obvious NMD targets - Mikado will allow you to do so. This tutorial will show you how. Obtaining pre-defined scoring files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Mikado comes with pre-packaged scoring files for multiple species. These can be found in the installation directory, under "configuration/scoring_files"; or, when launching mikado configure, you can request to copy a template file to the working directory ("--copy-scoring" command flag). In the rest of the tutorial, however, we will presume that no suitable scoring file exists and that a new one has to be created from scratch. +Mikado comes with two pre-packaged scoring files, one for plant species as well as another generic for mammalian species. These can be found in the installation directory, under "configuration/scoring_files"; or, when launching mikado configure, you can request to copy a template file to the working directory ("--copy-scoring" command flag). In the rest of the tutorial, however, we will presume that no suitable scoring file exists and that a new one has to be created from scratch. .. _scoring-tutorial-first-reqs: diff --git a/docs/Tutorial/index.rst b/docs/Tutorial/index.rst index 61f6b1f46..417427693 100644 --- a/docs/Tutorial/index.rst +++ b/docs/Tutorial/index.rst @@ -3,80 +3,110 @@ Tutorial ======== -This tutorial will guide you through a simple analysis of Mikado, using a small amount of data coming from an experiment on *Arabidopsis thaliana*. RNA-Seq data was obtained from `study PRJEB7093 on ENA `_, aligned with STAR [STAR]_ against the `TAIR10 `_ reference genome, and assembled with four different programs. For this small example, we are going to focus on a small genomic region: Chr5, from 26,575,364 to 26,614,205. +This tutorial will guide you through a simple analysis of Mikado, using a small amount of data coming from an +experiment on *Arabidopsis thaliana*. RNA-Seq data was obtained from `study PRJEB7093 on ENA `_, aligned with STAR [STAR]_ against the `TAIR10 `_ +reference genome, and assembled with four different programs. For this small example, we are going to focus on a +small genomic region: Chr5, from 26,575,364 to 26,614,205. During this analysis, you will require the following files: * :download:`chr5.fas.gz `: a FASTA file containing the Chr5 of *A. thaliana*. -* :download:`reference.gff3`: a GFF3 file with the annotation of the genomic slice we are interested in, for comparison purposes. -* :download:`junctions.bed `: a BED12 file of reliable splicing junctions in the region, identified using Portcullis [Portcullis]_ +* :download:`reference.gff3`: a GFF3 file with the annotation of the genomic slice we are interested in, for + comparison purposes. +* :download:`junctions.bed `: a BED12 file of reliable splicing junctions in the region, identified + using Portcullis [Portcullis]_ * :download:`class.gtf`: a GTF file of transcripts assembled using CLASS [Class2]_ * :download:`cufflinks.gtf`: a GTF file of transcripts assembled using Cufflinks [Cufflinks]_ * :download:`stringtie.gtf`: a GTF file of transcripts assembled using Stringtie [StringTie]_ * :download:`trinity.gff3`: a GFF3 file of transcripts assembled using Trinity [Trinity]_ and aligned using GMAP [GMAP]_ * :download:`orfs.bed`: a BED12 file containing the ORFs of the above transcripts, derived using TransDecoder [Trinity]_ -* :download:`uniprot_sprot_plants.fasta.gz`: a FASTA file containing the plant proteins released with SwissProt [Uniprot]_ +* :download:`uniprot_sprot_plants.fasta.gz`: a FASTA file containing the plant proteins released with SwissProt + [Uniprot]_ -All of this data can also be found in the ``sample_data`` directory of the `Mikado source `_. +All of this data can also be found in the ``sample_data`` directory of the `Mikado source `_. You will also require the following software: * a functioning installation of SQLite. * a functioning version of BLAST+ [Blastplus]_. +* a functioning version of Prodigal [Prodigal]_. Creating the configuration file for Mikado ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the first step, we need to create a configuration file to drive Mikado. To do so, we will first create a tab-delimited file describing our assemblies (class.gtf, cufflinks.gtf, stringtie.gtf, trinity.gff3):: +In the first step, we need to create a configuration file to drive Mikado. To do so, we will first create a +tab-delimited file describing our assemblies (class.gtf, cufflinks.gtf, stringtie.gtf, trinity.gff3):: - class.gtf cl True - cufflinks.gtf cuff True - stringtie.gtf st True 1 - trinity.gff3 tr False -0.5 + class.gtf cl True False False True + cufflinks.gtf cuff True False False True + stringtie.gtf st True 1 False True True + trinity.gff3 tr False -0.5 False False True + reference.gff3 at True 5 True False False + pacbio.bam pb True 1 False False False -In this file, the three fields define the following: +In this file, the first three fields define the following: -#. The file location and name (if no folder is specified, Mikado will look for each file in the current working directory) +#. The file location and name (if no folder is specified, Mikado will look for each file in the current working +directory) #. An alias associated with the file, which has to be unique #. A binary flag (``True`` / ``False``) indicating whether the assembly is strand-specific or not -#. An optional fourth field which defines a score associated with that sample. All transcripts associated with the label will have their score corrected by the value on this field. So eg. in this example all Stringtie models will receive an additional point, and all Trinity models will be penalised by half a point. Class and Cufflinks have no special bonus or malus associated with them. -Then, we will decompress the chromosome FASTA file: +These fields are then followed by a series of **optional** fields: -.. code-block:: bash - - gunzip chr5.fas.gz # This will create the file chr5.fas +#. A score associated with that sample. All transcripts associated with the label will have their score corrected by + the value on this field. So eg. in this example all Stringtie models will receive an additional point, and all + Trinity models will be penalised by half a point. Class and Cufflinks have no special bonus or malus associated. +#. A binary flag (``True`` / ``False``) defining whether the sample is a reference or not. +#. A binary flag (``True`` / ``False``) defining whether to exclude redundant models or not. +#. A binary flag (``True`` / ``False``) indicating whether Mikado prepare should strip the CDS of faulty models, but + otherwise keep their cDNA structure in the final output (``True``) or whether instead it should completely discard + such models (``False``). +#. A binary flag (``True`` / ``False``) instructing Mikado about whether the chimera split routine should be skipped + for these models (``True``) or if instead it should proceed normally (``False``). Finally, we will create the configuration file itself using ``mikado configure``: .. code-block:: bash - mikado configure --list list.txt --reference chr5.fas --mode permissive --scoring plants.yaml --copy-scoring plants.yaml --junctions junctions.bed -bt uniprot_sprot_plants.fasta configuration.yaml + mikado configure --list list.txt --reference chr5.fas.gz --mode permissive --scoring plants.yaml --copy-scoring +plants.yaml --junctions junctions.bed -bt uniprot_sprot_plants.fasta configuration.yaml -This will create a configuration.yaml file with the parameters that were specified on the command line. This is :ref:`simplified configuration file `, containing all the necessary parameters for the Mikado run. It will also copy the ``plants.yaml`` file from the Mikado installation to your current working directory. -The parameters we used for the command line instruct Mikado in the following ways: +This will create a configuration.yaml file with the parameters that were specified on the command line. This is +:ref:`simplified configuration file `, containing all the necessary parameters for the Mikado run. It +will also copy the ``plants.yaml`` file from the Mikado installation to your current working directory. -Alternatively, we could use a BGZIP-compressed file as input for ``mikado configure``: +.. hint:: Mikado can accept compressed genome FASTA files, like in this example, as long as they have been compressed + with BGZip rather than the vanilla UNIX GZip. -.. code-block:: bash - bgzip chr5.fas - samtools faidx chr5.fas.gz - mikado configure --list list.txt --reference chr5.fas.gz --mode permissive --scoring plants.yaml --copy-scoring plants.yaml --junctions junctions.bed -bt uniprot_sprot_plants.fasta configuration.yaml - -* *--list list.txt*: this part of the command line instructs Mikado to read the file we just created to understand where the input files are and how to treat them. +* *--list list.txt*: this part of the command line instructs Mikado to read the file we just created to understand + where the input files are and how to treat them. +* *--scoring*: the scoring file to use. Mikado ships with two pre-calculated scoring files, `plant.yaml` and +`mammalian.yaml` +* *--copy-scoring*: instruct Mikado to copy the scoring file from the installation directory to the current +directory, so that the experimenter can modify it as needed. * *--reference chr5.fas*: this part of the command line instructs Mikado on the location of the genome file. -* *--mode permissive*: the mode in which Mikado will treat cases of chimeras. See the :ref:`documentation ` for details. -* *--junctions junctions.bed*: this part of the command line instructs Mikado to consider this file as the source of reliable splicing junctions. -* *-bt uniprot_sprot_plants.fasta*: this part of the command line instructs Mikado to consider this file as the BLAST database which will be used for deriving homology information. +* *--mode permissive*: the mode in which Mikado will treat cases of chimeras. See the :ref:`documentation + ` for details. +* *--junctions junctions.bed*: this part of the command line instructs Mikado to consider this file as the source of + reliable splicing junctions. +* *-bt uniprot_sprot_plants.fasta*: this part of the command line instructs Mikado to consider this file as the BLAST + database which will be used for deriving homology information. -.. hint:: The *--copy-scoring* argument is usually not necessary, however, it allows you to easily inspect the :ref:`scoring file ` we are going to use during this run. +.. hint:: The *--copy-scoring* argument is usually not necessary, however, it allows you to easily inspect the +:ref:`scoring file ` we are going to use during this run. -.. hint:: Mikado provides a handful of pre-configured scoring files for different species. However, we do recommend inspecting and tweaking your scoring file to cater to your species. We provide a guide on how to create your own configuration files :ref:`here `. +.. hint:: Mikado provides a handful of pre-configured scoring files for different species. However, we do recommend +inspecting and tweaking your scoring file to cater to your species. We provide a guide on how to create your own +configuration files :ref:`here `. Mikado prepare ~~~~~~~~~~~~~~ -The subsequent step involves running ``mikado prepare`` to create a :ref:`sorted, non-redundant GTF with all the input assemblies `. As we have already created a configuration file with all the details regarding the input files, this will require us only to issue the command: +The subsequent step involves running ``mikado prepare`` to create a :ref:`sorted, non-redundant GTF with all the +input assemblies `. As we have already created a configuration file with all the details regarding the input +files, this will require us only to issue the command: .. code-block:: bash @@ -117,48 +147,88 @@ To create this file, we will proceed as follows: .. code-block:: bash - blastx -max_target_seqs 5 -num_threads 10 -query mikado_prepared.fasta -outfmt 5 -db uniprot_sprot_plants.fasta -evalue 0.000001 2> blast.log | sed '/^$/d' | gzip -c - > mikado.blast.xml.gz + blastx -max_target_seqs 5 -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send +evalue bitscore ppos btop" + -num_threads 10 -query mikado_prepared.fasta -db uniprot_sprot_plants.fasta -out mikado_prepared.blast.tsv + +This will produce the ``mikado_prepared.blast.tsv`` file, which contains the homology information for the run. + +.. warning:: Mikado requires a **custom** tabular file from BLAST, as we rely on the information on extra fields such + as e.g. ``btop``. Therefore the custom fields following ``-outfmt 6`` are **not** optional. + +ORF calculation for the transcripts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many of the metrics used by Mikado to evaluate and rank transcripts rely on the definition or their coding regions +(CDS). It is therefore *highly recommended* to use an ORF predictor to define the coding regions for each transcript +identified by `mikado prepare`. We directly support two different products: + +* :ref:`Prodigal `, a fast ORF predictor, capable of calculating thousands of + ORFs in seconds. However, as it was originally developed for ORF calling in bacterial genomes, it may occasionally + not provide the best possible answer. +* :ref:`TransDecoder `, a slower ORF predictor that is however more + specialised for eukaryotes. + +For this tutorial we are going to use Prodigal. Using it is very straighforward: + +.. code-block:: bash + + prodigal -i mikado_prepared.fasta -g 1 -o mikado.orfs.gff3 -f gff + + +.. warning:: Prodigal by default uses the 'Bacterial' codon translation table, which is of course not appropriate at + all for our eukariote genome. Therefore, it is essential to set ``-g 1`` on the command line. + By the same token, as prodigal normally would output the CDS prediction in GenBank format (currently not + supported by Mikado), we have to instruct Prodigal to emit its CDS predictions in GFF format. -This will produce the ``mikado.blast.xml.gz`` file, which contains the homology information for the run. Mikado serialise ~~~~~~~~~~~~~~~~ -This step involves running ``mikado serialise`` to create a SQLite database with all the information that mikado needs to perform its analysis. As most of the parameters are already specified inside the configuration file, the command line is quite simple: +This step involves running ``mikado serialise`` to create a SQLite database with all the information that mikado +needs to perform its analysis. As most of the parameters are already specified inside the configuration file, the +command line is quite simple: .. code-block:: bash - mikado serialise --json-conf configuration.yaml --xml mikado.blast.xml.gz --orfs mikado.bed --blast_targets + mikado serialise --json-conf configuration.yaml --xml mikado_prepared.blast.tsv --orfs mikado.orfs.gff3 +--blast_targets uniprot_sprot_plants.fasta --junctions junctions.bed After mikado serialise has run, it will have created two files: #. ``mikado.db``, the SQLite database that will be used by ``pick`` to perform its analysis. #. ``serialise.log``, the log of the run. -If you inspect the SQLite database ``mikado.db``, you will see it contains six different tables:: +If you inspect the SQLite database ``mikado.db``, you will see it contains nine different tables:: + + $ sqlite3 mikado.db ".tables" + chrom hit orf + external hsp query + external_sources junctions target - $ sqlite3 mikado.db - SQLite version 3.8.2 2013-12-06 14:53:30 - Enter ".help" for instructions - Enter SQL statements terminated with a ";" - sqlite> .tables - chrom hit hsp junctions orf query target +These tables contain the information coming from the genome FAI, the BLAST XML, the junctions BED file, +the ORFs BED file, and finally the input transcripts and the proteins. There are two additional tables (``external`` +and ``external_sources``) which in other runs would contain information on additional data, provided as tabular files. -These tables contain the information coming, in order, from the genome FAI, the BLAST XML, the junctions BED file, the ORFs BED file, and finally the input transcripts and the proteins. For more details on the database structure, please refer to the section on :ref:`this step ` in this documentation. +For more details on the database structure, please refer to the section on :ref:`this step ` in this +documentation. Mikado pick ~~~~~~~~~~~ -Finally, during this step ``mikado pick`` will integrate the data present in the database with the positional and structural data present in the GTF file :ref:`to select the best transcript models `. The command line to be issued is the following: +Finally, during this step ``mikado pick`` will integrate the data present in the database with the positional and +structural data present in the GTF file :ref:`to select the best transcript models `. The command line to be +issued is the following: .. code-block:: bash - mikado pick --json-conf configuration.yaml --subloci_out mikado.subloci.gff3 + mikado pick --configuration configuration.yaml --subloci_out mikado.subloci.gff3 At this step, we have to specify only some parameters for ``pick`` to function: -* *json-conf*: the configuration file. This is the only compulsory option. -* *subloci_out*: the partial results concerning the *subloci* step during the selection process will be written to ``mikado.subloci.gff3``. +* *--configuration*: the configuration file. This is the only compulsory option. +* *--subloci_out*: the partial results concerning the *subloci* step during the selection process will be written to +``mikado.subloci.gff3``. ``mikado pick`` will produce the following output files: @@ -329,7 +399,8 @@ After selecting the best transcripts in each locus, Mikado has discarded most of Analysing the tutorial data with Snakemake ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The workflow described in this tutorial can be executed automatically using Snakemake [Snake]_ with :download:`this Snakefile `. Just execute: +The workflow described in this tutorial can be executed automatically using Snakemake [Snake]_ with :download:`this Snakefile `. +Just execute: .. code-block:: bash @@ -341,5 +412,3 @@ in the directory where you have downloaded all of the tutorial files. In graph r :align: center :scale: 50% :figwidth: 100% - - diff --git a/docs/Tutorial/mikado.png b/docs/Tutorial/mikado.png new file mode 100644 index 000000000..3113f6cc4 Binary files /dev/null and b/docs/Tutorial/mikado.png differ diff --git a/docs/Tutorial/snake.png b/docs/Tutorial/snake.png new file mode 100644 index 000000000..2128f4be1 Binary files /dev/null and b/docs/Tutorial/snake.png differ diff --git a/docs/Usage/Compare.rst b/docs/Usage/Compare.rst index 373f7b693..9868832d8 100644 --- a/docs/Usage/Compare.rst +++ b/docs/Usage/Compare.rst @@ -16,7 +16,11 @@ This Mikado utility allows the user to compare the transcripts from any two anno - To understand which prediction transcript best represent each reference model - To have a summary information about the similarity between the two annotations. -Mikado compare has been directly inspired by the popular `Cuffcompare`_ [Cufflinks]_ utility and by `ParsEval`_ [ParsEval]_. Please note that while superficially similar to Cuffcompare in the style of the output files, Mikado compare is more philosophically similar to ParsEval, as it will not try to aggregate transcripts in loci but will perform a pure comparison between the two annotation files. Both GTF and GFF files are accepted, in any combination. +Mikado compare has been directly inspired by the popular `Cuffcompare`_ [Cufflinks]_ utility and by `ParsEval`_ [ParsEval]_. +Please note that while superficially similar to Cuffcompare in the style of the output files, Mikado compare is more +philosophically similar to ParsEval, as it will not try to aggregate transcripts in loci but will perform a pure comparison +between the two annotation files. Mikado compare can accept BAM, BED12, GTF and GFF files as input, both for the reference +or for the prediction. Usage ~~~~~ @@ -24,13 +28,19 @@ Usage Mikado compare is invoked by specifying the *reference* annotation and the desired mode of analysis. There are three possible options: #. In its default mode, compare will ask for a *prediction* annotation to compare the reference against. - #. In the *"self"* mode, compare will do a self-comparison of the reference against itself, excluding as possible results the matches between a transcript and itself. It can be useful to glean the relationships between transcripts and genes in an annotation. - #. In the *"internal"* mode of operations, compare will again perform a self-comparison, focussed on multi-isoform genes. For those, compare will perform and report all possible comparisons. It is useful to understand the relationships between the transcripts in a single locus. + #. In the *"self"* mode, compare will do a self-comparison of the reference against itself, excluding as possible results + the matches between a transcript and itself. It can be useful to glean the relationships between transcripts and genes + in an annotation. + #. In the *"internal"* mode of operations, compare will again perform a self-comparison, focussed on multi-isoform genes. + For those, compare will perform and report all possible comparisons. It is useful to understand the relationships + between the transcripts in a single locus. +Mikado stores the information of the reference in a specialised SQLite index, with a ".midx" suffix, which will be created +by the program upon its first execution with a new reference. If the index file is already present, Mikado will try +to use it rather than read again the annotation. -Mikado stores the information of the reference in a specialised SQLite index, with a ".midx" suffix, which will be created by the program upon its first execution with a new reference. If the index file is already present, Mikado will try to use it rather than read again the annotation. - -.. note: Starting from version 1.5, Mikado compare supports multiprocessing. Please note that memory usage scales approximately **linearly** with the amount of processes requested. +.. note:: Starting from version 1.5, Mikado compare supports multiprocessing. Please note that memory usage scales + approximately **linearly** with the amount of processes requested. Command line ------------ @@ -100,7 +110,8 @@ Mikado compare produces two tabular files, tmap_ and refmap_, and one :ref:`stat TMAP files ---------- -TMAP are tabular files that store the information regarding the best match for each prediction in the reference. The columns are as follows: +TMAP are tabular files that store the information regarding the best match for each prediction in the reference. +The columns are as follows: #. **ref_id**: Transcript ID of the matched reference model(s). #. **ref_gene**: Gene ID of the matched reference model(s). @@ -115,7 +126,9 @@ TMAP are tabular files that store the information regarding the best match for e #. **j_prec**: Splice junction precision of the prediction model ( TP / (number of splice sites in the prediction)) #. **j_recall**: Splice junction recall of the reference model ( TP / (number of splice sites in the reference)) #. **j_f1**: `F1`_ of recall and precision at the splice junction level. -#. **e_prec**: Exon precision of the prediction model ( TP / (number of exons in the prediction)). **NB**: this value is calculated "leniently", ie terminal exons count as a match if the *internal* border is called correctly and the exon is terminal in both prediction and reference. +#. **e_prec**: Exon precision of the prediction model ( TP / (number of exons in the prediction)). + **NB**: this value is calculated "leniently", ie terminal exons count as a match if the *internal* border is called + correctly and the exon is terminal in both prediction and reference. #. **e_recall**: Exon recall of the reference model ( TP / (number of exons in the reference)) #. **e_f1**: `F1`_ of recall and precision at the exon level. #. **distance**: Distance of the model from its putative match. @@ -128,14 +141,16 @@ An example of TMAP file is as follows:: AT5G66600.2 AT5G66600 C cl_Chr5.6272 cl_Chr5.6272.gene 7 9 94.95 72.43 82.18 100.00 75.00 85.71 85.71 66.67 75.00 0 Chr5:26575000..26578087 AT5G66620.1,AT5G66630.1,AT5G66631.1 AT5G66620,AT5G66630,AT5G66631 f,j,j,G st_Stringtie_STAR.21710.15 st_Stringtie_STAR.21710.15.gene 8 11,10,1 19.13,19.95,35.98 54.57,45.65,100.00 28.33,27.76,52.92 28.57,64.29,0.00 20.00,50.00,0.00 23.53,56.25,0.00 12.50,37.50,0.00 9.09,30.00,0.00 10.53,33.33,0.00 0 Chr5:26588402..26598231 -You can notice that the third example is particular as the prediction transcript matches not one but multiple reference transcripts. This is a fusion_ event. +You can notice that the third example is particular as the prediction transcript matches not one but multiple reference +transcripts. This is a fusion_ event. .. _refmap: RefMap files ------------ -RefMap files are tabular files which store the information regarding the best match for each reference transcript, among all possible prediction models. The columns of the file are as follows: +RefMap files are tabular files which store the information regarding the best match for each reference transcript, +among all possible prediction models. The columns of the file are as follows: #. **ref_id**: Transcript ID of the reference model. #. **ccode**: class code of the match. See :ref:`the relevant section on Class codes `. @@ -143,7 +158,8 @@ RefMap files are tabular files which store the information regarding the best ma #. **gid**: Gene ID of the prediction model. #. **nF1**: `F1`_ of recall and precision at the nucleotide level. #. **jF1**: `F1`_ of recall and precision at the splice junction level. -#. **eF1**: `F1`_ of recall and precision at the exon level. **NB**: this value is calculated "leniently", ie terminal exons count as a match if the *internal* border is called correctly and the exon is terminal in both prediction and reference. +#. **eF1**: `F1`_ of recall and precision at the exon level. **NB**: this value is calculated "leniently", ie terminal + exons count as a match if the *internal* border is called correctly and the exon is terminal in both prediction and reference. #. **ref_gene**: Gene ID of the reference model. #. **best_ccode**: Best possible class code found for any of the transcripts of the gene. #. **best_tid**: Transcript ID of the prediction model which fit best one of the transcript models of the reference gene. @@ -164,6 +180,27 @@ An example of a RefMap file is as follows:: Please note that the third example (AT5G66630.1) has as best possible match a fusion_ event. +Extended RefMap files +^^^^^^^^^^^^^^^^^^^^^ + +Mikado can optionally produce more detailed RefMap files, listing the details on recall and precision for each match +(rather than just the F1 for each level). If the flag ``-erm`` is present on the command line, the following extra-fields +will be present in the file: + +#. **nRecall**: recall at the nucleotide level. +#. **nPrecision**: precision at the nucleotide level. +#. **jRecall**: recall at the junction level. +#. **jPrecision**: precision at the junction level. +#. **eRecall**: recall at the exon level. +#. **ePrecision**: precision at the exon level. +#. **best_nRecall**: recall at the nucleotide level, for the best possible comparison. +#. **best_nPrecision**: precision at the nucleotide level, for the best possible comparison. +#. **best_jRecall**: recall at the junction level, for the best possible comparison. +#. **best_jPrecision**: precision at the junction level, for the best possible comparison. +#. **best_eRecall**: recall at the exon level, for the best possible comparison. +#. **best_ePrecision**: precision at the exon level, for the best possible comparison. + + .. _stats: Stats files @@ -172,42 +209,45 @@ Stats files These files provide a summary of the comparison between the reference and the annotation. An example is as follows:: Command line: - /usr/users/ga002/venturil/py351/bin/mikado compare -r reference.gff3 -p mikado.loci.gff3 -o compare -l compare.log - 7 reference RNAs in 5 genes - 15 predicted RNAs in 8 genes + /home/lucve/miniconda3/envs/mikado2/bin/mikado compare -r reference.gff3 -p Daijin/5-mikado/pick/permissive/mikado-permissive.loci.gff3 -o compare -l compare.log + 18 reference RNAs in 12 genes + 22 predicted RNAs in 15 genes --------------------------------- | Sn | Pr | F1 | - Base level: 85.74 64.73 73.77 - Exon level (stringent): 63.83 42.86 51.28 - Exon level (lenient): 80.00 52.94 63.72 - Intron level: 89.47 59.65 71.58 - Intron chain level: 33.33 14.29 20.00 - Transcript level (stringent): 0.00 0.00 0.00 - Transcript level (>=95% base F1): 28.57 13.33 18.18 - Transcript level (>=80% base F1): 42.86 20.00 27.27 - Gene level (100% base F1): 0.00 0.00 0.00 - Gene level (>=95% base F1): 40.00 25.00 30.77 - Gene level (>=80% base F1): 60.00 37.50 46.15 + Base level: 94.90 83.22 88.68 + Exon level (stringent): 80.56 71.60 75.82 + Exon level (lenient): 91.18 76.54 83.22 + Splice site level: 95.19 81.15 87.61 + Intron level: 96.84 88.19 92.31 + Intron level (NR): 94.34 79.37 86.21 + Intron chain level: 69.23 50.00 58.06 + Intron chain level (NR): 69.23 50.00 58.06 + Transcript level (stringent): 55.56 45.45 50.00 + Transcript level (>=95% base F1): 72.22 59.09 65.00 + Transcript level (>=80% base F1): 72.22 59.09 65.00 + Gene level (100% base F1): 75.00 60.00 66.67 + Gene level (>=95% base F1): 83.33 66.67 74.07 + Gene level (>=80% base F1): 83.33 66.67 74.07 # Matching: in prediction; matched: in reference. - Matching intron chains: 2 - Matched intron chains: 2 - Matching monoexonic transcripts: 1 - Matched monoexonic transcripts: 1 - Total matching transcripts: 3 - Total matched transcripts: 3 - - Missed exons (stringent): 17/47 (36.17%) - Novel exons (stringent): 40/70 (57.14%) - Missed exons (lenient): 9/45 (20.00%) - Novel exons (lenient): 32/68 (47.06%) - Missed introns: 4/38 (10.53%) - Novel introns: 23/57 (40.35%) - - Missed transcripts: 0/7 (0.00%) - Novel transcripts: 6/15 (40.00%) - Missed genes: 0/5 (0.00%) - Novel genes: 2/8 (25.00%) + Matching intron chains: 9 + Matched intron chains: 9 + Matching monoexonic transcripts: 4 + Matched monoexonic transcripts: 4 + Total matching transcripts: 13 + Total matched transcripts: 13 + + Missed exons (stringent): 14/72 (19.44%) + Novel exons (stringent): 23/81 (28.40%) + Missed exons (lenient): 6/68 (8.82%) + Novel exons (lenient): 19/81 (23.46%) + Missed introns: 3/53 (5.66%) + Novel introns: 13/63 (20.63%) + + Missed transcripts (0% nF1): 0/18 (0.00%) + Novel transcripts (0% nF1): 3/22 (13.64%) + Missed genes (0% nF1): 0/12 (0.00%) + Novel genes (0% nF1): 3/15 (20.00%) The first section of the file describes: @@ -215,25 +255,39 @@ The first section of the file describes: #. Concordance of the two annotation at the exonic level (recall, precision, and F1), in two ways: * *"stringent"*: only perfect exonic matches are considered. - * *"lenient"*: in this mode, terminal exons are counted as a match if the **internal** border is matched. See the RGASP paper [RGASP]_ for details on the rationale. - + * *"lenient"*: in this mode, terminal exons are counted as a match if the **internal** border is matched. + See the RGASP paper [RGASP]_ for details on the rationale. + #. Concordance of the two annotations in regards with the splice junctions, analysed independently of one another. #. Concordance of the two annotations at the intron level. - #. Concordance of the two annotations at the intron chain level - how many intron chains of the reference are found identical in the prediction. Only multiexonic models are considered for this level. + #. Concordance of the two annotations at the intron chain level - how many intron chains of the reference are found + identical in the prediction. Only multiexonic models are considered for this level. #. Concordance of the two annotations at the transcript level, in three different modes: * *"stringent"*: in this mode, only perfect matches are considered. - * *"95% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *95%* and, for multiexonic transcripts, the intron chain is reconstructed perfectly. - * *"80% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *80%* and, for multiexonic transcripts, the intron chain is reconstructed perfectly. + * *"95% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *95%* and, + for multiexonic transcripts, the intron chain is reconstructed perfectly. + * *"80% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *80%* and, + for multiexonic transcripts, the intron chain is reconstructed perfectly. #. Concordance of the two annotations at the gene level, in three different modes: - * *"stringent"*: in this mode, we consider reference genes for which it was possible to find at least one perfect match for one of its transcripts. - * *"95% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *95%* and, for multiexonic transcripts, the intron chain is reconstructed perfectly. - * *"80% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *80%* and, for multiexonic transcripts, the intron chain is reconstructed perfectly. + * *"stringent"*: in this mode, we consider reference genes for which it was possible to find at least one perfect + match for one of its transcripts. + * *"95% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *95%* and, for + multiexonic transcripts, the intron chain is reconstructed perfectly. The best possible match is considered for + this statistic. + * *"80% base F1"*: in this mode, we only count instances where the nucleotide F1 is greater than *80%* and, for + multiexonic transcripts, the intron chain is reconstructed perfectly. The best possible match is considered for + this statistic. -In the second section, the file reports how many of the intron chains, monoexonic transcripts and total transcripts in the **reference** were *matched* by at least one *matching* **prediction** transcript. Finally, in the third section the file reports the number of missed (present in the reference but not in the prediction) or novel (viceversa - present in the prediction but not in the reference) features. +In the second section, the file reports how many of the intron chains, monoexonic transcripts and total transcripts in +the **reference** were *matched* by at least one *matching* **prediction** transcript. Finally, in the third section the +file reports the number of missed (present in the reference but not in the prediction) or novel (viceversa - present in +the prediction but not in the reference) features. -.. note:: Please note that a gene might be considered as "found" even if its best match is intronic, on the opposite strand, or not directly overlapping it, or is in the opposite strand (see :ref:`next section `, in particular the *Intronic*, *Fragment* and *No overlap* categories). +.. note:: Please note that a gene might be considered as "found" even if its best match is intronic, on the opposite strand, + or not directly overlapping it, or is in the opposite strand (see :ref:`next section `, in particular the + *Intronic*, *Fragment* and *No overlap* categories). .. _ccodes: @@ -241,11 +295,15 @@ In the second section, the file reports how many of the intron chains, monoexoni Class codes ~~~~~~~~~~~ -In addition to recall, precision and F1 values, Mikado assign each comparison between two transcripts a *class code*, which summarises the relationship between the two transcripts. The idea is lifted from the popular tool `Cuffcompare`_, although Mikado greatly extends the catalogue of possible class codes. +In addition to recall, precision and F1 values, Mikado assign each comparison between two transcripts a *class code*, +which summarises the relationship between the two transcripts. The idea is lifted from the popular tool `Cuffcompare`_, +although Mikado greatly extends the catalogue of possible class codes. All class codes fall within one of the following categories: - **Match**: class codes of this type indicate concordance between the two transcript models. - - **Extension**: class codes of this type indicate that one of the two models extends the intron chain of the other, without internal interruptions. The extension can be from either perspective - either the prediction extends the reference, or it is instead *contained* within the reference (so that switching perspectives, the reference would "extend" the prediction). + - **Extension**: class codes of this type indicate that one of the two models extends the intron chain of the other, + without internal interruptions. The extension can be from either perspective - either the prediction extends the + reference, or it is instead *contained* within the reference (so that switching perspectives, the reference would "extend" the prediction). - **Alternative splicing**: the two exon chains overlap but differ in significant ways. - **Intronic**: either the prediction is completely contained within the introns of the reference, or viceversa. - **Overlap**: the two transcript models generically overlap on their exonic sequence. @@ -254,7 +312,13 @@ All class codes fall within one of the following categories: .. _fusion: - - **Fusion**: this special class code is a qualifier and it never appears on its own. When a transcript is defined as a fusion, its class code in the *tmap* file will be an "f" followed by the class codes of the individual transcript matches, sperated by comma. So a prediction which matches two reference models, one with a "j" and another with a "o", will have a class code of **"f,j,o"**. In the *refmap* file, if the fusion is the best match, the class code will be "f" followed by the class code for the individual reference transcript; e.g., **"f,j"** + - **Fusion**: this special class code is a qualifier and it never appears on its own. When a transcript is defined as a + fusion between two or more reference transcript, it will appear on *multiple lines*, one for each of the matches. In + each line, the class code will be a "f," followed by the class code assigned to that particular comparison. + So e.g. a prediction which matches two reference models, one with a "j" and another with a "o", will be present in two + different lines; the first one with a class code of **f,j** and the second with a class code of **f,o**. + In the *refmap* file, if the fusion is the best match, the class code will be "f" followed by the class code for the + individual reference transcript; e.g., **"f,j"** .. topic:: Available class codes @@ -434,7 +498,9 @@ Each calculated match against a reference transcript is stored as a potential *b This function is used to select both for the best match *for the transcript*, as well as to select among these matches for the best match *for the gene*. -The interval tree data structure is created using Cython code originally part of the `bx-python `_, kindly provided by `Dr. Taylor `_ for modification and inclusion in Mikado. The code has been slightly modified for making it Python3 compliant. +The interval tree data structure is created using Cython code originally part of the `bx-python `_, kindly provided by `Dr. Taylor `_ for modification and inclusion in Mikado. We subsequently integrated the improvements made on the code by Dr. Brent Pedersen in his :ref:`quicksect fork `. + +We further modified the original code for allowing "fuzzy matches", as in, allowing some leeway on how far the edges of the query interval are from the target intervals present in the tree. The .midx files storing the annotation for repeated compare runs are SQLite files. In them, Mikado will store for each gene its coordinates, its transcripts, and the location of exons and CDS features. MIDX files make repeated runs quite faster, as the program will not have to re-parse the GFF file. diff --git a/docs/Usage/Configure.rst b/docs/Usage/Configure.rst index 81efa5d60..d98536bc1 100644 --- a/docs/Usage/Configure.rst +++ b/docs/Usage/Configure.rst @@ -30,20 +30,27 @@ The fields in this file are as follows, for each row: ..