diff --git a/.buildinfo b/.buildinfo new file mode 100644 index 0000000..99ec388 --- /dev/null +++ b/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 6ef85c61a07ec8e9f0ed07676e851c59 +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/.doctrees/cmsearch.doctree b/.doctrees/cmsearch.doctree new file mode 100644 index 0000000..64ac9e2 Binary files /dev/null and b/.doctrees/cmsearch.doctree differ diff --git a/.doctrees/cpg.doctree b/.doctrees/cpg.doctree new file mode 100644 index 0000000..dc27bf3 Binary files /dev/null and b/.doctrees/cpg.doctree differ diff --git a/.doctrees/dust.doctree b/.doctrees/dust.doctree new file mode 100644 index 0000000..e27f1d6 Binary files /dev/null and b/.doctrees/dust.doctree differ diff --git a/.doctrees/environment.pickle b/.doctrees/environment.pickle new file mode 100644 index 0000000..6f224fe Binary files /dev/null and b/.doctrees/environment.pickle differ diff --git a/.doctrees/eponine.doctree b/.doctrees/eponine.doctree new file mode 100644 index 0000000..b1f7732 Binary files /dev/null and b/.doctrees/eponine.doctree differ diff --git a/.doctrees/genblast.doctree b/.doctrees/genblast.doctree new file mode 100644 index 0000000..47cc0ef Binary files /dev/null and b/.doctrees/genblast.doctree differ diff --git a/.doctrees/index.doctree b/.doctrees/index.doctree new file mode 100644 index 0000000..800b59e Binary files /dev/null and b/.doctrees/index.doctree differ diff --git a/.doctrees/install.doctree b/.doctrees/install.doctree new file mode 100644 index 0000000..1efdc99 Binary files /dev/null and b/.doctrees/install.doctree differ diff --git a/.doctrees/license.doctree b/.doctrees/license.doctree new file mode 100644 index 0000000..6af76fe Binary files /dev/null and b/.doctrees/license.doctree differ diff --git a/.doctrees/minimap.doctree b/.doctrees/minimap.doctree new file mode 100644 
index 0000000..ea4bd5f Binary files /dev/null and b/.doctrees/minimap.doctree differ diff --git a/.doctrees/red.doctree b/.doctrees/red.doctree new file mode 100644 index 0000000..b861098 Binary files /dev/null and b/.doctrees/red.doctree differ diff --git a/.doctrees/repeatmasker.doctree b/.doctrees/repeatmasker.doctree new file mode 100644 index 0000000..1cf46e0 Binary files /dev/null and b/.doctrees/repeatmasker.doctree differ diff --git a/.doctrees/scallop.doctree b/.doctrees/scallop.doctree new file mode 100644 index 0000000..501d6f9 Binary files /dev/null and b/.doctrees/scallop.doctree differ diff --git a/.doctrees/star.doctree b/.doctrees/star.doctree new file mode 100644 index 0000000..22a4124 Binary files /dev/null and b/.doctrees/star.doctree differ diff --git a/.doctrees/stringtie.doctree b/.doctrees/stringtie.doctree new file mode 100644 index 0000000..771b04c Binary files /dev/null and b/.doctrees/stringtie.doctree differ diff --git a/.doctrees/trf.doctree b/.doctrees/trf.doctree new file mode 100644 index 0000000..6635d44 Binary files /dev/null and b/.doctrees/trf.doctree differ diff --git a/.doctrees/trnascan.doctree b/.doctrees/trnascan.doctree new file mode 100644 index 0000000..d0a884d Binary files /dev/null and b/.doctrees/trnascan.doctree differ diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/_modules/ensembl/tools/anno/protein_annotation/genblast.html b/_modules/ensembl/tools/anno/protein_annotation/genblast.html new file mode 100644 index 0000000..494d08b --- /dev/null +++ b/_modules/ensembl/tools/anno/protein_annotation/genblast.html @@ -0,0 +1,603 @@ + + + + + + + ensembl.tools.anno.protein_annotation.genblast — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.protein_annotation.genblast

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+GenBlast identifies homologous gene sequences in genomic databases.
+One of the key features of GenBlast is its flexibility to handle
+comparative genomics tasks and accurately identify homologs even when
+the sequences have undergone significant evolutionary changes.
+This capability makes it a valuable resource for researchers studying gene
+evolution, gene families, and gene function across diverse species.
+
+GenBlast has been widely used in various genomic analyses and is available as
+a standalone command-line tool or as part of different bioinformatics pipelines.
+Researchers in the field of comparative genomics and gene function analysis
+often rely on GenBlast to perform sensitive homology searches and obtain
+valuable insights into the evolutionary relationships and functional conservation
+of genes in different organisms.
+
+
+She, R., Chu, J.S., Uyar, B., Wang, J., Wang, K., and Chen, N. (2011).
+GenBlastA: enabling BLAST to identify homologous gene sequences.
+Genome Res., 21(5): 936-949.
+"""
+__all__ = ["run_genblast"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from pathlib import Path
+import random
+import re
+import shutil
+import signal
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
def run_genblast(  # pylint:disable=dangerous-default-value
    masked_genome: Path,
    output_dir: Path,
    protein_dataset: Path,
    max_intron_length: int,
    genblast_timeout_secs: int = 10800,
    genblast_bin: Path = Path("genblast"),
    convert2blastmask_bin: Path = Path("convert2blastmask"),
    makeblastdb_bin: Path = Path("makeblastdb"),
    num_threads: int = 1,
    protein_set: str = ["uniprot", "orthodb"],
) -> None:
    """
    Executes GenBlast on batches of the protein dataset against the masked genome.

    The final result is written to ``<output_dir>/<protein_set>_output/annotation.gtf``.
    If that file already exists and contains transcripts the analysis is skipped.

    Args:
        masked_genome : Masked genome file path.
        output_dir: Working directory path.
        protein_dataset: Protein dataset (Uniprot/OrthoDb) path.
        max_intron_length: Maximum intron length.
        genblast_timeout_secs: Timeout for each GenBlast batch (sec).
        genblast_bin : Software path.
        convert2blastmask_bin: Software path.
        makeblastdb_bin : Software path.
        num_threads: Number of worker processes (also used as the batch size).
        protein_set: Either "uniprot" or "orthodb". The signature default is a
            list of the accepted values and is NOT itself a valid value, so
            callers must always pass this argument explicitly.

    Raises:
        ValueError: If protein_set is not "uniprot" or "orthodb".
        IOError: If the masked genome or the protein dataset does not exist.
    """
    output_subdirs = {"uniprot": "uniprot_output", "orthodb": "orthodb_output"}
    # isinstance first: the (list) default is unhashable and cannot be looked up.
    if not isinstance(protein_set, str) or protein_set not in output_subdirs:
        raise ValueError(
            f"protein_set must be one of {sorted(output_subdirs)}, got: {protein_set!r}"
        )

    check_exe(genblast_bin)
    check_exe(convert2blastmask_bin)
    check_exe(makeblastdb_bin)
    genblast_dir = create_dir(output_dir, output_subdirs[protein_set])

    output_file = genblast_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "transcript")
        if transcript_count > 0:
            logger.info("Genblast gtf file exists, skipping analysis")
            return

    alignscore_file = genblast_dir / "alignscore.txt"
    logger.info("Checking for GenBlast alignscore file: %s", alignscore_file)
    if not alignscore_file.exists():
        # The scoring matrix ships with the repository, six levels above this module.
        repo_root_dir = Path(__file__).parents[6]
        shutil.copy(repo_root_dir / "data" / "alignscore.txt", genblast_dir)

    if not masked_genome.exists():
        raise IOError(f"Masked genome file does not exist: {masked_genome}")
    if not protein_dataset.exists():
        raise IOError(f"Protein file does not exist: {protein_dataset}")

    asnb_file = Path(f"{masked_genome}.asnb")
    if asnb_file.exists():
        logger.info("Found an existing asnb, so will skip convert2blastmask")
    else:
        _run_convert2blastmask(convert2blastmask_bin, masked_genome, asnb_file)
    _run_makeblastdb(makeblastdb_bin, masked_genome, asnb_file)

    batched_protein_files = _split_protein_file(
        protein_dataset, genblast_dir, num_threads
    )

    def _log_worker_error(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback is given.
        logger.error("GenBlast worker failed: %s", error)

    with multiprocessing.Pool(num_threads) as pool:
        for batched_protein_file in batched_protein_files:
            pool.apply_async(
                _multiprocess_genblast,
                args=(
                    batched_protein_file,
                    masked_genome,
                    genblast_bin,
                    genblast_timeout_secs,
                    max_intron_length,
                ),
                error_callback=_log_worker_error,
            )
        # close + join inside the context: leaving it calls terminate().
        pool.close()
        pool.join()

    _generate_genblast_gtf(genblast_dir)
    # Materialise the listing first because the directories are removed while looping.
    for bin_dir in list(genblast_dir.glob("bin_*")):
        shutil.rmtree(bin_dir)
    logger.info("Completed running GenBlast")
def _multiprocess_genblast(
    protein_file: Path,
    masked_genome: Path,
    genblast_bin: Path,
    genblast_timeout: int,
    max_intron_length: int,
) -> None:
    """
    Executes GenBlast on a single batch of proteins.

    On timeout an empty ``<protein_file>.except`` marker is created and the
    whole process group of the GenBlast run is sent SIGTERM.

    Args:
        protein_file: Path of a single batched file.
        masked_genome : Masked genome file path.
        genblast_bin : Software path.
        genblast_timeout: Time for timeout (sec).
        max_intron_length: Maximum intron length.

    Command line options:
        -P Search program used to produce HSPs,
           can be either "blast" or "wublast", default is "blast", optional
        -p specifies the program option of genBlast: genblasta or genblastg
        -q List of query sequences to blast, must be in fasta format, required
        -t The target database of genomic sequences in fasta format, required
        -g parameter for blast: Perform gapped alignment (T/F) [default: F], optional
        -d parameter for genBlast: maximum allowed distance between HSPs
           within the same gene, a non-negative integer [default: 100000], optional
        -r parameter for genBlast: number of ranks in the output,
           a positive integer, optional
        -e parameter for blast: The e-value, [default: 1e-2], optional
        -c parameter for genBlast: minimum percentage of query gene
           coverage in the output, between 0 and 1 (e.g. for 50%
           gene coverage, use "0.5"), optional
        -W parameter for blast: Set word size, 0 means using blast default
           [default: 0], optional
        -scodon The number of base pairs to search for start codon within the
           region of HSP group (inside the first HSP). If not specified, default is 15.
        -i parameter for genBlastG: minimum intron length, optional.
           If not specified, the default value is 15.
        -x parameter for genBlastG: minimum internal exon length, optional.
           If not specified, default is 20.
        -n parameter for genBlastG: maximum number of splice sites per region,
           optional. If not specified, default is 20.
        -gff output options: turn on GFF output
        -o output filename, optional. If not specified, the output
           will be the same as the query filename with ".gblast" extension.
        -pid turn on final alignment PID computation (global alignment between
           predicted gene and query) in output.
        -softmask With this option NCBI blast will create a masking library,
           you need to use it when blasting against a whole genome
    """
    logger.info("Running GenBlast on : %s", protein_file)

    genblast_cmd = [
        str(genblast_bin),
        "-p",
        "genblastg",
        "-q",
        str(protein_file),
        "-t",
        str(masked_genome),
        "-g",
        "T",
        "-pid",
        "-r",
        "1",
        "-P",
        "blast",
        "-gff",
        "-e",
        "1e-1",
        "-c",
        "0.8",
        "-W",
        "3",
        "-softmask",
        "-scodon",
        "50",
        "-i",
        "30",
        "-x",
        "10",
        "-n",
        "30",
        "-d",
        str(max_intron_length),
        "-o",
        str(protein_file),
    ]

    logger.info(" ".join(genblast_cmd))
    # Using the child process termination as described here:
    # https://alexandra-zaharia.github.io/posts/kill-subprocess
    # -and-its-children-on-timeout-python/
    # start_new_session puts GenBlast (and the blast children it spawns) in its
    # own process group so the whole group can be signalled on timeout.
    process = subprocess.Popen(  # pylint:disable=consider-using-with
        genblast_cmd, start_new_session=True
    )
    try:
        process.wait(timeout=genblast_timeout)
    except subprocess.TimeoutExpired:
        logger.error("Timeout reached for file: %s \n", protein_file)
        # Leave a marker next to the batch so timed-out inputs can be identified.
        Path(f"{protein_file}.except").touch()
        os.killpg(os.getpgid(process.pid), signal.SIGTERM)


def _generate_genblast_gtf(genblast_dir: Path) -> None:
    """
    Collect the GenBlast output and create the final gtf file.

    Every ``*.gff`` file found under genblast_dir is converted and appended to
    ``annotation.gtf``; intermediate blast files are deleted.

    Args:
        genblast_dir: Working directory path.
    """
    output_file = genblast_dir / "annotation.gtf"
    logger.info("Collecting GenBlast output into %s", output_file)
    # Path.suffix only returns the text after the last dot, so multi-part
    # endings have to be matched against the full file name.
    intermediate_endings = (
        ".fa.blast",
        ".fa.blast.report",
        "_1.1c_2.3_s1_0_16_1",
    )
    with open(output_file, "w+", encoding="utf8") as file_out:
        for path in genblast_dir.rglob("*"):
            if not path.is_file():
                continue
            if path.suffix == ".gff":
                file_out.write(_convert_genblast_gff_to_gtf(path))
            elif path.name.endswith(intermediate_endings):
                path.unlink()


def _write_protein_batch(record: str, output_dir: Path, batch_number: int) -> Path:
    """Write one batch of FASTA records into a randomly chosen bin directory."""
    bin_num = random.randint(0, 9)
    batch_file = output_dir / f"bin_{bin_num}" / f"{batch_number}.fa"
    with batch_file.open("w+", encoding="utf8") as file_out:
        file_out.write(record)
    return batch_file


def _split_protein_file(
    protein_dataset: Path, output_dir: Path, batch_size: int = 20
) -> List[Path]:
    """
    Split the protein dataset into batch files of batch_size sequences each.

    Ten directories (``bin_0`` .. ``bin_9``) are created under output_dir and
    every batch file is written to one of them, chosen at random.

    Args:
        protein_dataset : Path for the protein dataset.
        output_dir : Output directory path.
        batch_size : Number of sequences per batch file.

    Returns:
        Paths of the batch files that were written.
    """
    batched_protein_files = []

    for i in range(0, 10):
        create_dir(output_dir, f"bin_{i}")

    with open(protein_dataset, "r", encoding="utf8") as file_in:
        seq_count = 0
        batch_count = 0
        record_lines: List[str] = []
        initial_seq = True
        for line in file_in:
            is_header = re.search(r">(.+)$", line) is not None
            # A header that arrives once the current batch is full closes that
            # batch; the header itself becomes the first line of the next one.
            if is_header and not initial_seq and seq_count % batch_size == 0:
                batched_protein_files.append(
                    _write_protein_batch("".join(record_lines), output_dir, batch_count)
                )
                batch_count += 1
                seq_count += 1
                record_lines = [line]
            elif is_header:
                record_lines.append(line)
                initial_seq = False
                seq_count += 1
            else:
                record_lines.append(line)

        # Whatever is left after the last header forms the final (partial) batch.
        if record_lines:
            batched_protein_files.append(
                _write_protein_batch("".join(record_lines), output_dir, batch_count)
            )
    return batched_protein_files


def _run_convert2blastmask(
    convert2blastmask_bin: Path, masked_genome: Path, asnb_file: Path
) -> None:
    """
    Convert masking information in lower-case masked FASTA input to file
    formats suitable for makeblastdb.

    Args:
        convert2blastmask_bin : Software path.
        masked_genome: Path of masked genome file.
        asnb_file: Path of the mask file to write.
    """
    logger.info("Running convert2blastmask prior to GenBlast:")
    cmd = [
        str(convert2blastmask_bin),
        "-in",
        str(masked_genome),
        "-parse_seqids",
        "-masking_algorithm",  # mask_program_name
        "other",
        "-masking_options",  # mask_program_options
        '"REpeatDetector, default"',
        "-outfmt",  # output_format
        "maskinfo_asn1_bin",
        "-out",
        str(asnb_file),
    ]
    logger.info(" ".join(cmd))
    subprocess.run(cmd, check=True)
    logger.info("Completed running convert2blastmask")


def _run_makeblastdb(makeblastdb_bin: Path, masked_genome: Path, asnb_file: Path) -> None:
    """
    Create the BLAST database for the masked genome.

    The exit status of makeblastdb is not checked.

    Args:
        makeblastdb_bin : Software path.
        masked_genome: Path of masked genome file.
        asnb_file: Path of the mask file produced by convert2blastmask.
    """
    logger.info("Running makeblastdb prior to GenBlast")
    subprocess.run(  # pylint:disable=subprocess-run-check
        [
            str(makeblastdb_bin),
            "-in",
            str(masked_genome),
            "-dbtype",  # molecule_type
            "nucl",
            "-parse_seqids",
            "-mask_data",
            str(asnb_file),
            "-max_file_sz",  # number_of_bytes
            "10000000000",
        ]
    )
    logger.info("Completed running makeblastdb")


def _convert_genblast_gff_to_gtf(gff_file: Path) -> str:
    """
    Convert the content of a GenBlast gff file to gtf format.

    Only lines made of exactly nine whitespace-separated columns are kept;
    ``coding_exon`` features are renamed to ``exon``.

    Args:
        gff_file: Path for the gff file.

    Returns:
        The converted lines, each terminated by a newline.
    """
    converted_lines = []
    with open(gff_file, "r", encoding="utf8") as file_in:
        for line in file_in:
            results = line.split()
            if len(results) != 9:
                continue
            if results[2] == "coding_exon":
                results[2] = "exon"
            results[8] = _set_genblast_attributes(str(results[8]), str(results[2]))
            converted_lines.append("\t".join(results) + "\n")
    return "".join(converted_lines)


def _set_genblast_attributes(attributes: str, feature_type: str) -> str:
    """
    Given the attribute column of the GenBlast output, build the gtf attributes.

    Args:
        attributes: GenBlast attribute column (``key=value`` pairs joined by ";").
        feature_type: "transcript" or "exon"; any other value yields "".

    Returns:
        The gtf attribute string.

    Raises:
        ValueError: If the attributes do not have the expected GenBlast layout.

    Example genBlast output #pylint: disable=line-too-long, trailing-whitespace
    1 genBlastG transcript 131128674 131137049 252.729 - . ID=259447-R1-1-A1;Name=259447;PID=84.65;Coverage=94.22;Note=PID:84.65-Cover:94.22
    1 genBlastG coding_exon 131137031 131137049 . - . ID=259447-R1-1-A1-E1;Parent=259447-R1-1-A1
    1 genBlastG coding_exon 131136260 131136333 . - . ID=259447-R1-1-A1-E2;Parent=259447-R1-1-A1
    1 genBlastG coding_exon 131128674 131130245 . - . ID=259447-R1-1-A1-E3;Parent=259447-R1-1-A1
    """
    converted_attributes = ""
    split_attributes = attributes.split(";")
    if feature_type == "transcript":
        match = re.search(r"Name\=(.+)$", split_attributes[1])
        if match is None:
            raise ValueError(f"Unexpected GenBlast transcript attributes: {attributes}")
        name = match.group(1)
        converted_attributes = f'gene_id "{name}"; transcript_id "{name}";'
    elif feature_type == "exon":
        match = re.search(r"\-E(\d+);Parent\=(.+)\-R\d+\-\d+\-", attributes)
        if match is None:
            raise ValueError(f"Unexpected GenBlast exon attributes: {attributes}")
        exon_rank = match.group(1)
        name = match.group(2)
        converted_attributes = (
            f'gene_id "{name}"; transcript_id "{name}"; exon_number "{exon_rank}";'
        )

    return converted_attributes


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run GenBlast."""

    masked_genome_file = argschema.fields.InputFile(
        required=True, description="Masked genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    protein_file = argschema.fields.String(
        required=True, description="Path for the protein dataset"
    )
    genblast_timeout_secs = argschema.fields.Integer(
        required=False, default=10800, description="Genblast timeout period"
    )
    max_intron_length = argschema.fields.Integer(
        required=True, description="Maximum intron length"
    )
    genblast_bin = argschema.fields.String(
        required=False,
        default="genblast",
        description="Genblast executable path",
    )
    convert2blastmask_bin = argschema.fields.String(
        required=False,
        default="convert2blastmask",
        description="convert2blastmask executable path",
    )
    makeblastdb_bin = argschema.fields.String(
        required=False, default="makeblastdb", description="makeblastdb executable path"
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )
    protein_set = argschema.fields.String(
        required=True,
        description="Protein set [uniprot,orthodb]",
        validate=lambda x: x in ["uniprot", "orthodb"],
    )


def main() -> None:
    """Genblast's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "genblast.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_genblast(
        Path(mod.args["masked_genome_file"]),
        Path(mod.args["output_dir"]),
        Path(mod.args["protein_file"]),
        mod.args["max_intron_length"],
        mod.args["genblast_timeout_secs"],
        Path(mod.args["genblast_bin"]),
        Path(mod.args["convert2blastmask_bin"]),
        Path(mod.args["makeblastdb_bin"]),
        mod.args["num_threads"],
        mod.args["protein_set"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/dust.html b/_modules/ensembl/tools/anno/repeat_annotation/dust.html new file mode 100644 index 0000000..3a8cf2c --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/dust.html @@ -0,0 +1,310 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.dust — ensembl-anno 0.1 documentation + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.dust

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+DustMasker is a program that identifies and masks out low complexity
+parts of a genome using a new and improved DUST algorithm.
+
+Morgulis A, Gertz EM, Schaffer AA, Agarwala R. A Fast and Symmetric
+DUST Implementation to Mask Low-Complexity DNA Sequences.
+"""
+__all__ = ["run_dust"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+import tempfile
+from typing import List
+import argschema
+
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
def run_dust(
    genome_file: PathLike,
    output_dir: Path,
    dust_bin: Path = Path("dustmasker"),
    num_threads: int = 1,
) -> None:
    """
    Run Dust on genomic slices with multiprocessing.

    The merged result is written to ``<output_dir>/dust_output/annotation.gtf``.
    If that file already exists and contains repeats the analysis is skipped.
    The working directory of the process is changed to the Dust output
    directory as a side effect.

    Args:
        genome_file : Genome file path.
        output_dir : Working directory path.
        dust_bin : Dust software path.
        num_threads: Number of threads.
    """
    check_exe(dust_bin)
    # Make both paths absolute BEFORE changing directory; relative paths would
    # otherwise point somewhere else once the working directory moves.
    genome_file = Path(genome_file).resolve()
    dust_dir = Path(create_dir(output_dir, "dust_output")).resolve()
    os.chdir(str(dust_dir))

    output_file = dust_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "repeat")
        if transcript_count > 0:
            logger.info("Dust gtf file exists, skipping analysis")
            return

    logger.info("Creating list of genomic slices")
    seq_region_to_length = get_seq_region_length(genome_file, 5000)
    slice_ids_per_region = get_slice_id(
        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
    )
    # The slice file name is appended per slice by the worker.
    dust_cmd = [str(dust_bin), "-in"]

    def _log_worker_error(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback is given.
        logger.error("Dust worker failed: %s", error)

    with multiprocessing.Pool(num_threads) as pool:
        for slice_id in slice_ids_per_region:
            pool.apply_async(
                _multiprocess_dust,
                args=(
                    dust_cmd,
                    slice_id,
                    dust_dir,
                    genome_file,
                ),
                error_callback=_log_worker_error,
            )
        # close + join inside the context: leaving it calls terminate().
        pool.close()
        pool.join()

    slice_output_to_gtf(dust_dir, "repeat_id", "dust", True, ".dust.gtf")
    # The per-slice files are no longer needed after the merge above.
    for gtf_file in list(dust_dir.glob("*.dust.gtf")):
        gtf_file.unlink()
def _multiprocess_dust(  # pylint: disable=too-many-locals
    dust_cmd: List[str],
    slice_id: List[str],
    dust_dir: Path,
    genome_file: Path,
) -> None:
    """
    Run Dust on a single genomic slice.

    The slice sequence is written to a temporary FASTA file, Dust is run on
    it, and the result is stored as ``<region>.rs<start>.re<end>.dust.gtf``
    in dust_dir.

    Args:
        dust_cmd: Dust command prefix; the slice file is appended to a copy of it.
        slice_id: Region name, start and end of the slice.
        dust_dir : Dust output directory path.
        genome_file : Genome file.
    """
    region_name, start, end = slice_id
    logger.info(
        "Processing slice to find low complexity regions with Dust: %s:%s:%s",
        region_name,
        start,
        end,
    )
    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, dust_dir)
    slice_name = f"{region_name}.rs{start}.re{end}"
    region_results = dust_dir / f"{slice_name}.dust.gtf"
    with tempfile.TemporaryDirectory(dir=dust_dir) as tmpdirname:
        # tmpdirname already includes dust_dir, so it must not be joined onto it again.
        slice_file = Path(tmpdirname) / f"{slice_name}.fa"
        with open(slice_file, "w+", encoding="utf8") as region_out:
            region_out.write(f">{region_name}\n{seq}\n")
        output_file = Path(f"{slice_file}.dust")
        # Build a new list rather than appending to the caller's command.
        slice_cmd = [str(arg) for arg in dust_cmd] + [str(slice_file)]
        logger.info("dust_cmd: %s", slice_cmd)
        with open(output_file, "w+", encoding="utf8") as dust_out:
            subprocess.run(slice_cmd, stdout=dust_out, check=True)
        _create_dust_gtf(output_file, region_results, region_name)
        # slice_file and output_file are removed together with the temporary directory.


def _create_dust_gtf(
    output_file: Path,
    region_results: Path,
    region_name: str,
) -> None:
    """
    Read the Dust output of one slice and save its intervals in gtf format.

    Each ``<start> - <end>`` line becomes one repeat feature; both coordinates
    are incremented by one.

    Args:
        output_file : Dust output file to read.
        region_results : GTF file to write for this slice.
        region_name : Name of the sequence region the slice belongs to.
    """
    with open(output_file, "r", encoding="utf8") as dust_in, open(
        region_results, "w+", encoding="utf8"
    ) as dust_out:
        repeat_count = 1
        for line in dust_in:
            result_match = re.search(r"(\d+)\ - (\d+)", line)
            if result_match:
                start = int(result_match.group(1)) + 1
                end = int(result_match.group(2)) + 1
                gtf_line = (
                    f"{region_name}\tDust\trepeat\t{start}\t"
                    f'{end}\t.\t+\t.\trepeat_id "{repeat_count}";\n'
                )
                dust_out.write(gtf_line)
                repeat_count += 1


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run DustMasker."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    dust_bin = argschema.fields.String(
        required=False,
        default="dustmasker",
        description="Dust executable path",
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )


def main() -> None:
    """Dust's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "dust.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    # Hand over Path objects, as the run_dust signature declares.
    run_dust(
        Path(mod.args["genome_file"]),
        Path(mod.args["output_dir"]),
        Path(mod.args["dust_bin"]),
        mod.args["num_threads"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/red.html b/_modules/ensembl/tools/anno/repeat_annotation/red.html new file mode 100644 index 0000000..61a905f --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/red.html @@ -0,0 +1,272 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.red — ensembl-anno 0.1 documentation + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.red

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Red is the first repeat-detection tool capable of labeling its training data
+and training itself automatically on an entire genome.
+Girgis, H.Z. Red: an intelligent, rapid, accurate tool for detecting repeats
+de-novo on the genomic scale. BMC Bioinformatics 16, 227 (2015).
+https://doi.org/10.1186/s12859-015-0654-5
+"""
+__all__ = ["run_red"]
+
+import logging
+import logging.config
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
def run_red(
    genome_file: Path,
    output_dir: Path,
    red_bin: Path = Path("Red"),
) -> str:
    """
    Run Red on the genome file and convert its repeat coordinates to gtf.

    Red is given a directory holding a symbolic link to the genome file. The
    masked genome is written to ``red_output/mask_output``, the repeat
    coordinates to ``red_output/repeat_output`` and the gtf to
    ``red_output/annotation.gtf``. If the masked genome already exists
    nothing is run.

    Args:
        genome_file : Genome file path.
        output_dir : Working directory path.
        red_bin : Red software path.

    Returns:
        Path of the masked genome file.

    Raises:
        FileNotFoundError: If the genome file cannot be reached through the link.
        subprocess.CalledProcessError: If Red exits with a non-zero status.
    """
    check_exe(red_bin)
    genome_file = Path(genome_file)
    red_dir = create_dir(output_dir, "red_output")
    red_mask_dir = create_dir(red_dir, "mask_output")
    red_repeat_dir = create_dir(red_dir, "repeat_output")
    red_genome_dir = create_dir(red_dir, "genome_dir")

    red_genome_file = red_genome_dir / genome_file.name
    masked_genome_file = red_mask_dir / f"{genome_file.stem}.msk"
    repeat_coords_file = red_repeat_dir / f"{genome_file.stem}.rpt"
    output_file = red_dir / "annotation.gtf"

    if masked_genome_file.exists():
        logger.warning(
            "Masked Genome file already found on the path to the Red mask output dir. "
            "Will not create a new file"
        )
        return str(masked_genome_file)

    if red_genome_file.exists():
        logger.warning(
            "Unmasked genome file already found on the path to the Red genome dir, "
            "will not create a sym link"
        )
    else:
        # A link to a relative path is interpreted relative to the link's own
        # directory, so always link to the absolute location.
        link_target = genome_file.resolve()
        logger.info(
            "Preparing to sym link the genome file to the Red genome dir. Cmd\n %s",
            f"ln -s {link_target} {red_genome_dir}",
        )
        if red_genome_file.is_symlink():
            # exists() is False for a dangling link; replace it instead of failing.
            red_genome_file.unlink()
        red_genome_file.symlink_to(link_target)

    if not red_genome_file.exists():
        logger.error(
            "Could not find the genome file in the Red genome dir or sym link "
            "to the original file. Path expected:\n%s",
            genome_file,
        )
        raise FileNotFoundError(f"Genome file not reachable for Red: {red_genome_file}")

    logger.info("Running Red")
    try:
        subprocess.run(
            [
                str(red_bin),
                "-gnm",
                str(red_genome_dir),
                "-msk",
                str(red_mask_dir),
                "-rpt",
                str(red_repeat_dir),
            ],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # Report the real failure and stop: without Red output there is no
        # repeat coordinates file to convert.
        logger.exception("Red failed on genome dir: %s", red_genome_dir)
        raise

    _create_red_gtf(repeat_coords_file, output_file)
    return str(masked_genome_file)
def _create_red_gtf(repeat_coords_file: Path, output_file: Path):
    """
    Write the repeats listed in the Red coordinates file as gtf features.

    Lines of the form ``>region:start-end`` are converted; any other line is
    ignored. The repeat id is the 1-based number of the input line.

    Args:
        repeat_coords_file: Coordinates for repeats.
        output_file : GTF file with the final results.
    """
    with open(output_file, "w+", encoding="utf8") as gtf_handle, open(
        repeat_coords_file, "r", encoding="utf8"
    ) as coords_handle:
        line_number = 0
        for coords_line in coords_handle:
            line_number += 1
            parsed = re.search(r"^\>(.+)\:(\d+)\-(\d+)", coords_line)
            if parsed is None:
                continue
            region_name, raw_start, raw_end = parsed.groups()
            # Note that Red is 0-based, so add 1
            start = int(raw_start) + 1
            end = int(raw_end) + 1
            gtf_handle.write(
                f"{region_name}\tRed\trepeat\t{start}\t"
                f'{end}\t.\t+\t.\trepeat_id "{line_number}";\n'
            )


class InputSchema(argschema.ArgSchema):
    """Command-line/JSON arguments accepted by the Red runner."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    red_bin = argschema.fields.String(
        required=False,
        default="Red",
        description="Red executable path",
    )


def main() -> None:
    """Parse the arguments, configure logging and run Red."""
    parser = argschema.ArgSchemaParser(schema_type=InputSchema)
    arguments = parser.args
    log_dir = create_dir(arguments["output_dir"], "log")
    logging_conf = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        logging_conf,
        defaults={"logfilename": str(log_dir / "red.log")},
        disable_existing_loggers=False,
    )
    run_red(
        Path(arguments["genome_file"]),
        arguments["output_dir"],
        arguments["red_bin"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/repeatmasker.html b/_modules/ensembl/tools/anno/repeat_annotation/repeatmasker.html new file mode 100644 index 0000000..0aa3ca3 --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/repeatmasker.html @@ -0,0 +1,369 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.repeatmasker — ensembl-anno 0.1 documentation + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.repeatmasker

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    RepeatMasker is a program that screens DNA sequences for interspersed
+    repeats and low complexity DNA sequences.
+    Smit, AFA, Hubley, R & Green, P. RepeatMasker Open-4.0
+"""
+
+__all__ = ["run_repeatmasker"]
+
+import json
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
# Use the module's dotted name (not the literal string "__name__") so the
# logger sits in the package hierarchy and picks up its handlers and levels.
logger = logging.getLogger(__name__)
+
+
+
def run_repeatmasker(
    genome_file: PathLike,
    output_dir: Path,
    repeatmasker_bin: Path = Path("RepeatMasker"),
    library: str = "",
    repeatmasker_engine: str = "rmblast",
    species: str = "",
    num_threads: int = 1,
) -> None:
    """
    Executes RepeatMasker on the genome slices and stores the final annotation.gtf
    in repeatmasker_output.

    Args:
        genome_file : Genome file path.
        output_dir : Output directory path.
        repeatmasker_bin : RepeatMasker executable path.
        library : Custom repeat library; when empty, ``species`` is used instead.
        repeatmasker_engine : Search engine passed to RepeatMasker's ``-engine``.
        species : Species name, defaults to "homo" when no library is given.
        num_threads: Number of worker processes.
    """
    check_exe(repeatmasker_bin)
    repeatmasker_dir = create_dir(output_dir, "repeatmasker_output")

    output_file = repeatmasker_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "repeat")
        if transcript_count > 0:
            logger.info("Repeatmasker gtf file exists")
            return
    logger.info("Creating list of genomic slices")
    seq_region_to_length = get_seq_region_length(genome_file, 5000)
    slice_ids_per_region = get_slice_id(
        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
    )
    repeatmasker_cmd = [
        str(repeatmasker_bin),
        "-nolow",  # does not display simple repeats or low_complexity DNA in the annotation
        "-engine",
        repeatmasker_engine,
        "-dir",
        str(repeatmasker_dir),
    ]
    if library:
        repeatmasker_cmd.extend(["-lib", library])
    else:
        repeatmasker_cmd.extend(["-species", species or "homo"])
    logger.info("Running RepeatMasker %s", repeatmasker_cmd)

    def _log_slice_failure(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback is given.
        logger.error("RepeatMasker failed on a genomic slice: %s", error)

    with multiprocessing.Pool(int(num_threads)) as pool:
        for slice_id in slice_ids_per_region:
            pool.apply_async(
                _multiprocess_repeatmasker,
                args=(
                    repeatmasker_cmd,
                    slice_id,
                    genome_file,
                    repeatmasker_dir,
                ),
                error_callback=_log_slice_failure,
            )
        # Wait for every slice before the context manager terminates the pool.
        pool.close()
        pool.join()
    slice_output_to_gtf(repeatmasker_dir, "repeat_id", "repeatmask", True, ".rm.gtf")
    # The per-slice GTF files are merged by now, remove them.
    for gtf_file in repeatmasker_dir.glob("*.rm.gtf"):
        gtf_file.unlink()
def _multiprocess_repeatmasker(  # pylint: disable=too-many-locals
    repeatmasker_cmd: List[str],
    slice_id: List[str],
    genome_file: Path,
    repeatmasker_dir: Path,
) -> None:
    """
    Run Repeatmasker on genomic slice

    The slice sequence is written to a fasta file in the output directory,
    RepeatMasker is run on it, its ``.out`` table is converted to a per-slice
    GTF and the intermediate files are removed.

    Args:
        repeatmasker_cmd: RepeatMasker command to execute.
        slice_id: Slice ID to run RepeatMasker on (region name, start, end).
        genome_file : Genome file path.
        repeatmasker_dir : RepeatMasker output directory path.
    """
    region_name, start, end = slice_id
    logger.info(
        "Processing slice to find repeats with RepeatMasker: %s:%s:%s",
        region_name,
        start,
        end,
    )
    seq = get_sequence(
        region_name, int(start), int(end), 1, genome_file, repeatmasker_dir
    )
    slice_file_name = f"{region_name}.rs{start}.re{end}"
    region_file = repeatmasker_dir / f"{slice_file_name}.fa"
    with open(region_file, "w+", encoding="utf8") as region_fasta_out:
        region_fasta_out.write(f">{region_name}\n{seq}\n")
    region_results_file = Path(f"{region_file}.rm.gtf")
    output_file = Path(f"{region_file}.out")
    masked_file = Path(f"{region_file}.masked")
    tbl_file = Path(f"{region_file}.tbl")
    log_file = Path(f"{region_file}.log")
    cat_file = Path(f"{region_file}.cat")
    # Work on a copy: the command list is shared by every slice.
    repeatmasker_cmd = repeatmasker_cmd.copy()
    repeatmasker_cmd.append(str(region_file))
    logger.info(repeatmasker_cmd)
    subprocess.run(repeatmasker_cmd, check=True)
    _create_repeatmasker_gtf(output_file, region_results_file, region_name)
    output_file.unlink()
    region_file.unlink()
    # These side files are not always present, so tolerate their absence.
    masked_file.unlink(missing_ok=True)
    tbl_file.unlink(missing_ok=True)
    log_file.unlink(missing_ok=True)
    cat_file.unlink(missing_ok=True)


def _create_repeatmasker_gtf(  # pylint: disable=too-many-locals
    output_file: Path,
    region_results_file: Path,
    region_name: str,
) -> None:
    """
    Read the RepeatMasker ``.out`` table of a slice and save its hits in gtf format

    Expected layout of a hit line:
    SW    perc perc perc query    position in query    matching repeat       position in repeat
    score div. del. ins. sequence begin end (left)     repeat   class/family begin end (left) ID

    Lines that do not start with a number, or that do not have exactly 15
    columns (after dropping a trailing ``*`` flag), are skipped.

    Args:
        output_file : RepeatMasker ``.out`` file to read.
        region_results_file : GTF file with results per region (written here).
        region_name : Name of the region the slice belongs to.
    """
    hit_pattern = re.compile(r"^\s*\d+\s+")
    with open(output_file, "r", encoding="utf8") as repeatmasker_in, open(
        region_results_file, "w+", encoding="utf8"
    ) as repeatmasker_out:
        repeat_count = 1
        for line in repeatmasker_in:
            if not hit_pattern.search(line):
                continue
            results = line.split()
            if results[-1] == "*":
                results.pop()
            if len(results) != 15:
                continue
            score = results[0]
            start = results[5]
            end = results[6]
            strand = results[8]
            repeat_name = results[9]
            repeat_class = results[10]
            if strand == "+":
                repeat_start = results[11]
                repeat_end = results[12]
            else:
                # Any non "+" hit is reported on the reverse strand; the repeat
                # start is then read from the last position column.
                repeat_start = results[13]
                repeat_end = results[12]
                strand = "-"
            # GTF attributes are `key "value";` pairs: repeat_id needs the
            # separating space and the quotes like every other attribute.
            gtf_line = (
                f"{region_name}\tRepeatMasker\trepeat\t{start}\t{end}\t.\t"
                f'{strand}\t.\trepeat_id "{repeat_count}"; '
                f'repeat_name "{repeat_name}"; repeat_class "{repeat_class}"; '
                f'repeat_start "{repeat_start}"; '
                f'repeat_end "{repeat_end}"; score "{score}";\n'
            )
            repeatmasker_out.write(gtf_line)
            repeat_count += 1


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run RepeatMasker."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    repeatmasker_bin = argschema.fields.String(
        required=False,
        default="RepeatMasker",
        description="RepeatMasker executable path",
    )
    library = argschema.fields.String(
        required=False, default="", description="Custom repeat library"
    )
    repeatmasker_engine = argschema.fields.String(
        required=False, default="rmblast", description="RepeatMasker engine"
    )
    species = argschema.fields.String(
        required=False,
        default="homo",
        description="Species name (used if no library is provided)",
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )


def main() -> None:
    """RepeatMasker's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "repeatmasking.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_repeatmasker(
        mod.args["genome_file"],
        mod.args["output_dir"],
        mod.args["repeatmasker_bin"],
        mod.args["library"],
        mod.args["repeatmasker_engine"],
        mod.args["species"],
        mod.args["num_threads"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/repeat_annotation/trf.html b/_modules/ensembl/tools/anno/repeat_annotation/trf.html new file mode 100644 index 0000000..187602c --- /dev/null +++ b/_modules/ensembl/tools/anno/repeat_annotation/trf.html @@ -0,0 +1,398 @@ + + + + + + + ensembl.tools.anno.repeat_annotation.trf — ensembl-anno 0.1 documentation + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.repeat_annotation.trf

+# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+    Tandem Repeats Finder is a program to locate and display tandem repeats in DNA sequences.
+    Benson G. Tandem repeats finder: a program to analyze DNA sequences.
+    Nucleic Acids Res. 1999; 27(2):573–580. doi:10.1093/nar/27.2.573
+"""
+__all__ = ["run_trf"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+import tempfile
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
def run_trf(
    genome_file: PathLike,
    output_dir: Path,
    num_threads: int = 1,
    trf_bin: Path = Path("trf"),
    match_score: int = 2,
    mismatch_score: int = 5,
    delta: int = 7,
    pm: int = 80,
    pi: int = 10,
    minscore: int = 40,
    maxperiod: int = 500,
) -> None:
    """
    Executes TRF on genomic slices and merges the results in trf_output/annotation.gtf

    Args:
        genome_file : Genome file path.
        output_dir : working directory path.
        num_threads: int, number of threads.
        trf_bin : TRF software path.
        match_score : Matching weight.
        mismatch_score : Mismatching penalty.
        delta : Indel penalty.
        pm : Match probability (whole number).
        pi : Indel probability (whole number).
        minscore : Minimum alignment score to report.
        maxperiod : Maximum period size to report.
    """
    check_exe(trf_bin)
    # Work with absolute paths instead of changing the working directory of the
    # whole process: a chdir makes every relative path (output dir, genome file)
    # point to the wrong place, and each TRF call is already given its own cwd.
    trf_dir = Path(create_dir(output_dir, "trf_output")).resolve()
    genome_file = Path(genome_file).resolve()
    output_file = trf_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "repeat")
        if transcript_count > 0:
            logger.info("Trf gtf file exists, skipping analysis")
            return
    logger.info("Creating list of genomic slices")
    seq_region_to_length = get_seq_region_length(genome_file, 5000)
    slice_ids_per_region = get_slice_id(
        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
    )
    # Suffix appended to the input file name to build the name of the .dat output.
    trf_output_extension = (
        f".{match_score}.{mismatch_score}.{delta}."
        f"{pm}.{pi}.{minscore}.{maxperiod}.dat"
    )
    trf_cmd = [
        str(trf_bin),
        None,  # placeholder, replaced by the slice fasta file in each worker
        str(match_score),
        str(mismatch_score),
        str(delta),
        str(pm),
        str(pi),
        str(minscore),
        str(maxperiod),
        "-d",
        "-h",
    ]
    logger.info("Running TRF")

    def _log_slice_failure(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback is given.
        logger.error("TRF failed on a genomic slice: %s", error)

    with multiprocessing.Pool(int(num_threads)) as pool:
        for slice_id in slice_ids_per_region:
            pool.apply_async(
                _multiprocess_trf,
                args=(
                    trf_cmd,
                    slice_id,
                    trf_dir,
                    trf_output_extension,
                    genome_file,
                ),
                error_callback=_log_slice_failure,
            )
        # Wait for every slice before the context manager terminates the pool.
        pool.close()
        pool.join()
    slice_output_to_gtf(trf_dir, "repeat_id", "trf", True, ".trf.gtf")
    # The per-slice GTF files are merged by now, remove them.
    for gtf_file in trf_dir.glob("*.trf.gtf"):
        gtf_file.unlink()
def _multiprocess_trf(
    trf_cmd: List[str],
    slice_id: List[str],
    trf_dir: Path,
    trf_output_extension: str,
    genome_file: Path,
) -> None:
    """
    Run TRF on multiprocess on genomic slices

    Args:
        trf_cmd: TRF command to execute; item 1 is replaced by the slice file.
        slice_id: Slice Id to run TRF on (region name, start, end).
        trf_dir : TRF output dir.
        trf_output_extension: Suffix TRF appends to the input name for its output.
        genome_file : Genome file.
    """
    region_name, start, end = slice_id
    logger.info(
        "Processing slice to find tandem repeats with TRF:%s:%s:%s",
        region_name,
        start,
        end,
    )
    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, trf_dir)
    slice_name = f"{region_name}.rs{start}.re{end}"
    with tempfile.TemporaryDirectory(dir=trf_dir) as tmpdirname:
        # tmpdirname already contains trf_dir; joining it to trf_dir again would
        # duplicate the prefix when trf_dir is relative. Make it absolute because
        # TRF is started with this directory as its working directory.
        tmp_dir = Path(tmpdirname).resolve()
        slice_file = tmp_dir / f"{slice_name}.fa"
        with open(slice_file, "w+", encoding="utf8") as region_out:
            region_out.write(f">{region_name}\n{seq}\n")
        region_results = Path(trf_dir) / f"{slice_name}.trf.gtf"
        # TRF writes to the current dir, so it is run from the temporary dir
        output_file = Path(f"{slice_file}{trf_output_extension}")
        trf_cmd = trf_cmd.copy()
        trf_cmd[1] = str(slice_file)
        logger.info("trf_cmd: %s", trf_cmd)
        # NOTE(review): the exit status is deliberately not checked, presumably
        # because TRF does not return 0 on success - confirm.
        subprocess.run(trf_cmd, cwd=tmp_dir)  # pylint:disable=subprocess-run-check
        _create_trf_gtf(output_file, region_results, region_name)
        slice_file.unlink()
        output_file.unlink()


def _create_trf_gtf(
    output_file: Path,
    region_results: Path,
    region_name: str,
) -> None:
    """
    Read the TRF ``.dat`` output of a slice and save the kept repeats in gtf format

    TRF output format:
    cols 1+2:  Indices of the repeat relative to the start of the sequence
    col 3:     Period size of the repeat
    col 4:     Number of copies aligned with the consensus pattern
    col 5:     Size of consensus pattern (may differ slightly from the period size)
    col 6:     Percent of matches between adjacent copies overall
    col 7:     Percent of indels between adjacent copies overall
    col 8:     Alignment score
    cols 9-12: Percent composition for each of the four nucleotides
    col 13:    Entropy measure based on percent composition
    col 14:    Consensus sequence
    col 15:    Repeat sequence

    A repeat is kept when either
      * score < 50, matches >= 80%, more than 2 copies and period < 10, or
      * score >= 50, matches >= 70% and at least 2 copies.

    Args:
        output_file : TRF ``.dat`` file to read.
        region_results : GTF file with results per region (written here).
        region_name : Name of the region the slice belongs to.
    """
    hit_pattern = re.compile(r"^\d+")
    with open(output_file, "r", encoding="utf8") as trf_in, open(
        region_results, "w+", encoding="utf8"
    ) as trf_out:
        repeat_count = 1
        for line in trf_in:
            if not hit_pattern.search(line):
                continue
            results = line.split()
            if len(results) != 15:
                continue
            start = results[0]
            end = results[1]
            period = float(results[2])
            copy_number = float(results[3])
            percent_matches = float(results[5])
            score = float(results[7])
            repeat_consensus = results[13]
            if (  # pylint: disable=too-many-boolean-expressions
                score < 50
                and percent_matches >= 80
                and copy_number > 2
                and period < 10
            ) or (copy_number >= 2 and percent_matches >= 70 and score >= 50):
                gtf_line = (
                    f"{region_name}\tTRF\trepeat\t{start}\t{end}\t.\t+\t.\t"
                    f'repeat_id "{repeat_count}"; score "{score}"; '
                    f'repeat_consensus "{repeat_consensus}";\n'
                )
                trf_out.write(gtf_line)
                repeat_count += 1


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run TRF."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    trf_bin = argschema.fields.String(
        required=False,
        default="trf",
        description="TRF executable path",
    )
    match_score = argschema.fields.Integer(
        required=False, default=2, description="Matching weight"
    )
    mismatch_score = argschema.fields.Integer(
        required=False, default=5, description="Mismatching penalty"
    )
    delta = argschema.fields.Integer(
        required=False, default=7, description="Indel penalty"
    )
    pm = argschema.fields.Integer(
        required=False, default=80, description="Match probability"
    )
    pi = argschema.fields.Integer(
        required=False, default=10, description="Indel probability"
    )
    minscore = argschema.fields.Integer(
        required=False, default=40, description="Minimum alignment score to report"
    )
    maxperiod = argschema.fields.Integer(
        required=False, default=500, description="Maximum period size to report"
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )


def main() -> None:
    """TRF's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "trf.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_trf(
        mod.args["genome_file"],
        mod.args["output_dir"],
        mod.args["num_threads"],
        mod.args["trf_bin"],
        mod.args["match_score"],
        mod.args["mismatch_score"],
        mod.args["delta"],
        mod.args["pm"],
        mod.args["pi"],
        mod.args["minscore"],
        mod.args["maxperiod"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/simple_feature_annotation/cpg.html b/_modules/ensembl/tools/anno/simple_feature_annotation/cpg.html new file mode 100644 index 0000000..015bc80 --- /dev/null +++ b/_modules/ensembl/tools/anno/simple_feature_annotation/cpg.html @@ -0,0 +1,363 @@ + + + + + + + ensembl.tools.anno.simple_feature_annotation.cpg — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.simple_feature_annotation.cpg

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Set of discriminant functions that can recognize structural and compositional features
+such as CpG islands, promoter regions and first splice-donor sites.
+Davuluri RV, Grosse I, Zhang MQ: Computational identification of promoters and
+first exons in the human genome. Nat Genet. 2001, 29(4):412-417. [PMID: 11726928]
+"""
+__all__ = ["run_cpg"]
+import logging
+import logging.config
+import multiprocessing
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from tempfile import TemporaryDirectory
+from typing import List,Union
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
def run_cpg(
    genome_file: PathLike,
    output_dir: Path,
    cpg_bin: Path = Path("cpg_lh"),
    cpg_min_length: int = 400,
    cpg_min_gc_content: int = 50,
    cpg_min_oe: float = 0.6,
    num_threads: int = 1,
) -> None:
    """
    Run CpG islands on genomic slices and merge the results in cpg_output/annotation.gtf

    Args:
        genome_file : Genome file path.
        output_dir : Working directory path
        cpg_bin : CpG software path.
        cpg_min_length : Min length of CpG islands
        cpg_min_gc_content : Min GC frequency percentage
        cpg_min_oe : Min ratio of the observed to expected number of CpG (CpGo/e)
        num_threads: int, number of threads.
    """
    check_exe(cpg_bin)
    cpg_dir = create_dir(output_dir, "cpg_output")
    output_file = cpg_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "simple_feature")
        if transcript_count > 0:
            logger.info("Cpg gtf file exists")
            return
    logger.info("Creating list of genomic slices")
    seq_region_to_length = get_seq_region_length(genome_file, 5000)
    slice_ids_per_region = get_slice_id(
        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
    )
    logger.info("Running CpG")

    def _log_slice_failure(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback is given.
        logger.error("CpG failed on a genomic slice: %s", error)

    with multiprocessing.Pool(int(num_threads)) as pool:
        for slice_id in slice_ids_per_region:
            pool.apply_async(
                _multiprocess_cpg,
                args=(
                    cpg_bin,
                    slice_id,
                    genome_file,
                    cpg_dir,
                    cpg_min_length,
                    cpg_min_gc_content,
                    cpg_min_oe,
                ),
                error_callback=_log_slice_failure,
            )
        # Wait for every slice before the context manager terminates the pool.
        pool.close()
        pool.join()
    slice_output_to_gtf(cpg_dir, "feature_id", "cpg", True, ".cpg.gtf")
    # The per-slice GTF files are merged by now, remove them.
    for gtf_file in cpg_dir.glob("*.cpg.gtf"):
        gtf_file.unlink()
def _multiprocess_cpg(
    cpg_bin: Path,
    slice_id: List[str],
    genome_file: Path,
    cpg_dir: Path,
    cpg_min_length: int = 400,
    cpg_min_gc_content: int = 50,
    cpg_min_oe: float = 0.6,
) -> None:
    """
    Annotation of CpG islands on multiprocess on genomic slices

    Args:
        cpg_bin: CpG software path.
        slice_id: Slice id to run CpG on (region name, start, end).
        genome_file : Genome file.
        cpg_dir : Output dir.
        cpg_min_length : Min length of CpG islands
        cpg_min_gc_content : Min GC frequency percentage
        cpg_min_oe : Min ratio of the observed to expected number of CpG (CpGo/e)
    """
    region_name, start, end = slice_id
    logger.info(
        "Processing slice to find CpG islands with cpg_lh: %s:%s:%s",
        region_name,
        start,
        end,
    )
    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, cpg_dir)
    slice_name = f"{region_name}.rs{start}.re{end}"
    slice_file = cpg_dir / f"{slice_name}.fa"
    with open(slice_file, "w+", encoding="utf8") as region_out:
        region_out.write(f">{region_name}\n{seq}\n")
    # slice_file already contains cpg_dir: joining it to cpg_dir again would
    # duplicate the directory whenever cpg_dir is a relative path.
    region_results = Path(f"{slice_file}.cpg.gtf")
    output_file = Path(f"{slice_file}.cpg")
    cpg_cmd = [str(cpg_bin), str(slice_file)]
    # The program reports its results on stdout, capture them in a file.
    with open(output_file, "w+", encoding="utf8") as cpg_out:
        subprocess.run(cpg_cmd, stdout=cpg_out, check=True)
    _create_cpg_gtf(
        output_file,
        region_results,
        region_name,
        cpg_min_length,
        cpg_min_gc_content,
        cpg_min_oe,
    )
    slice_file.unlink()
    output_file.unlink()


def _create_cpg_gtf(
    output_file: Path,
    region_results: Path,
    region_name: str,
    cpg_min_length: int = 400,
    cpg_min_gc_content: int = 50,
    cpg_min_oe: float = 0.6,
) -> None:
    """
    Read the CpG program output of a slice and save the kept islands in gtf format

    Only lines starting with the region name are parsed; an island is written
    when its length, GC content and observed/expected ratio all reach the
    given minimum values.

    Args:
        output_file : CpG program output to read.
        region_results : GTF file with the results per region (written here).
        region_name : Name of the region the slice belongs to.
        cpg_min_length : Min length of CpG islands
        cpg_min_gc_content : Min GC frequency percentage
        cpg_min_oe : Min ratio of the observed to expected number of CpG (CpGo/e)
    """
    min_length = int(cpg_min_length)
    min_gc_content = int(cpg_min_gc_content)
    min_oe = float(cpg_min_oe)
    with open(output_file, "r", encoding="utf8") as cpg_in, open(
        region_results, "w+", encoding="utf8"
    ) as cpg_out:
        feature_count = 1
        for line in cpg_in:
            # Plain prefix test: region names may contain regex metacharacters.
            if not line.startswith(region_name):
                continue
            results = line.split()
            start = int(results[1])
            end = int(results[2])
            length = end - start + 1
            score = float(results[3])
            gc_content = float(results[6])
            oe_score_str = results[7]
            oe_score: Union[float, int]
            # An undefined ratio is treated as the lowest possible value.
            if oe_score_str in ("-", "inf"):
                oe_score = 0
            else:
                oe_score = float(oe_score_str)
            if (
                length >= min_length
                and gc_content >= min_gc_content
                and oe_score >= min_oe
            ):
                gtf_line = (
                    f"{region_name}\tCpG\tsimple_feature\t{start}\t"
                    f'{end}\t.\t+\t.\tfeature_id "{feature_count}"; score "{score}";\n'
                )
                cpg_out.write(gtf_line)
                # Give every island of the slice its own id.
                feature_count += 1


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run CpG software."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    cpg_bin = argschema.fields.String(
        required=False,
        default="cpg_lh",
        description="CpG executable path",
    )
    # Numeric defaults are given as numbers so they match the field types.
    cpg_min_length = argschema.fields.Integer(
        required=False,
        default=400,
        description="Min length of CpG islands",
    )
    cpg_min_gc_content = argschema.fields.Integer(
        required=False,
        default=50,
        description="Min GC frequency percentage",
    )
    cpg_min_oe = argschema.fields.Float(
        required=False,
        default=0.6,
        description="Min ratio of the observed to expected number of CpG (CpGo/e)",
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )


def main() -> None:
    """CpG's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "cpg.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_cpg(
        mod.args["genome_file"],
        mod.args["output_dir"],
        mod.args["cpg_bin"],
        mod.args["cpg_min_length"],
        mod.args["cpg_min_gc_content"],
        mod.args["cpg_min_oe"],
        mod.args["num_threads"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/simple_feature_annotation/eponine.html b/_modules/ensembl/tools/anno/simple_feature_annotation/eponine.html new file mode 100644 index 0000000..23a3e09 --- /dev/null +++ b/_modules/ensembl/tools/anno/simple_feature_annotation/eponine.html @@ -0,0 +1,348 @@ + + + + + + + ensembl.tools.anno.simple_feature_annotation.eponine — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.simple_feature_annotation.eponine

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Eponine is a probabilistic method for detecting transcription start sites (TSS)
+in mammalian genomic sequence, with good specificity and excellent positional accuracy.
+Down TA, Hubbard TJ. Computational detection and location of transcription start sites
+in mammalian genomic DNA. Genome Res. 2002 Mar;12(3):458-61. doi: 10.1101/gr.216102.
+PMID: 11875034; PMCID: PMC155284.
+"""
+__all__ = ["run_eponine"]
+
+import logging
+import logging.config
+import multiprocessing
+import os
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from tempfile import TemporaryDirectory
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    check_file,
+    create_dir,
+    check_gtf_content,
+    get_sequence,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+)
+
# Use the module's dotted name (not the literal string "__name__") so the
# logger sits in the package hierarchy and picks up its handlers and levels.
logger = logging.getLogger(__name__)
+
+
+
def run_eponine(
    genome_file: PathLike,
    output_dir: Path,
    num_threads: int = 1,
    java_bin: Path = Path("java"),
    eponine_bin: Path = Path(
        "/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/opt/eponine/libexec/eponine-scan.jar"
    ),
    eponine_threshold: float = 0.999,
) -> None:
    """
    Run Eponine on genomic slices and merge the results in eponine_output/annotation.gtf

    Args:
        genome_file : Genome file path.
        output_dir : Working directory path.
        num_threads: Number of threads.
        java_bin : Java path.
        eponine_bin : Eponine software path
        eponine_threshold : Value passed to Eponine's ``-threshold`` option.
    """
    check_file(eponine_bin)
    check_exe(java_bin)
    eponine_dir = create_dir(output_dir, "eponine_output")
    output_file = eponine_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "simple_feature")
        if transcript_count > 0:
            logger.info("Eponine gtf file exists, skipping analysis")
            return
    logger.info("Creating list of genomic slices")
    seq_region_to_length = get_seq_region_length(genome_file, 5000)
    slice_ids_per_region = get_slice_id(
        seq_region_to_length, slice_size=1000000, overlap=0, min_length=5000
    )

    # The slice fasta file is appended after "-seq" by each worker.
    eponine_cmd = [
        str(java_bin),
        "-jar",
        str(eponine_bin),
        "-threshold",
        str(eponine_threshold),
        "-seq",
    ]
    logger.info("Running Eponine")

    def _log_slice_failure(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback is given.
        logger.error("Eponine failed on a genomic slice: %s", error)

    with multiprocessing.Pool(int(num_threads)) as pool:
        for slice_id in slice_ids_per_region:
            pool.apply_async(
                _multiprocess_eponine,
                args=(
                    eponine_cmd,
                    slice_id,
                    eponine_dir,
                    Path(genome_file),
                ),
                error_callback=_log_slice_failure,
            )
        # Wait for every slice before the context manager terminates the pool.
        pool.close()
        pool.join()
    slice_output_to_gtf(eponine_dir, "feature_id", "eponine", True, ".epo.gtf")
    # The per-slice GTF files are merged by now, remove them.
    for gtf_file in eponine_dir.glob("*.epo.gtf"):
        gtf_file.unlink()
def _multiprocess_eponine(
    eponine_cmd: List[str],
    slice_id: List[str],
    eponine_dir: Path,
    genome_file: Path,
) -> None:
    """
    Run Eponine on multiprocess on genomic slices

    Args:
        eponine_cmd: Eponine command to execute; the slice file is appended to it.
        slice_id: Slice to run Eponine on (region name, start, end).
        eponine_dir : Eponine output directory path.
        genome_file : Genome file.
    """
    region_name, start, end = slice_id
    logger.info(
        "Processing slice to find transcription start sites with Eponine: %s:%s:%s",
        region_name,
        start,
        end,
    )
    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, eponine_dir)
    slice_name = f"{region_name}.rs{start}.re{end}"
    slice_file = eponine_dir / f"{slice_name}.fa"
    with open(slice_file, "w+", encoding="utf8") as region_out:
        region_out.write(f">{region_name}\n{seq}\n")
    region_results = eponine_dir / f"{slice_name}.epo.gtf"
    output_file = Path(f"{slice_file}.epo")
    # Work on a copy: the command list is shared by every slice.
    eponine_cmd = eponine_cmd.copy()
    eponine_cmd.append(str(slice_file))
    logger.info(eponine_cmd)
    # Eponine reports its results on stdout, capture them in a file.
    with open(output_file, "w+", encoding="utf8") as eponine_out:
        subprocess.run(eponine_cmd, stdout=eponine_out, check=True)
    _create_eponine_gtf(output_file, region_results, region_name)
    slice_file.unlink()
    output_file.unlink()


def _create_eponine_gtf(
    output_file: Path,
    region_results: Path,
    region_name: str,
) -> None:
    """
    Read the Eponine output of a slice and save its features in gtf format

    Only lines starting with the region name are parsed.

    Args:
        output_file: Eponine output to read.
        region_results: GTF file with the results per region (written here).
        region_name: Name of the region the slice belongs to.
    """
    with open(output_file, "r", encoding="utf8") as eponine_in, open(
        region_results, "w+", encoding="utf8"
    ) as eponine_out:
        feature_count = 1
        for line in eponine_in:
            # Plain prefix test: region names may contain regex metacharacters.
            if not line.startswith(region_name):
                continue
            results = line.split()
            start = int(results[3])
            end = int(results[4])
            score = float(results[5])
            strand = results[6]
            logger.info(results)
            # There's a one base offset on the reverse strand
            if strand == "-":
                start -= 1
                end -= 1

            gtf_line = (
                f"{region_name}\tEponine\tsimple_feature\t"
                f"{start}\t{end}\t.\t{strand}\t.\t"
                f'feature_id "{feature_count}"; score "{score}";\n'
            )
            eponine_out.write(gtf_line)
            feature_count += 1


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run Eponine."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )
    java_bin = argschema.fields.String(
        required=False,
        default="java",
        description="Java executable path",
    )
    eponine_bin = argschema.fields.String(
        required=False,
        default="/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/opt/eponine/libexec/eponine-scan.jar",  # pylint:disable=line-too-long
        description="Eponine jar file path",
    )
    # The misspelled field name is the public argument name, so it is kept.
    eponine_threashold = argschema.fields.Float(
        required=False, default=0.999, description="Eponine threshold"
    )


def main() -> None:
    """Eponine's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "eponine.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_eponine(
        mod.args["genome_file"],
        mod.args["output_dir"],
        mod.args["num_threads"],
        Path(mod.args["java_bin"]),
        Path(mod.args["eponine_bin"]),
        mod.args["eponine_threashold"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/snc_rna_annotation/trnascan.html b/_modules/ensembl/tools/anno/snc_rna_annotation/trnascan.html new file mode 100644 index 0000000..fe91f78 --- /dev/null +++ b/_modules/ensembl/tools/anno/snc_rna_annotation/trnascan.html @@ -0,0 +1,399 @@ + + + + + + + ensembl.tools.anno.snc_rna_annotation.trnascan — ensembl-anno 0.1 documentation + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.snc_rna_annotation.trnascan

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+tRNAscan-SE identifies 99-100% of transfer RNA genes in DNA sequence while
+giving less than one false positive per 15 gigabases.
+Lowe TM, Eddy SR: tRNAscan-SE: a program for improved detection of transfer
+RNA genes in genomic sequence.
+Nucleic Acids Res. 1997, 25(5):955-64. [PMID: 9023104]
+"""
+__all__ = ["run_trnascan"]
+
+import logging
+import logging.config
+import multiprocessing
+from os import PathLike
+from pathlib import Path
+import re
+import subprocess
+from typing import List
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    check_file,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+    get_slice_id,
+    slice_output_to_gtf,
+    get_sequence,
+)
+
+logger = logging.getLogger(__name__)
+
+
+
def run_trnascan(
    genome_file: PathLike,
    output_dir: Path,
    trnascan_bin: Path = Path("tRNAscan-SE"),
    trnascan_filter: Path = Path("EukHighConfidenceFilter"),
    num_threads: int = 1,
) -> None:
    """
    Executes tRNAscan-SE on genomic slices.

    The genome is split into slices, each slice is processed by
    ``_multiprocess_trnascan`` in a process pool and the per-slice GTF files
    are merged by ``slice_output_to_gtf``. The analysis is skipped when
    ``trnascan_output/annotation.gtf`` already contains transcripts.

    Args:
        genome_file : Genome file path.
        output_dir : working directory path.
        trnascan_bin : tRNAscan-SE software path.
        trnascan_filter : tRNAscan-SE filter set path.
        num_threads: int, number of threads.
    """
    check_exe(trnascan_bin)
    check_file(trnascan_filter)
    trnascan_dir = create_dir(output_dir, "trnascan_output")
    output_file = trnascan_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "transcript")
        if transcript_count > 0:
            logger.info("Trnascan gtf file exists, skipping analysis")
            return
    logger.info("Creating list of genomic slices")
    seq_region_to_length = get_seq_region_length(genome_file, 5000)
    slice_ids_per_region = get_slice_id(seq_region_to_length, 1000000, 0, 5000)
    # The None placeholders (input, output and structure files) are filled in per slice
    trnascan_cmd = [
        str(trnascan_bin),
        None,
        "-o",
        None,
        "-f",
        None,
        "-H",  # show both primary and secondary structure components to covariance model bit scores
        "-q",  # quiet mode
        "--detail",
        "-Q",
    ]

    def _log_slice_failure(error: BaseException) -> None:
        # apply_async discards worker exceptions unless a callback reports them
        logger.error("tRNAscan-SE failed on a genomic slice: %s", error)

    logger.info("Running tRNAscan-SE")
    with multiprocessing.Pool(num_threads) as pool:
        for slice_id in slice_ids_per_region:
            pool.apply_async(
                _multiprocess_trnascan,
                args=(
                    trnascan_cmd,
                    slice_id,
                    genome_file,
                    trnascan_filter,
                    trnascan_dir,
                ),
                error_callback=_log_slice_failure,
            )
        # Wait for every slice before the pool is torn down by the context manager
        pool.close()
        pool.join()
    slice_output_to_gtf(
        output_dir=trnascan_dir, unique_ids=True, file_extension=".trna.gtf"
    )
    for gtf_file in trnascan_dir.glob("*.trna.gtf"):
        gtf_file.unlink()
def _multiprocess_trnascan(
    trnascan_cmd: List[str],
    slice_id: List[str],
    genome_file: Path,
    trnascan_filter: Path,
    trnascan_dir: Path,
) -> None:
    """
    Run tRNAscan-SE and its filter on a single genomic slice (pool worker).

    The slice sequence is written to a temporary FASTA file, tRNAscan-SE is run
    on it and, when it reports anything, the filter is applied and its output is
    converted into ``{region}.rs{start}.re{end}.trna.gtf``. Every intermediate
    file is removed afterwards, also when one of the steps fails.

    Args:
        trnascan_cmd: tRNAscan-SE command template; positions 1, 3 and 5 are
            filled with the slice specific paths on a copy of the list.
        slice_id: Slice Id to run tRNAscan-SE on, as ``[region_name, start, end]``.
        genome_file : Genome file.
        trnascan_filter: tRNAscan-SE filter set.
        trnascan_dir : tRNAscan-SE output dir.

    Raises:
        subprocess.CalledProcessError: If tRNAscan-SE exits with a non-zero status.
    """
    region_name, start, end = slice_id
    logger.info(
        "Processing slice to find tRNAs using tRNAscan-SE:%s:%s:%s",
        region_name,
        start,
        end,
    )
    seq = get_sequence(region_name, int(start), int(end), 1, genome_file, trnascan_dir)
    slice_name = f"{region_name}.rs{start}.re{end}"
    slice_file = trnascan_dir / f"{slice_name}.fa"
    # tRNAscan-SE output
    region_results = trnascan_dir / f"{slice_name}.trna.gtf"
    output_file = Path(f"{slice_file}.trna")
    ss_output_file = Path(f"{output_file}.ss")
    # filtering
    filter_prefix_file = f"{slice_name}.filt"
    filter_output_file = trnascan_dir / f"{filter_prefix_file}.out"
    filter_log_file = trnascan_dir / f"{filter_prefix_file}.log"
    filter_ss_file = trnascan_dir / f"{filter_prefix_file}.ss"
    intermediate_files = (
        output_file,
        slice_file,
        ss_output_file,
        # NOTE(review): relative to the current working directory, as in the
        # original code - confirm whether the filter really writes this file
        Path(filter_prefix_file),
        filter_log_file,
        filter_ss_file,
        filter_output_file,
    )
    # Fill the placeholders on a copy so the caller's template is not modified
    slice_cmd = list(trnascan_cmd)
    slice_cmd[1], slice_cmd[3], slice_cmd[5] = (
        str(slice_file),
        str(output_file),
        str(ss_output_file),
    )
    try:
        with open(slice_file, "w+", encoding="utf8") as region_out:
            region_out.write(f">{region_name}\n{seq}\n")
        logger.info("tRNAscan-SE command: %s", " ".join(slice_cmd))
        subprocess.run(slice_cmd, check=True)
        # If the trnascan output is empty there is no need to go on with filtering
        if not output_file.exists() or output_file.stat().st_size == 0:
            return

        filter_cmd = [
            str(trnascan_filter),
            "--result",  # tRNAscan-SE output file used as input
            str(output_file),
            "--ss",  # tRNAscan-SE secondary structure file used as input
            str(ss_output_file),
            "--output",
            str(trnascan_dir),
            "--prefix",
            str(filter_prefix_file),
        ]
        logger.info("tRNAscan-SE filter command: %s", " ".join(filter_cmd))
        subprocess.run(filter_cmd)  # pylint:disable=subprocess-run-check
        if not filter_output_file.exists():
            logger.warning(
                "tRNAscan-SE filter produced no output for slice %s, no GTF written",
                slice_name,
            )
            return
        _create_trnascan_gtf(region_results, filter_output_file, region_name)
    finally:
        for intermediate_file in intermediate_files:
            intermediate_file.unlink(missing_ok=True)


def _create_trnascan_gtf(
    region_results: Path, filter_output_file: Path, region_name: str
) -> None:
    """
    Convert the filtered tRNAscan-SE output of one slice into GTF records.

    Only lines starting with ``region_name`` are treated as results. Each result
    produces one transcript and one exon record; the strand is "-" when the
    reported start is greater than the end. The biotype is "tRNA" when the line
    contains "high confidence set" and "tRNA_pseudogene" otherwise.

    Args:
        region_results : GTF file with the results per region (written).
        filter_output_file : Filter output file for the region (read).
        region_name : Name of the sequence region the slice belongs to.

    tRNAscan-SE output format:
    col0: GtRNAdb Gene Symbol - gene ID in corresponding genome
    col1: tRNAscan-SE ID - tRNA ID in tRNAscan-SE prediction results
    col2-3: Locus - Genomic coordinates of predicted gene
    col4: Isotype (from Anticodon) - tRNA isotype determined by anticodon
    col5: Anticodon - anticodon of predicted tRNA gene
    col6-7: Intron boundaries
    col8: General tRNA Model Score - covariance model bit score from tRNAscan-SE results
    col9: Best Isotype Model - best matching (highest scoring) isotype determined
    by isotype-specific covariance model classification
    col10-11-12: Anticodon and Isotype Model Agreement - consistency between anticodon
    from predicted gene sequence and best isotype model
    col13: Features - special gene features that may include gene set categorization,
    number of introns, possible pseudogenes, possible truncation, or base-pair mismatches
    """
    with open(filter_output_file, "r", encoding="utf8") as trna_in, open(
        region_results, "w+", encoding="utf8"
    ) as trna_out:
        gene_counter = 1
        for line in trna_in:
            # Plain prefix test: region names may contain regex metacharacters
            if not line.startswith(region_name):
                continue
            results = line.split()
            if len(results) < 4:
                logger.warning("Skipping malformed tRNAscan-SE line: %s", line.rstrip())
                continue
            start = int(results[2])
            end = int(results[3])
            strand = "+"
            if start > end:
                # Reverse strand hits are reported with swapped coordinates
                strand = "-"
                start, end = end, start
            biotype = "tRNA" if "high confidence set" in line else "tRNA_pseudogene"
            transcript_string = (
                f"{region_name}\ttRNAscan\ttranscript\t{start}\t{end}\t.\t"
                f'{strand}\t.\tgene_id "{gene_counter}"; transcript_id '
                f'"{gene_counter}"; biotype "{biotype}";\n'
            )
            exon_string = (
                f"{region_name}\ttRNAscan\texon\t{start}\t{end}\t.\t"
                f'{strand}\t.\tgene_id "{gene_counter}"; transcript_id '
                f'"{gene_counter}"; exon_number "1"; biotype "{biotype}";\n'
            )
            trna_out.write(transcript_string)
            trna_out.write(exon_string)
            gene_counter += 1


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run tRNAscan-SE."""

    genome_file = argschema.fields.InputFile(
        required=True, description="Genome file path"
    )
    trnascan_bin = argschema.fields.String(
        required=False,
        default="tRNAscan-SE",
        description="tRNAscan-SE executable path",
    )
    trnascan_filter = argschema.fields.String(
        required=False,
        default="/hps/software/users/ensembl/ensw/C8-MAR21-sandybridge/linuxbrew/bin/EukHighConfidenceFilter",  # pylint:disable=line-too-long
        description="tRNAscan-SE filter path",
    )
    output_dir = argschema.fields.OutputDir(
        required=True, description="Output directory path"
    )
    num_threads = argschema.fields.Integer(
        required=False, default=1, description="Number of threads"
    )


def main() -> None:
    """tRNAscan-SE's entry-point: parse the arguments, configure logging and run the analysis."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "trnascan.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_trnascan(
        mod.args["genome_file"],
        mod.args["output_dir"],
        Path(mod.args["trnascan_bin"]),
        Path(mod.args["trnascan_filter"]),
        mod.args["num_threads"],
    )


if __name__ == "__main__":
    main()
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/minimap.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/minimap.html new file mode 100644 index 0000000..190d992 --- /dev/null +++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/minimap.html @@ -0,0 +1,365 @@ + + + + + + + ensembl.tools.anno.transcriptomic_annotation.minimap — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.transcriptomic_annotation.minimap

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Minimap2 is a pairwise sequence alignment algorithm designed for efficiently comparing nucleotide sequences.
+The algorithm uses a versatile indexing strategy to quickly find approximate matches between sequences, 
+allowing it to efficiently align long sequences against reference genomes or other sequences.
+
+Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics, 34(18), 3094-3100.
+"""
+
+__all__ = ["run_minimap2"]
+import logging
+import logging.config
+from pathlib import Path
+import subprocess
+from typing import List
+import argschema
+
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+
+
def run_minimap2(
    output_dir: Path,
    long_read_fastq_dir: Path,
    genome_file: Path,
    minimap2_bin: Path = Path("minimap2"),
    paftools_bin: Path = Path("paftools.js"),
    max_intron_length: int = 100000,
    num_threads: int = 1,
) -> None:
    """
    Run Minimap2 to align long read data against genome file.
    Default Minimap set for PacBio data.

    Every ``*.fastq``/``*.fq`` file found recursively in ``long_read_fastq_dir``
    is aligned, the SAM output is converted to BED with paftools and all BED
    files are finally merged into ``minimap2_output/annotation.gtf``. The
    analysis is skipped when that GTF file already contains transcripts.

    Args:
        output_dir : Working directory path.
        long_read_fastq_dir : Long read directory path.
        genome_file : Genome file path.
        minimap2_bin : Software path.
        paftools_bin : Software path.
        max_intron_length : The maximum intron size for alignments. Defaults to 100000.
        num_threads : Number of available threads.

    Raises:
        IndexError: If no fastq file is found in ``long_read_fastq_dir``.
    """
    check_exe(minimap2_bin)
    check_exe(paftools_bin)
    minimap2_dir = create_dir(output_dir, "minimap2_output")

    logging.info("Skip analysis if the gtf file already exists")
    output_file = minimap2_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "transcript")
        if transcript_count > 0:
            logging.info("Minimap2 gtf file exists, skipping analysis")
            return
    # The command line entry-point passes plain strings, normalise before using `.name`
    genome_file = Path(genome_file)
    minimap2_index_file = minimap2_dir / f"{genome_file.name}.mmi"
    file_types = ("*.fastq", "*.fq")
    fastq_file_list = [
        path for file_type in file_types for path in Path(long_read_fastq_dir).rglob(file_type)
    ]
    if not fastq_file_list:
        raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{long_read_fastq_dir}")

    if not minimap2_index_file.exists():
        logging.info("Did not find an index file for minimap2. Will create now")
        try:
            # check=True is what makes the CalledProcessError handler reachable;
            # a failure is logged and the run carries on as before
            subprocess.run(
                [
                    str(minimap2_bin),
                    "-t",
                    str(num_threads),
                    "-d",
                    str(minimap2_index_file),
                    str(genome_file),
                ],
                check=True,
            )
        except subprocess.CalledProcessError as e:
            logging.error("An error occurred while creating minimap2 index: %s", e)
        except OSError as e:
            logging.error("An OS error occurred: %s", e)

    logging.info("Running minimap2 on the files in the long read fastq dir")
    for fastq_file in fastq_file_list:
        sam_file = minimap2_dir / f"{fastq_file.name}.sam"
        bed_file = minimap2_dir / f"{fastq_file.name}.bed"
        logging.info("Processing %s", fastq_file)
        with open(bed_file, "w+", encoding="utf8") as bed_file_out:
            alignment = subprocess.run(  # pylint:disable=subprocess-run-check
                [
                    str(minimap2_bin),
                    "-G",
                    str(max_intron_length),
                    "-t",
                    str(num_threads),
                    "--cs",
                    "--secondary=no",
                    "-ax",
                    "splice",
                    "-u",
                    "b",
                    str(minimap2_index_file),
                    str(fastq_file),
                    "-o",
                    str(sam_file),
                ]
            )
            if alignment.returncode != 0:
                logging.error(
                    "minimap2 exited with status %s for %s", alignment.returncode, fastq_file
                )
            logging.info("Creating bed file from SAM")
            conversion = subprocess.run(  # pylint:disable=subprocess-run-check
                [str(paftools_bin), "splice2bed", str(sam_file)], stdout=bed_file_out
            )
            if conversion.returncode != 0:
                logging.error(
                    "paftools exited with status %s for %s", conversion.returncode, sam_file
                )

    bed_to_gtf(minimap2_dir)

    logging.info("Completed running minimap2")
def bed_to_gtf(output_dir: Path) -> None:
    """
    Convert every bed file in the working directory into a single gtf file.

    Each BED line becomes one transcript record followed by one exon record per
    block. Genes are numbered across all BED files, ``gene_id`` and
    ``transcript_id`` share the same ``minimap_{n}`` identifier. Lines with
    fewer than 12 columns, or without any valid block, are skipped.

    Args:
        output_dir : Working directory path; ``annotation.gtf`` is written here.
    """
    gtf_file_path = output_dir / "annotation.gtf"
    with open(gtf_file_path, "w+", encoding="utf8") as gtf_out:
        gene_id = 1
        for bed_file in output_dir.glob("*.bed"):
            logging.info("Converting bed to GTF: %s", str(bed_file))
            with open(bed_file, "r", encoding="utf8") as bed_in:
                for line in bed_in:
                    elements = line.rstrip().split("\t")
                    if len(elements) < 12:
                        if line.strip():
                            logging.warning("Skipping malformed bed line: %s", line.rstrip())
                        continue
                    seq_region_name = elements[0]
                    offset = int(elements[1])
                    strand = elements[5]
                    # sizes and relative starts of the individual exon blocks
                    block_sizes = [size for size in elements[10].split(",") if size]
                    block_starts = [start for start in elements[11].split(",") if start]
                    exons = bed_block_to_exons(block_sizes, block_starts, offset)
                    if not exons:
                        # Nothing to report, avoid writing a transcript without coordinates
                        continue
                    # Coordinates come back as strings: compare them numerically,
                    # a string comparison would rank "99" above "200"
                    transcript_start = min(int(exon_coords[0]) for exon_coords in exons)
                    transcript_end = max(int(exon_coords[1]) for exon_coords in exons)
                    records = [
                        f"{seq_region_name}\tminimap\ttranscript\t{transcript_start}\t"
                        f"{transcript_end}\t.\t{strand}\t.\t"
                        f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}";\n'
                    ]
                    for exon_number, exon_coords in enumerate(exons, start=1):
                        records.append(
                            f"{seq_region_name}\tminimap\texon\t{exon_coords[0]}\t"
                            f"{exon_coords[1]}\t.\t{strand}\t.\t"
                            f'gene_id "minimap_{gene_id}"; transcript_id "minimap_{gene_id}"; '
                            f'exon_number "{exon_number}";\n'
                        )
                    gtf_out.writelines(records)
                    gene_id += 1


def bed_block_to_exons(block_sizes: List, block_starts: List, offset: int) -> List:
    """
    Turn BED blocks into 1-based inclusive exon coordinates.

    Args:
        block_sizes : Block feature size.
        block_starts : Block feature starts, relative to ``offset``.
        offset : Feature offset (0-based start of the BED feature).

    Returns:
        List of ``[start, end]`` exon coordinates, both stored as strings.
        Blocks whose end would precede their start are skipped.
    """
    exons = []
    for block_size, relative_start in zip(block_sizes, block_starts):
        block_start = offset + int(relative_start) + 1
        block_end = block_start + int(block_size) - 1
        if block_end < block_start:
            logging.warning("Warning: block end is less than block start, skipping exon")
            continue
        exons.append([str(block_start), str(block_end)])
    return exons


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run Minimap2 software."""

    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
    long_read_fastq_dir = argschema.fields.String(
        required=True,
        description="Long read directory path",
    )
    genome_file = argschema.fields.InputFile(required=True, description="Genome file path")
    minimap2_bin = argschema.fields.String(
        required=False,
        default="minimap2",
        description="Minimap2 software path",
    )
    paftools_bin = argschema.fields.String(
        required=False,
        default="paftools.js",
        description="Paftools software path",
    )
    # Declared once, with an integer default matching the field type
    max_intron_length = argschema.fields.Integer(
        required=False,
        default=100000,
        description="The maximum intron size for alignments.",
    )
    num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads")


def main() -> None:
    """Minimap2's entry-point: parse the arguments, configure logging and run Minimap2."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "minimap.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    # argschema returns plain strings while run_minimap2 works with Path objects
    run_minimap2(
        Path(mod.args["output_dir"]),
        Path(mod.args["long_read_fastq_dir"]),
        Path(mod.args["genome_file"]),
        Path(mod.args["minimap2_bin"]),
        Path(mod.args["paftools_bin"]),
        mod.args["max_intron_length"],
        mod.args["num_threads"],
    )
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/scallop.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/scallop.html new file mode 100644 index 0000000..44ae968 --- /dev/null +++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/scallop.html @@ -0,0 +1,305 @@ + + + + + + + ensembl.tools.anno.transcriptomic_annotation.scallop — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.transcriptomic_annotation.scallop

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Scallop is a high-performance tool designed for the accurate and efficient quantification 
+of transcriptome assembly. 
+It's capable of handling large-scale transcriptomic data while providing precise estimates 
+of transcript abundances.
+Scallop's algorithmic approach allows it to efficiently reconstruct transcript structures 
+and quantify their expression levels, making it a valuable resource for studying gene 
+expression and transcriptome analysis.
+
+Shao M, Kingsford C. Accurate assembly of transcripts through phase-preserving graph 
+decomposition. Nat Biotechnol.
+2017 Dec;35(12):1167-1169. doi: 10.1038/nbt.4020. Epub 2017 Nov 13. PMID: 29131147; PMCID: PMC5722698.
+"""
+
+__all__ = ["run_scallop"]
+import logging
+import logging.config
+from pathlib import Path
+import re
+import subprocess
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+
+
def run_scallop(
    output_dir: Path,
    scallop_bin: Path = Path("scallop"),
    prlimit_bin: Path = Path("prlimit"),
    stringtie_bin: Path = Path("stringtie"),
    memory_limit: int = 40 * 1024**3,
) -> None:
    """
    Run Scallop assembler on short read data after STAR alignment.

    Every ``*.bam`` file in ``{output_dir}/star_output`` is assembled into
    ``scallop_output/{bam name without .bam}.scallop.gtf`` unless that file
    already exists, then the assemblies are merged by ``scallop_merge``. A
    failing Scallop run is logged and the remaining files are still processed.
    The analysis is skipped when ``scallop_output/annotation.gtf`` already
    contains transcripts.

    Args:
        output_dir : Working directory path.
        scallop_bin : Software path.
        prlimit_bin : Software path.
        stringtie_bin : Software path.
        memory_limit : Memory limit in bytes for the Scallop command, ``None``
            disables the limit. Defaults to 40*1024**3.

    Raises:
        IndexError: If the STAR output directory holds no bam file.
    """
    check_exe(scallop_bin)
    check_exe(stringtie_bin)
    scallop_dir = create_dir(output_dir, "scallop_output")
    logging.info("Skip analysis if the gtf file already exists")
    output_file = scallop_dir / "annotation.gtf"
    if output_file.exists():
        transcript_count = check_gtf_content(output_file, "transcript")
        if transcript_count > 0:
            logging.info("Scallop gtf file exists, skipping analysis")
            return

    # The command line entry-point passes a plain string
    star_dir = Path(output_dir) / "star_output"
    sorted_bam_files = list(star_dir.glob("*.bam")) if star_dir.exists() else []
    if not sorted_bam_files:
        raise IndexError(f"The list of sorted bam files is empty, Star output dir: {star_dir}")

    for sorted_bam_file in sorted_bam_files:
        # Only the trailing ".bam" is replaced; a regex like ".bam" would also hit "_bam"
        transcript_file = scallop_dir / f"{sorted_bam_file.stem}.scallop.gtf"
        if transcript_file.exists():
            logging.info(
                "Found an existing scallop gtf file, will not overwrite. File found: %s",
                transcript_file,
            )
            continue
        logging.info("Running Scallop on: %s", sorted_bam_file.name)
        # Plain strings so the command can also be joined for the error report
        scallop_cmd = [
            str(scallop_bin),
            "-i",
            str(sorted_bam_file),
            "-o",
            str(transcript_file),
            "--min_flank_length",
            "10",
        ]
        if memory_limit is not None:
            scallop_cmd = [
                str(item) for item in prlimit_command(prlimit_bin, scallop_cmd, memory_limit)
            ]
        try:
            # Standard output and error are combined into a single text stream
            subprocess.check_output(scallop_cmd, stderr=subprocess.STDOUT, universal_newlines=True)
        except subprocess.CalledProcessError as ex:
            logging.error("Error occurred while running Scallop:")
            logging.error("Command: %s\n", " ".join(scallop_cmd))
            logging.error("Return code: %s\n", str(ex.returncode))
            logging.error("Output and error messages: %s\n", ex.output)

    # Now need to merge
    logging.info("Merge Scallop's output.")
    scallop_merge(scallop_dir, stringtie_bin)
def scallop_merge(scallop_dir: Path, stringtie_bin: Path = Path("stringtie")) -> None:
    """
    Merge Scallop result in a single gtf file.

    The paths of all ``*.scallop.gtf`` files that contain at least one
    transcript are listed in ``scallop_assemblies.txt`` and handed to
    ``stringtie --merge``, which writes ``annotation.gtf``. A StringTie failure
    is logged, it does not raise.

    Args:
        scallop_dir : Input directory's path.
        stringtie_bin : Software path.
    """
    scallop_input_to_file = scallop_dir / "scallop_assemblies.txt"
    scallop_merge_output_file = scallop_dir / "annotation.gtf"
    with open(scallop_input_to_file, "w+", encoding="utf8") as gtf_list_out:
        for gtf_file in scallop_dir.glob("*.scallop.gtf"):
            transcript_count = check_gtf_content(gtf_file, "transcript")
            if transcript_count > 0:
                # gtf_file is a Path: format it, `Path + str` raises TypeError
                gtf_list_out.write(f"{gtf_file}\n")
            else:
                logging.warning("Warning, skipping file with no transcripts. Path:%s\n", gtf_file)

    try:
        subprocess.check_output(
            [
                str(stringtie_bin),
                "--merge",
                "-o",
                str(scallop_merge_output_file),
                str(scallop_input_to_file),
            ],
            stderr=subprocess.STDOUT,
            text=True,
        )

    except subprocess.CalledProcessError as e:
        logging.error("StringTie execution failed with an error:%s", e.output)


def prlimit_command(prlimit_bin, command_list, virtual_memory_limit) -> list:
    """
    Prepend memory limiting arguments to a command list to be run with subprocess.

    This method uses the `prlimit` program to set the memory limit.

    The `virtual_memory_limit` size is in bytes.

    prlimit arguments:
    -v, --as[=limits]
           Address space limit.

    Args:
        prlimit_bin : prlimit software path.
        command_list : Command to run under the memory limit.
        virtual_memory_limit : Address space limit in bytes.

    Returns:
        New command list, every item converted to a string; ``command_list``
        itself is not modified.
    """
    return [str(prlimit_bin), f"-v{virtual_memory_limit}", *(str(item) for item in command_list)]


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run Scallop software."""

    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
    scallop_bin = argschema.fields.String(
        required=False,
        default="scallop",
        description="Scallop software path",
    )
    prlimit_bin = argschema.fields.String(
        required=False,
        default="prlimit",
        description="Prlimit software path",
    )
    stringtie_bin = argschema.fields.String(
        required=False,
        default="stringtie",
        description="StringTie software path, used to merge the Scallop assemblies",
    )
    memory_limit = argschema.fields.Integer(
        required=False, default=40 * 1024**3, description="Memory's limit for Scallop command"
    )


def main() -> None:
    """Scallop's entry-point. :no-index:"""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    log_file_path = create_dir(mod.args["output_dir"], "log") / "scallop.log"
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    # Keyword arguments: passed positionally, memory_limit used to land in the
    # stringtie_bin parameter of run_scallop
    run_scallop(
        output_dir=Path(mod.args["output_dir"]),
        scallop_bin=Path(mod.args["scallop_bin"]),
        prlimit_bin=Path(mod.args["prlimit_bin"]),
        stringtie_bin=Path(mod.args["stringtie_bin"]),
        memory_limit=mod.args["memory_limit"],
    )
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/star.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/star.html new file mode 100644 index 0000000..df38ad2 --- /dev/null +++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/star.html @@ -0,0 +1,720 @@ + + + + + + + ensembl.tools.anno.transcriptomic_annotation.star — ensembl-anno 0.1 documentation + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.transcriptomic_annotation.star

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+The STAR (Spliced Transcripts Alignment to a Reference) alignment tool is widely used
+in genomics research for aligning RNA-seq data to a reference genome.
+Dobin A, Davis CA, Schlesinger F, et al. STAR: ultrafast universal RNA-seq aligner.
+Bioinformatics. 2013;29(1):15-21. doi:10.1093/bioinformatics/bts635
+"""
+
+__all__ = ["run_star", "subsample_transcriptomic_data"]
+import logging
+import logging.config
+import gzip
+import math
+import multiprocessing
+from pathlib import Path
+import random
+import re
+import shutil
+import subprocess
+from typing import List
+import argschema
+
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+    get_seq_region_length,
+)
+
+
+
def run_star(
    genome_file: Path,
    output_dir: Path,
    short_read_fastq_dir: Path,
    delete_pre_trim_fastq: bool = False,
    trim_fastq: bool = False,
    max_reads_per_sample: int = 0,
    max_intron_length: int = 100000,
    num_threads: int = 1,
    star_bin: Path = Path("star"),
    samtools_bin: Path = Path("samtools"),
    trim_galore_bin: Path = Path("trim_galore"),
) -> None:
    """
    Run STAR alignment on list of short read data.

    For every fastq file a sorted bam file, a junction file and STAR's final
    log are stored in ``{output_dir}/star_output``. Files that already have a
    non-empty bam file and a final log are skipped. Nothing is done when a
    StringTie or Scallop ``annotation.gtf`` with transcripts already exists.

    Args:
        genome_file : Genome file path.
        output_dir : Working directory path.
        short_read_fastq_dir : Short read directory path.
        delete_pre_trim_fastq : Delete the original fastq files after trimming. Defaults to False.
        trim_fastq : Trim short read files using TrimGalore. Defaults to False.
        max_reads_per_sample : Max number of reads per sample. Defaults to 0 (unlimited).
        max_intron_length : The maximum intron size for alignments. Defaults to 100000.
        num_threads : Number of available threads.
        star_bin : Software path.
        samtools_bin : Software path.
        trim_galore_bin : Software path.

    Raises:
        IndexError: If no fastq file is found in the short read directory.
    """
    check_exe(star_bin)
    output_dir = Path(output_dir)
    # If trimming has been enabled then switch the path for
    # short_read_fastq_dir from the original location to the trimmed fastq dir
    if trim_fastq:
        run_trimming(output_dir, short_read_fastq_dir, delete_pre_trim_fastq, num_threads, trim_galore_bin)
        short_read_fastq_dir = output_dir / "trim_galore_output"

    star_dir = create_dir(output_dir, "star_output")

    for output_file in [
        output_dir / "stringtie_output/annotation.gtf",
        output_dir / "scallop_output/annotation.gtf",
    ]:
        if output_file.exists():
            transcript_count = check_gtf_content(output_file, "transcript")  # check a gtf
            if transcript_count > 0:
                logging.info("Transcriptomic alignment exists")
                return

    star_index_file = star_dir / "SAindex"
    file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz")
    fastq_file_list = [
        path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type)
    ]
    if not fastq_file_list:
        raise IndexError(f"The list of fastq files is empty. Fastq dir:\n{short_read_fastq_dir}")

    # Get list of paired paths
    fastq_file_list = _create_paired_paths(fastq_file_list)
    # Subsamples in parallel if there's a value set
    if max_reads_per_sample:
        subsample_transcriptomic_data(fastq_file_list)
        # Get the list of the new subsampled files
        fastq_file_list = [
            path for file_type in file_types for path in Path(short_read_fastq_dir).rglob(file_type)
        ]

    if not star_index_file.exists():
        logging.info("Did not find an index file for Star. Will create now")
        seq_region_to_length = get_seq_region_length(genome_file, 0)
        genome_size = sum(seq_region_to_length.values())
        # This calculates the base-2 logarithm of the genome_size. The logarithm of the genome size is
        # a measure of how many bits are needed to represent the genome size in binary.
        #
        # The choice of 14 as the maximum value is likely based on empirical observations and optimization
        # considerations. Too large of a seed length can lead to increased memory usage and potentially
        # slower indexing, while a seed length that is too small might affect alignment accuracy.
        index_bases = min(14, math.floor((math.log(genome_size, 2) / 2) - 1))
        try:
            # check=True makes a failing index build visible in the log; the
            # error is still only reported, as before
            subprocess.run(
                [
                    str(star_bin),
                    "--runThreadN",
                    str(num_threads),
                    "--runMode",
                    "genomeGenerate",
                    "--outFileNamePrefix",
                    f"{star_dir}/",
                    "--genomeDir",
                    str(star_dir),
                    "--genomeSAindexNbases",
                    str(index_bases),
                    "--genomeFastaFiles",
                    str(genome_file),
                ],
                check=True,
            )
        except (subprocess.CalledProcessError, OSError) as e:
            logging.error("An error occurred while creating star index: %s", e)

    logging.info("Running Star on the files in the fastq dir")
    for fastq_file in fastq_file_list:
        fastq_file = Path(fastq_file)
        star_tmp_dir = star_dir / "tmp"
        if star_tmp_dir.exists():
            shutil.rmtree(star_tmp_dir)
        sam_file = Path(f"{star_dir}/{fastq_file.name}.sam")
        junctions_file = Path(f"{star_dir}/{fastq_file.name}.sj.tab")
        sam_temp_file = Path(f"{star_dir}/{sam_file.name}.tmp")
        # Built from the fastq name: substituting the regex ".sam" in the sam
        # file name would also rewrite names containing e.g. "_sam"
        bam_file = f"{fastq_file.name}.bam"
        bam_sort_file = Path(f"{star_dir}/{bam_file}")
        log_out_file = Path(f"{star_dir}/{fastq_file.name}.Log.final.out")
        if log_out_file.exists() and bam_sort_file.exists() and bam_sort_file.stat().st_size != 0:
            logging.info(
                "Found an existing bam file for the fastq file, "
                "presuming the file has been processed, will skip"
            )
            continue

        logging.info("Processing %s", fastq_file)
        star_command = [
            str(star_bin),
            "--outFilterIntronMotifs",
            "RemoveNoncanonicalUnannotated",
            "--outSAMstrandField",
            "intronMotif",
            "--runThreadN",
            str(num_threads),
            "--twopassMode",
            "Basic",
            "--runMode",
            "alignReads",
            "--genomeDir",
            str(star_dir),
            "--readFilesIn",
            str(fastq_file),
            "--outFileNamePrefix",
            f"{star_dir}/",
            "--outTmpDir",
            str(star_tmp_dir),
            "--outSAMtype",
            "SAM",
            "--alignIntronMax",
            str(max_intron_length),
        ]
        # Compressed input is decompressed on the fly; the suffix is tested on
        # the Path itself, re.search does not accept Path objects
        if fastq_file.suffix == ".gz":
            star_command.extend(["--readFilesCommand", "gunzip", "-c"])
        subprocess.run(star_command)  # pylint:disable=subprocess-run-check
        shutil.move(Path(f"{star_dir}/Aligned.out.sam"), sam_file)
        shutil.move(Path(f"{star_dir}/SJ.out.tab"), junctions_file)
        logging.info("Converting samfile into sorted bam file. Bam file: %s", bam_file)
        subprocess.run(  # pylint:disable=subprocess-run-check
            [
                str(samtools_bin),
                "sort",
                "-@",
                str(num_threads),
                "-T",
                str(sam_temp_file),
                "-o",
                str(bam_sort_file),
                str(sam_file),
            ]
        )
        shutil.move(star_dir / "Log.final.out", log_out_file)
        sam_file.unlink()
    logging.info("Completed running STAR")
def _create_paired_paths(fastq_file_paths: List) -> List:
    """
    Order transcriptomic fastq files so that mates of the same run are adjacent.

    Files are grouped on the prefix that precedes ``_<digits>.fastq`` or
    ``_<digits>.fq``; a file without such a suffix forms its own group.

    Args:
        fastq_file_paths (List): List of transcriptomic file paths (str or Path).

    Returns:
        List: Flat list of the input paths, emitted group by group in the order
        in which each group was first seen.
    """
    path_dict = {}
    for fastq_file in fastq_file_paths:
        # re.search() only accepts str/bytes, while callers pass pathlib.Path objects.
        paired_name = re.search(r"(.+)_\d+\.(fastq|fq)", str(fastq_file))
        if not paired_name:
            # No exception is active here, so logging.exception() would only add a
            # meaningless "NoneType: None" traceback; a warning is what is meant.
            logging.warning(
                "Could not find _1 or _2 at the end of the prefix for file. "
                "Assuming file is not paired: %s",
                fastq_file,
            )
            path_dict[fastq_file] = [fastq_file]
            continue
        path_dict.setdefault(paired_name.group(1), []).append(fastq_file)
    ordered_paths = [path for group in path_dict.values() for path in group]
    logging.info(ordered_paths)
    return ordered_paths


# pylint:disable=pointless-string-statement
"""
For an advanced and optimised subsampling we could use
https://github.com/lh3/seqtk
"""


def _is_gzip_file(file_path) -> bool:
    """Return True when the file starts with the two-byte gzip magic number."""
    with open(file_path, "rb") as handle:
        return handle.read(2) == b"\x1f\x8b"


def _subsample_paired_fastq_files(
    fastq_files: List[Path],
    output_files: List[Path] = "",
    subsample_read_limit: int = 100000000,
    num_threads: int = 2,
    compressed: bool = False,
) -> None:
    """
    Subsample one fastq file, or two paired fastq files, in parallel.

    The same randomly chosen read positions are extracted from every input file
    so that paired files stay in sync. Nothing is written when the first file
    already holds ``subsample_read_limit`` reads or fewer.

    Args:
        fastq_files : One fastq path (single end) or two paths (paired end).
        output_files : Output path for each input; defaults to ``<input>.sub``.
        subsample_read_limit : Subsample size, defaults to 100000000.
        num_threads : Number of worker processes, defaults to 2.
        compressed : Kept for backward compatibility. Each input is inspected for
            the gzip magic number, so the flag no longer has to match the data.

    Raises:
        ValueError: If no input is given or inputs and outputs differ in number.
    """
    # pylint:disable=unused-argument
    fastq_files = list(fastq_files)
    if not fastq_files:
        raise ValueError("At least one fastq file is required for subsampling")
    if not output_files:
        output_files = [f"{fastq_file}.sub" for fastq_file in fastq_files]
    output_files = list(output_files)
    if len(output_files) != len(fastq_files):
        raise ValueError(
            f"Expected {len(fastq_files)} output file(s), got {len(output_files)}"
        )

    # The read count is taken from the first file; mates are assumed to hold
    # the same number of reads.
    first_fastq = fastq_files[0]
    opener = gzip.open if _is_gzip_file(first_fastq) else open
    with opener(first_fastq, "rb") as handle:
        num_lines = sum(1 for _ in handle)

    range_limit = num_lines // 4
    if range_limit <= subsample_read_limit:
        logging.info(
            "Number of reads (%s) is less than or equal to the max allowed read count (%s), "
            "no need to subsample",
            range_limit,
            subsample_read_limit,
        )
        return

    # range(range_limit) so that the last read can be drawn as well.
    rand_list = random.sample(range(range_limit), subsample_read_limit)
    # Keys are the line offsets at which a selected four-line record starts.
    random_indices = {idx * 4: 1 for idx in rand_list}
    logging.info("Processing paired files in parallel")
    with multiprocessing.Pool(int(num_threads)) as pool:
        pending = [
            pool.apply_async(
                _subsample_fastq_subset,
                args=(fastq_file, output_file, random_indices, _is_gzip_file(fastq_file)),
            )
            for fastq_file, output_file in zip(fastq_files, output_files)
        ]
        pool.close()
        pool.join()
    # apply_async() stores worker exceptions; get() re-raises them here instead
    # of letting a failed subsample pass unnoticed.
    for result in pending:
        result.get()


def _subsample_fastq_subset(
    fastq_file: Path, output_file: Path, random_indices: dict, compressed: bool
) -> None:
    """
    Copy the selected four-line records of a fastq file to an output file.

    Args:
        fastq_file : Path for the fastq file.
        output_file : Path for the output file (always written uncompressed).
        random_indices : Container of line offsets (multiples of 4) to keep.
        compressed : True when the input has to be read through gzip.
    """
    line_index = 0
    if compressed:
        file_in = gzip.open(fastq_file, "rt", encoding="utf8")
    else:
        file_in = open(fastq_file, encoding="utf8")  # pylint:disable=consider-using-with
    with file_in, open(output_file, "w", encoding="utf8") as file_out:
        lines = [file_in.readline() for _ in range(4)]
        # An empty fourth line means the input ended (or ended mid-record).
        while lines[3]:
            if line_index in random_indices:
                file_out.writelines(lines)
            line_index += 4
            lines = [file_in.readline() for _ in range(4)]
def subsample_transcriptomic_data(
    fastq_file_list: List[Path],
    num_threads: int = 2,
    subsample_read_limit: int = 100000000,
) -> None:
    """
    Subsample single or paired fastq files, skipping samples already subsampled.

    Args:
        fastq_file_list : Samples to process. Each entry is either a single path
            or a list of one (single end) or two (paired end) paths.
        num_threads : number of threads
        subsample_read_limit : Maximum number of reads to keep per file.

    Raises:
        ValueError: If an entry holds neither one nor two fastq files.
    """
    for fastq_files in fastq_file_list:
        # Accept a bare path as well as a list of mates.
        if isinstance(fastq_files, (str, Path)):
            fastq_files = [fastq_files]
        else:
            fastq_files = list(fastq_files)
        if len(fastq_files) not in (1, 2):
            raise ValueError(
                f"Expected one or two fastq files per sample, got {len(fastq_files)}: {fastq_files}"
            )

        sub_files = [Path(f"{fastq_file}.sub") for fastq_file in fastq_files]
        if all(sub_file.exists() for sub_file in sub_files):
            logging.info(
                "Found existing .sub file(s) on the fastq path, will use those instead of "
                "subsampling again. Files: %s",
                ",".join(str(sub_file) for sub_file in sub_files),
            )
            continue

        # Subsample whenever at least one .sub file is missing, so that both
        # mates are always drawn with the same read selection.
        # NOTE(review): confirm _subsample_paired_fastq_files accepts a
        # one-element list for single-end samples.
        _subsample_paired_fastq_files(
            fastq_files,
            subsample_read_limit=subsample_read_limit,
            # Only flag gzip input as compressed instead of assuming it always is.
            compressed=str(fastq_files[0]).endswith(".gz"),
            num_threads=num_threads,
        )
def _group_fastq_mates(fastq_paths) -> List[List[Path]]:
    """Group fastq paths into ``[single]`` or ``[mate_1, mate_2]`` lists by run prefix."""
    groups = {}
    for entry in fastq_paths:
        members = [entry] if isinstance(entry, (str, Path)) else list(entry)
        for fastq_path in members:
            match = re.search(r"(.+)_\d+\.(fastq|fq)", str(fastq_path))
            key = match.group(1) if match else str(fastq_path)
            groups.setdefault(key, []).append(Path(fastq_path))
    return list(groups.values())


def run_trimming(
    output_dir: Path,
    short_read_fastq_dir: Path,
    delete_pre_trim_fastq: bool = False,
    num_threads: int = 1,
    trim_galore_bin="trim_galore",
) -> None:
    """
    Trim list of short read fastq files.

    Trimmed files are written to ``<output_dir>/trim_galore_output`` and the
    Trim Galore name tags (``_val_1``, ``_val_2``, ``_trimmed``) are removed
    from their names.

    Args:
        output_dir : Working directory path.
        short_read_fastq_dir : Short read directory path.
        delete_pre_trim_fastq : Removing original fastq file post trimming. Defaults to False.
        num_threads : Number of threads.
        trim_galore_bin : Software path.
    """
    check_exe(trim_galore_bin)
    trim_dir = create_dir(output_dir, "trim_galore_output")
    # The command line entry point provides this value as a plain string.
    short_read_fastq_dir = Path(short_read_fastq_dir)

    file_types = ("*.fastq", "*.fq", "*.fastq.gz", "*.fq.gz")
    fastq_file_list = [
        path for file_type in file_types for path in short_read_fastq_dir.rglob(file_type)
    ]
    # Trim Galore needs both mates of a pair in one invocation (--paired).
    fastq_groups = _group_fastq_mates(_create_paired_paths(fastq_file_list))

    trim_galore_cmd = [
        str(trim_galore_bin),
        "--illumina",
        "--quality",
        "20",
        "--length",
        "50",
        "--output_dir",
        str(trim_dir),
    ]

    with multiprocessing.Pool(int(num_threads)) as pool:
        pending = [
            # The output directory is already part of the command, so the
            # worker only needs the command and the files to trim.
            pool.apply_async(multiprocess_trim_galore, args=(trim_galore_cmd, fastq_group))
            for fastq_group in fastq_groups
        ]
        pool.close()
        pool.join()
    # Re-raise any worker failure; otherwise a failed trimming would pass
    # unnoticed and the originals could be deleted below.
    for result in pending:
        result.get()

    sub_patterns = re.compile(r"_(?:val_[12]|trimmed)\.fq")
    for trimmed_fastq_path in sorted(trim_dir.glob("*.fq.gz")):
        logging.info("Trimmed file path: %s", str(trimmed_fastq_path))
        # The trimmed files stay in the trim directory; only their name changes.
        updated_file_path = trim_dir / sub_patterns.sub(".fq", trimmed_fastq_path.name)
        logging.info("Updated file path: %s", str(updated_file_path))
        if updated_file_path != trimmed_fastq_path:
            trimmed_fastq_path.rename(updated_file_path)

    # Originals are removed only on request and only after trimming succeeded.
    if delete_pre_trim_fastq:
        for fastq_group in fastq_groups:
            for file_to_delete in fastq_group:
                file_to_delete.unlink()


def multiprocess_trim_galore(trim_galore_cmd: List, fastq_paired_files: List[Path]) -> None:
    """
    Trim short paired or single short read fastq file.

    Args:
        trim_galore_cmd : Generic command; it is copied, never modified.
        fastq_paired_files : List of single or paired fastq files.

    Raises:
        ValueError: If neither one nor two fastq files are given.
        subprocess.CalledProcessError: If Trim Galore exits with an error.
    """
    if isinstance(fastq_paired_files, (str, Path)):
        fastq_paired_files = [fastq_paired_files]
    fastq_args = [str(fastq_file) for fastq_file in fastq_paired_files]
    if len(fastq_args) not in (1, 2):
        raise ValueError(f"Expected one or two fastq files, got {len(fastq_args)}: {fastq_args}")

    # Build a new list so the caller's generic command can be reused.
    command = [str(argument) for argument in trim_galore_cmd]
    if len(fastq_args) == 2:
        command.append("--paired")
    command.extend(fastq_args)

    logging.info("Running Trim Galore with the following command: %s", " ".join(command))
    subprocess.run(command, check=True)


class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run STAR software."""

    genome_file = argschema.fields.InputFile(required=True, description="Genome file path")
    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
    short_read_fastq_dir = argschema.fields.String(
        required=True,
        description="Short read directory path",
    )
    delete_pre_trim_fastq = argschema.fields.Bool(
        required=False,
        default=False,
        description="Delete the original fastq files after trimming",
    )
    trim_fastq = argschema.fields.Bool(
        required=False,
        default=False,
        description="Trim the short read files using Trim Galore",
    )
    # Integer defaults must be integers: the string "0" is truthy and would
    # switch subsampling on.
    max_reads_per_sample = argschema.fields.Integer(
        required=False,
        default=0,
        description="The maximum number of reads to use per sample.",
    )
    max_intron_length = argschema.fields.Integer(
        required=False,
        default=100000,
        description="The maximum intron size for alignments.",
    )
    num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads")
    star_bin = argschema.fields.String(
        required=False,
        default="star",
        description="Star software path",
    )
    samtools_bin = argschema.fields.String(
        required=False,
        default="samtools",
        description="Samtools software path",
    )
    trim_galore_bin = argschema.fields.String(
        required=False,
        default="trim_galore",
        description="Trim Galore software path",
    )


def main() -> None:
    """STAR's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    # argschema yields plain strings, while the pipeline joins paths with "/".
    output_dir = Path(mod.args["output_dir"])
    log_file_path = create_dir(output_dir, "log") / "star.log"
    # NOTE(review): parents[6] assumes a fixed installation depth - confirm.
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_star(
        Path(mod.args["genome_file"]),
        output_dir,
        Path(mod.args["short_read_fastq_dir"]),
        mod.args["delete_pre_trim_fastq"],
        mod.args["trim_fastq"],
        mod.args["max_reads_per_sample"],
        mod.args["max_intron_length"],
        mod.args["num_threads"],
        mod.args["star_bin"],
        mod.args["samtools_bin"],
        mod.args["trim_galore_bin"],
    )


# pylint:disable=pointless-string-statement
"""
def model_builder(work_dir):

    star_output_dir = os.path.join(work_dir, "star_output")

    all_junctions_file = os.path.join(star_output_dir, "all_junctions.sj")
    sjf_out = open(all_junctions_file, "w+")

    for sj_tab_file in glob.glob(input_dir + "/*.sj.tab"):
        sjf_in = open(sj_tab_file)
        sjf_lines = sjf_in.readlines()
        for line in sjf_lines:
            elements = line.split("\t")
            strand = "+"

            # my $slice_name = $eles[0];
            # my $start = $eles[1];
            # my $end = $eles[2];
            # my $strand = $eles[3];

            # If the strand is undefined then skip, Augustus expects a strand
            if elements[3] == "0":
                continue
            elif elements[3] == "2":
                strand = "-"

            junction_length = int(elements[2]) - int(elements[1]) + 1
            if junction_length < 100:
                continue

            if not elements[4] and elements[7] < 10:
                continue

            # For the moment treat multimapping and single
            # mapping things as a combined score
            score = float(elements[6]) + float(elements[7])
            score = str(score)
            output_line = [
                elements[0],
                "RNASEQ",
                "intron",
                elements[1],
                elements[2],
                score,
                strand,
                ".",
                ("src=W;mul=" + score + ";"),
            ]
            sjf_out.write("\t".join(output_line) + "\n")

    sjf_out.close()
"""
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/ensembl/tools/anno/transcriptomic_annotation/stringtie.html b/_modules/ensembl/tools/anno/transcriptomic_annotation/stringtie.html new file mode 100644 index 0000000..55cf6e6 --- /dev/null +++ b/_modules/ensembl/tools/anno/transcriptomic_annotation/stringtie.html @@ -0,0 +1,256 @@ + + + + + + + ensembl.tools.anno.transcriptomic_annotation.stringtie — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Source code for ensembl.tools.anno.transcriptomic_annotation.stringtie

+# See the NOTICE file distributed with this work for additional information
+# regarding copyright ownership.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+StringTie is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts.
+It uses a novel network flow algorithm as well as an optional de novo assembly step to assemble and
+quantitate full-length transcripts representing multiple splice variants for each gene locus.
+Pertea M, Pertea GM, Antonescu CM, Chang TC, Mendell JT & Salzberg SL. StringTie enables improved 
+reconstruction of a transcriptome from RNA-seq reads Nature Biotechnology 2015, doi:10.1038/nbt.3122
+"""
+
+__all__ = ["run_stringtie"]
+import logging
+import logging.config
+from pathlib import Path
+import re
+import subprocess
+import argschema
+
+from ensembl.tools.anno.utils._utils import (
+    check_exe,
+    create_dir,
+    check_gtf_content,
+)
+
+
+
def run_stringtie(
    output_dir: Path,
    stringtie_bin: Path = Path("stringtie"),
    num_threads: int = 1,
) -> None:
    """
    StringTie assembler of short read data.

    Assembles every sorted bam file found in ``<output_dir>/star_output`` and
    merges the assemblies into ``<output_dir>/stringtie_output/annotation.gtf``.
    The analysis is skipped when that file already contains transcripts.

    Args:
        output_dir : Working directory path.
        stringtie_bin : Software path.
        num_threads : Number of available threads.

    Raises:
        IndexError: If the STAR output directory holds no bam file.
    """
    check_exe(stringtie_bin)
    output_dir = Path(output_dir)
    stringtie_dir = create_dir(output_dir, "stringtie_output")
    logging.info("Skip analysis if the gtf file already exists")
    stringtie_merge_output_file = stringtie_dir / "annotation.gtf"
    if stringtie_merge_output_file.exists():
        transcript_count = check_gtf_content(stringtie_merge_output_file, "transcript")
        if transcript_count > 0:
            logging.info("Stringtie gtf file exists, skipping analysis")
            return

    stringtie_merge_input_file = stringtie_dir / "stringtie_assemblies.txt"
    star_dir = output_dir / "star_output"

    # List the bam files once; sorting keeps the processing order reproducible.
    sorted_bam_files = sorted(star_dir.glob("*.bam")) if star_dir.exists() else []
    if not sorted_bam_files:
        raise IndexError(f"The list of sorted bam files is empty, Star output dir: {star_dir}")

    for sorted_bam_file in sorted_bam_files:
        # Replace only the trailing ".bam"; a regex with an unescaped dot could
        # also rewrite "<any char>bam" in the middle of the name.
        transcript_file = stringtie_dir / f"{sorted_bam_file.stem}.stringtie.gtf"
        if transcript_file.exists():
            logging.info(
                "Found an existing stringtie gtf file, will not overwrite. File found: %s",
                transcript_file,
            )
            continue
        logging.info("Running Stringtie on: %s", sorted_bam_file.name)
        try:
            subprocess.check_output(
                [
                    str(stringtie_bin),
                    str(sorted_bam_file),
                    "-o",
                    str(transcript_file),
                    "-p",
                    str(num_threads),
                    "-t",  # disable trimming of predicted transcripts based on coverage
                    "-a",  # minimum anchor length for junctions
                    "15",
                ],
                # Capture stderr too, so the error log below holds the messages.
                stderr=subprocess.STDOUT,
            )
        except subprocess.CalledProcessError as e:
            logging.error("Error running Stringtie command: %s", e)
            logging.error("Return code: %s", str(e.returncode))
            logging.error("Output and error messages:%s\n", e.output)

    logging.info("Creating Stringtie merge input file: %s", stringtie_merge_input_file)
    assemblies_to_merge = 0
    with open(stringtie_merge_input_file, "w+", encoding="utf8") as gtf_list_out:
        for gtf_file in sorted(stringtie_dir.glob("*.stringtie.gtf")):
            transcript_count = check_gtf_content(gtf_file, "transcript")
            if transcript_count > 0:
                gtf_list_out.write(f"{gtf_file}\n")
                assemblies_to_merge += 1
            else:
                logging.warning("Warning, skipping file with no transcripts. Path:%s", gtf_file)
    if assemblies_to_merge == 0:
        logging.error("No Stringtie assembly with transcripts found, nothing to merge")
        return

    logging.info("Merging Stringtie results.")
    try:
        # check=True is what makes a failing merge raise CalledProcessError.
        subprocess.run(
            [
                str(stringtie_bin),
                "--merge",
                "-o",
                str(stringtie_merge_output_file),
                str(stringtie_merge_input_file),
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        logging.error("Error running Stringtie merging command: %s", e)
class InputSchema(argschema.ArgSchema):
    """Input arguments expected to run StringTie software."""

    output_dir = argschema.fields.OutputDir(required=True, description="Output directory path")
    stringtie_bin = argschema.fields.String(
        required=False,
        default="stringtie",
        description="StringTie software path",
    )
    num_threads = argschema.fields.Integer(required=False, default=1, description="Number of threads")


def main() -> None:
    """StringTie's entry-point."""
    mod = argschema.ArgSchemaParser(schema_type=InputSchema)
    # argschema yields plain strings, while run_stringtie joins paths with "/".
    output_dir = Path(mod.args["output_dir"])
    log_file_path = create_dir(output_dir, "log") / "stringtie.log"
    # NOTE(review): parents[6] assumes a fixed installation depth - confirm.
    loginipath = Path(__file__).parents[6] / "conf" / "logging.conf"
    logging.config.fileConfig(
        loginipath,
        defaults={"logfilename": str(log_file_path)},
        disable_existing_loggers=False,
    )
    run_stringtie(
        output_dir,
        Path(mod.args["stringtie_bin"]),
        mod.args["num_threads"],
    )
+ +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/_modules/index.html b/_modules/index.html new file mode 100644 index 0000000..2ed8aa9 --- /dev/null +++ b/_modules/index.html @@ -0,0 +1,117 @@ + + + + + + + Overview: module code — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+ + +
+
+
+ + + + + \ No newline at end of file diff --git a/_sources/cmsearch.rst.txt b/_sources/cmsearch.rst.txt new file mode 100644 index 0000000..3faba6f --- /dev/null +++ b/_sources/cmsearch.rst.txt @@ -0,0 +1,8 @@ +Cmsearch Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.snc_rna_annotation.cmsearch + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/cpg.rst.txt b/_sources/cpg.rst.txt new file mode 100644 index 0000000..8603218 --- /dev/null +++ b/_sources/cpg.rst.txt @@ -0,0 +1,8 @@ +CpG Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.simple_feature_annotation.cpg + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/dust.rst.txt b/_sources/dust.rst.txt new file mode 100644 index 0000000..2bc64ca --- /dev/null +++ b/_sources/dust.rst.txt @@ -0,0 +1,8 @@ +DustMasker Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.repeat_annotation.dust + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/eponine.rst.txt b/_sources/eponine.rst.txt new file mode 100644 index 0000000..e460382 --- /dev/null +++ b/_sources/eponine.rst.txt @@ -0,0 +1,8 @@ +Eponine Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.simple_feature_annotation.eponine + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/genblast.rst.txt b/_sources/genblast.rst.txt new file mode 100644 index 0000000..401f411 --- /dev/null +++ b/_sources/genblast.rst.txt @@ -0,0 +1,8 @@ +Genblast Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.protein_annotation.genblast + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt new file mode 100644 index 0000000..98a7404 --- /dev/null +++ b/_sources/index.rst.txt @@ -0,0 +1,58 @@ +.. 
See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. ensembl-anno documentation master file, created by + sphinx-quickstart on Fri Sep 1 12:25:36 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +======================================== +Ensembl-anno +======================================== + +Anno tool kit + + +Contents +-------- +Check out the :ref:`installation <install>` section for further information on how +to install the project. + +.. toctree:: + :maxdepth: 2 + :caption: Index + + install + license + + _modules/cpg + _modules/dust + _modules/eponine + _modules/genblast + _modules/minimap + _modules/red + _modules/repeatmasker + _modules/scallop + _modules/star + _modules/stringtie + _modules/trf + _modules/trnascan + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` \ No newline at end of file diff --git a/_sources/install.rst.txt b/_sources/install.rst.txt new file mode 100644 index 0000000..997e6c0 --- /dev/null +++ b/_sources/install.rst.txt @@ -0,0 +1,54 @@ +.. See the NOTICE file distributed with this work for additional information + regarding copyright ownership. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +API Setup and installation +=========================== + +Requirements +-------------- + +.. _install: + +An Ensembl API checkout including: + +- ensembl-production `ensembl-production `_. +- ensembl-analysis `ensembl-analysis `_. (on dev/hive_master branch) +- ensembl-taxonomy `ensembl-taxonomy `_. +- ensembl-orm `ensembl-orm `_. + +Software +^^^^^^^^ + +#. Python 3.8+ +#. Bioperl 1.6.9+ + +Python Modules +^^^^^^^^^^^^^^ +#. argschema + + + +Installation +------------ +Directly from GitHub: + +.. code-block:: none + :linenos: + + git clone https://github.com/Ensembl/ensembl-analysis -b experimental/gbiab + git clone https://github.com/Ensembl/ensembl-production + git clone https://github.com/Ensembl/ensembl-hive + git clone https://github.com/Ensembl/ensembl-taxonomy + git clone https://github.com/Ensembl/ensembl-orm \ No newline at end of file diff --git a/_sources/license.rst.txt b/_sources/license.rst.txt new file mode 100644 index 0000000..9e9b2fe --- /dev/null +++ b/_sources/license.rst.txt @@ -0,0 +1,203 @@ +License +------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/_sources/minimap.rst.txt b/_sources/minimap.rst.txt new file mode 100644 index 0000000..bf3a45c --- /dev/null +++ b/_sources/minimap.rst.txt @@ -0,0 +1,8 @@ +Minimap2 Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.minimap + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/red.rst.txt b/_sources/red.rst.txt new file mode 100644 index 0000000..26743cd --- /dev/null +++ b/_sources/red.rst.txt @@ -0,0 +1,8 @@ +Red Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.repeat_annotation.red + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/repeatmasker.rst.txt b/_sources/repeatmasker.rst.txt new file mode 100644 index 0000000..8598c0e --- /dev/null +++ b/_sources/repeatmasker.rst.txt @@ -0,0 +1,8 @@ +Repeatmasker Module Documentation +================================= + +.. automodule:: ensembl.tools.anno.repeat_annotation.repeatmasker + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/scallop.rst.txt b/_sources/scallop.rst.txt new file mode 100644 index 0000000..744055b --- /dev/null +++ b/_sources/scallop.rst.txt @@ -0,0 +1,8 @@ +Scallop Module Documentation +============================== + +.. 
automodule:: ensembl.tools.anno.transcriptomic_annotation.scallop + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/star.rst.txt b/_sources/star.rst.txt new file mode 100644 index 0000000..d83c66b --- /dev/null +++ b/_sources/star.rst.txt @@ -0,0 +1,8 @@ +STAR Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.star + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/stringtie.rst.txt b/_sources/stringtie.rst.txt new file mode 100644 index 0000000..878de41 --- /dev/null +++ b/_sources/stringtie.rst.txt @@ -0,0 +1,8 @@ +Stringtie Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.transcriptomic_annotation.stringtie + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/trf.rst.txt b/_sources/trf.rst.txt new file mode 100644 index 0000000..9268f3c --- /dev/null +++ b/_sources/trf.rst.txt @@ -0,0 +1,8 @@ +TRF Module Documentation +============================== + +.. automodule:: ensembl.tools.anno.repeat_annotation.trf + :members: + :undoc-members: + :show-inheritance: + diff --git a/_sources/trnascan.rst.txt b/_sources/trnascan.rst.txt new file mode 100644 index 0000000..d9da9d4 --- /dev/null +++ b/_sources/trnascan.rst.txt @@ -0,0 +1,8 @@ +tRNAscan-SE Module Documentation +================================ + +.. automodule:: ensembl.tools.anno.snc_rna_annotation.trnascan + :members: + :undoc-members: + :show-inheritance: + diff --git a/_static/agogo.css b/_static/agogo.css new file mode 100644 index 0000000..11b0b92 --- /dev/null +++ b/_static/agogo.css @@ -0,0 +1,555 @@ +/* + * agogo.css_t + * ~~~~~~~~~~~ + * + * Sphinx stylesheet -- agogo theme. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +* { + margin: 0px; + padding: 0px; +} + +body { + font-family: Garamond, Arial, serif; + line-height: 1.4em; + color: black; + background-color: #009999; + + /* fix for background colors breaking at horizontal + scrolling on smaller devices */ + min-width: fit-content; +} + + +/* Page layout */ + +div.header, div.content, div.footer { + width: 70em; + margin-left: auto; + margin-right: auto; +} + +div.header-wrapper { + background: #009999; + border-bottom: 3px solid #2e3436; +} + + +/* Default body styles */ +a { + color: green; +} + +a:visited { + color: #551a8b; +} + +div.bodywrapper a, div.footer a { + text-decoration: underline; +} + +.clearer { + clear: both; +} + +.left { + float: left; +} + +.right { + float: right; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +h1, h2, h3, h4 { + font-family: Arial, Helvetica, serif; + font-weight: normal; + color: #3465a4; + margin-bottom: .8em; +} + +h1 { + color: #204a87; +} + +h2 { + padding-bottom: .5em; + border-bottom: 1px solid #3465a4; +} + +a.headerlink { + visibility: hidden; + color: #dddddd; + padding-left: .3em; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +img { + border: 0; +} + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 2px 7px 1px 7px; + border-left: 0.2em solid black; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +dt:target, .highlighted { + background-color: #fbe54e; +} + +/* Header */ + +div.header { + padding: 1em; +} + +div.header .headertitle { + font-family: Arial, Helvetica, serif; + font-weight: normal; + font-size: 
180%; + letter-spacing: .08em; + margin-bottom: .8em; +} + +div.header .headertitle a { + color: white; +} + +div.header div.rel { + margin-top: 1em; +} + +div.header div.rel a { + color: #33d6ff; + letter-spacing: .1em; + text-transform: uppercase; +} + +p.logo { + float: right; +} + +img.logo { + border: 0; +} + + +/* Content */ +div.content-wrapper { + background-color: white; + padding: 1em; +} + +div.document { + width: 50em; + float: left; +} + +div.body { + padding-right: 2em; + text-align: justify; +} + +div.document h1 { + line-height: 120%; +} + +div.document ul { + margin: 1.5em; + list-style-type: square; +} + +div.document dd { + margin-left: 1.2em; + margin-top: .4em; + margin-bottom: 1em; +} + +div.document .section { + margin-top: 1.7em; +} +div.document .section:first-child { + margin-top: 0px; +} + +div.document div.highlight { + padding: 3px; + border-top: 2px solid #dddddd; + border-bottom: 2px solid #dddddd; + margin-top: .8em; + margin-bottom: .8em; +} + +div.document div.literal-block-wrapper { + margin-top: .8em; + margin-bottom: .8em; +} + +div.document div.literal-block-wrapper div.highlight { + margin: 0; +} + +div.document div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.document div.code-block-caption span.caption-text { +} + +div.document h2 { + margin-top: .7em; +} + +div.document p { + margin-bottom: .5em; +} + +div.document li.toctree-l1 { + margin-bottom: 1em; +} + +div.document .descname { + font-weight: bold; +} + +div.document .sig-paren { + font-size: larger; +} + +div.document .docutils.literal { + background-color: #eeeeec; + padding: 1px; +} + +div.document .docutils.xref.literal { + background-color: transparent; + padding: 0px; +} + +div.document blockquote { + margin: 1em; +} + +div.document ol { + margin: 1.5em; +} + + +/* Sidebar */ + +div.sidebar, +aside.sidebar { + width: 20em; + float: right; + font-size: .9em; +} + +div.sidebar a, aside.sidebar a, div.header a { + 
text-decoration: none; +} + +div.sidebar a:hover, aside.sidebar a:hover, div.header a:hover { + text-decoration: underline; +} + +div.sidebar h3, +aside.sidebar h3 { + color: #2e3436; + text-transform: uppercase; + font-size: 130%; + letter-spacing: .1em; +} + +div.sidebar ul, +aside.sidebar ul { + list-style-type: none; +} + +div.sidebar li.toctree-l1 a, +aside.sidebar li.toctree-l1 a { + display: block; + padding: 1px; + border: 1px solid #dddddd; + background-color: #eeeeec; + margin-bottom: .4em; + padding-left: 3px; + color: #2e3436; +} + +div.sidebar li.toctree-l2 a, +aside.sidebar li.toctree-l2 a { + background-color: transparent; + border: none; + margin-left: 1em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l3 a, +aside.sidebar li.toctree-l3 a { + background-color: transparent; + border: none; + margin-left: 2em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l2:last-child a, +aside.sidebar li.toctree-l2:last-child a { + border-bottom: none; +} + +div.sidebar li.toctree-l1.current a, +aside.sidebar li.toctree-l1.current a { + border-right: 5px solid #33d6ff; +} + +div.sidebar li.toctree-l1.current li.toctree-l2 a, +aside.sidebar li.toctree-l1.current li.toctree-l2 a { + border-right: none; +} + +div.sidebar input[type="text"], +aside.sidebar input[type="text"] { + width: 170px; +} + +div.sidebar input[type="submit"], +aside.sidebar input[type="submit"] { + width: 30px; +} + + +/* Footer */ + +div.footer-wrapper { + background: #e6fff9; + border-top: 4px solid #babdb6; + padding-top: 10px; + padding-bottom: 10px; + min-height: 80px; +} + +div.footer, div.footer a { + color: #888a85; +} + +div.footer .right { + text-align: right; +} + +div.footer .left { + text-transform: uppercase; +} + + +/* Styles copied from basic theme */ + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, 
object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-right { + text-align: right; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li div.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: 
#f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- viewcode extension ---------------------------------------------------- */ + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: Garamond, Arial, serif; +} + +div.viewcode-block:target { + margin: -1px -3px; + padding: 0 3px; + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +div.code-block-caption { + background-color: #ddd; + color: #333; + padding: 2px 5px; + font-size: small; +} + +/* -- math display ---------------------------------------------------------- */ + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} \ No newline at end of file diff --git a/_static/basic.css b/_static/basic.css new file mode 100644 index 0000000..a917981 --- /dev/null +++ b/_static/basic.css @@ -0,0 +1,925 @@ +/* + * basic.css + * ~~~~~~~~~ + * + * Sphinx stylesheet -- basic theme. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +/* -- main layout ----------------------------------------------------------- */ + +div.clearer { + clear: both; +} + +div.section::after { + display: block; + content: ''; + clear: left; +} + +/* -- relbar ---------------------------------------------------------------- */ + +div.related { + width: 100%; + font-size: 90%; +} + +div.related h3 { + display: none; +} + +div.related ul { + margin: 0; + padding: 0 0 0 10px; + list-style: none; +} + +div.related li { + display: inline; +} + +div.related li.right { + float: right; + margin-right: 5px; +} + +/* -- sidebar --------------------------------------------------------------- */ + +div.sphinxsidebarwrapper { + padding: 10px 5px 0 10px; +} + +div.sphinxsidebar { + float: left; + width: 20em; + margin-left: -100%; + font-size: 90%; + word-wrap: break-word; + overflow-wrap : break-word; +} + +div.sphinxsidebar ul { + list-style: none; +} + +div.sphinxsidebar ul ul, +div.sphinxsidebar ul.want-points { + margin-left: 20px; + list-style: square; +} + +div.sphinxsidebar ul ul { + margin-top: 0; + margin-bottom: 0; +} + +div.sphinxsidebar form { + margin-top: 10px; +} + +div.sphinxsidebar input { + border: 1px solid #98dbcc; + font-family: sans-serif; + font-size: 1em; +} + +div.sphinxsidebar #searchbox form.search { + overflow: hidden; +} + +div.sphinxsidebar #searchbox input[type="text"] { + float: left; + width: 80%; + padding: 0.25em; + box-sizing: border-box; +} + +div.sphinxsidebar #searchbox input[type="submit"] { + float: left; + width: 20%; + border-left: none; + padding: 0.25em; + box-sizing: border-box; +} + + +img { + border: 0; + max-width: 100%; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li 
p.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; + margin-left: auto; + margin-right: auto; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable { + width: 100%; +} + +table.indextable td { + text-align: left; + vertical-align: top; +} + +table.indextable ul { + margin-top: 0; + margin-bottom: 0; + list-style-type: none; +} + +table.indextable > tbody > tr > td > ul { + padding-left: 0em; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +div.modindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +div.genindex-jumpbox { + border-top: 1px solid #ddd; + border-bottom: 1px solid #ddd; + margin: 1em 0 1em 0; + padding: 0.4em; +} + +/* -- domain module index --------------------------------------------------- */ + +table.modindextable td { + padding: 2px; + border-collapse: collapse; +} + +/* -- general body styles --------------------------------------------------- */ + +div.body { + min-width: 360px; + max-width: 800px; +} + +div.body p, div.body dd, div.body li, div.body blockquote { + -moz-hyphens: auto; + -ms-hyphens: auto; + -webkit-hyphens: auto; + hyphens: auto; +} + +a.headerlink { + visibility: hidden; +} + +a:visited { + color: #551A8B; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > 
a.headerlink, +dt:hover > a.headerlink, +caption:hover > a.headerlink, +p.caption:hover > a.headerlink, +div.code-block-caption:hover > a.headerlink { + visibility: visible; +} + +div.body p.caption { + text-align: inherit; +} + +div.body td { + text-align: left; +} + +.first { + margin-top: 0 !important; +} + +p.rubric { + margin-top: 30px; + font-weight: bold; +} + +img.align-left, figure.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, figure.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, figure.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +img.align-default, figure.align-default, .figure.align-default { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + text-align: center; +} + +.align-default { + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- sidebars -------------------------------------------------------------- */ + +div.sidebar, +aside.sidebar { + margin: 0 0 0.5em 1em; + border: 1px solid #ddb; + padding: 7px; + background-color: #ffe; + width: 40%; + float: right; + clear: right; + overflow-x: auto; +} + +p.sidebar-title { + font-weight: bold; +} + +nav.contents, +aside.topic, +div.admonition, div.topic, blockquote { + clear: left; +} + +/* -- topics ---------------------------------------------------------------- */ + +nav.contents, +aside.topic, +div.topic { + border: 1px solid #ccc; + padding: 7px; + margin: 10px 0 10px 0; +} + +p.topic-title { + font-size: 1.1em; + font-weight: bold; + margin-top: 10px; +} + +/* -- admonitions ----------------------------------------------------------- */ + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 7px; +} + +div.admonition dt { + font-weight: bold; +} + 
+p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +div.body p.centered { + text-align: center; + margin-top: 25px; +} + +/* -- content of sidebars/topics/admonitions -------------------------------- */ + +div.sidebar > :last-child, +aside.sidebar > :last-child, +nav.contents > :last-child, +aside.topic > :last-child, +div.topic > :last-child, +div.admonition > :last-child { + margin-bottom: 0; +} + +div.sidebar::after, +aside.sidebar::after, +nav.contents::after, +aside.topic::after, +div.topic::after, +div.admonition::after, +blockquote::after { + display: block; + content: ''; + clear: both; +} + +/* -- tables ---------------------------------------------------------------- */ + +table.docutils { + margin-top: 10px; + margin-bottom: 10px; + border: 0; + border-collapse: collapse; +} + +table.align-center { + margin-left: auto; + margin-right: auto; +} + +table.align-default { + margin-left: auto; + margin-right: auto; +} + +table caption span.caption-number { + font-style: italic; +} + +table caption span.caption-text { +} + +table.docutils td, table.docutils th { + padding: 1px 8px 1px 5px; + border-top: 0; + border-left: 0; + border-right: 0; + border-bottom: 1px solid #aaa; +} + +th { + text-align: left; + padding-right: 5px; +} + +table.citation { + border-left: solid 1px gray; + margin-left: 1px; +} + +table.citation td { + border-bottom: none; +} + +th > :first-child, +td > :first-child { + margin-top: 0px; +} + +th > :last-child, +td > :last-child { + margin-bottom: 0px; +} + +/* -- figures --------------------------------------------------------------- */ + +div.figure, figure { + margin: 0.5em; + padding: 0.5em; +} + +div.figure p.caption, figcaption { + padding: 0.3em; +} + +div.figure p.caption span.caption-number, +figcaption span.caption-number { + font-style: italic; +} + +div.figure p.caption span.caption-text, +figcaption span.caption-text { +} + +/* -- field list styles 
----------------------------------------------------- */ + +table.field-list td, table.field-list th { + border: 0 !important; +} + +.field-list ul { + margin: 0; + padding-left: 1em; +} + +.field-list p { + margin: 0; +} + +.field-name { + -moz-hyphens: manual; + -ms-hyphens: manual; + -webkit-hyphens: manual; + hyphens: manual; +} + +/* -- hlist styles ---------------------------------------------------------- */ + +table.hlist { + margin: 1em 0; +} + +table.hlist td { + vertical-align: top; +} + +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + + +/* -- other body styles ----------------------------------------------------- */ + +ol.arabic { + list-style: decimal; +} + +ol.loweralpha { + list-style: lower-alpha; +} + +ol.upperalpha { + list-style: upper-alpha; +} + +ol.lowerroman { + list-style: lower-roman; +} + +ol.upperroman { + list-style: upper-roman; +} + +:not(li) > ol > li:first-child > :first-child, +:not(li) > ul > li:first-child > :first-child { + margin-top: 0px; +} + +:not(li) > ol > li:last-child > :last-child, +:not(li) > ul > li:last-child > :last-child { + margin-bottom: 0px; +} + +ol.simple ol p, +ol.simple ul p, +ul.simple ol p, +ul.simple ul p { + 
margin-top: 0; +} + +ol.simple > li:not(:first-child) > p, +ul.simple > li:not(:first-child) > p { + margin-top: 0; +} + +ol.simple p, +ul.simple p { + margin-bottom: 0; +} + +aside.footnote > span, +div.citation > span { + float: left; +} +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > p:last-of-type, +div.citation > p:last-of-type { + margin-bottom: 0em; +} +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { + content: ""; + clear: both; +} + +dl.field-list { + display: grid; + grid-template-columns: fit-content(30%) auto; +} + +dl.field-list > dt { + font-weight: bold; + word-break: break-word; + padding-left: 0.5em; + padding-right: 5px; +} + +dl.field-list > dd { + padding-left: 0.5em; + margin-top: 0em; + margin-left: 0em; + margin-bottom: 0em; +} + +dl { + margin-bottom: 15px; +} + +dd > :first-child { + margin-top: 0px; +} + +dd ul, dd table { + margin-bottom: 10px; +} + +dd { + margin-top: 3px; + margin-bottom: 10px; + margin-left: 30px; +} + +.sig dd { + margin-top: 0px; + margin-bottom: 0px; +} + +.sig dl { + margin-top: 0px; + margin-bottom: 0px; +} + +dl > dd:last-child, +dl > dd:last-child > :last-child { + margin-bottom: 0; +} + +dt:target, span.highlighted { + background-color: #fbe54e; +} + +rect.highlighted { + fill: #fbe54e; +} + +dl.glossary dt { + font-weight: bold; + font-size: 1.1em; +} + +.versionmodified { + font-style: italic; +} + +.system-message { + background-color: #fda; + padding: 5px; + border: 3px solid red; +} + +.footnote:target { + background-color: #ffa; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +.guilabel, .menuselection { + font-family: sans-serif; +} + +.accelerator { + text-decoration: underline; +} + 
+.classifier { + font-style: oblique; +} + +.classifier:before { + font-style: normal; + margin: 0 0.5em; + content: ":"; + display: inline-block; +} + +abbr, acronym { + border-bottom: dotted 1px; + cursor: help; +} + +.translated { + background-color: rgba(207, 255, 207, 0.2) +} + +.untranslated { + background-color: rgba(255, 207, 207, 0.2) +} + +/* -- code displays --------------------------------------------------------- */ + +pre { + overflow: auto; + overflow-y: hidden; /* fixes display issues on Chrome browsers */ +} + +pre, div[class*="highlight-"] { + clear: both; +} + +span.pre { + -moz-hyphens: none; + -ms-hyphens: none; + -webkit-hyphens: none; + hyphens: none; + white-space: nowrap; +} + +div[class*="highlight-"] { + margin: 1em 0; +} + +td.linenos pre { + border: 0; + background-color: transparent; + color: #aaa; +} + +table.highlighttable { + display: block; +} + +table.highlighttable tbody { + display: block; +} + +table.highlighttable tr { + display: flex; +} + +table.highlighttable td { + margin: 0; + padding: 0; +} + +table.highlighttable td.linenos { + padding-right: 0.5em; +} + +table.highlighttable td.code { + flex: 1; + overflow: hidden; +} + +.highlight .hll { + display: block; +} + +div.highlight pre, +table.highlighttable pre { + margin: 0; +} + +div.code-block-caption + div { + margin-top: 0; +} + +div.code-block-caption { + margin-top: 1em; + padding: 2px 5px; + font-size: small; +} + +div.code-block-caption code { + background-color: transparent; +} + +table.highlighttable td.linenos, +span.linenos, +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ +} + +div.code-block-caption span.caption-number { + padding: 0.1em 0.3em; + font-style: italic; +} + +div.code-block-caption span.caption-text { +} + +div.literal-block-wrapper { + margin: 
1em 0; +} + +code.xref, a code { + background-color: transparent; + font-weight: bold; +} + +h1 code, h2 code, h3 code, h4 code, h5 code, h6 code { + background-color: transparent; +} + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family: sans-serif; +} + +div.viewcode-block:target { + margin: -1px -10px; + padding: 0 10px; +} + +/* -- math display ---------------------------------------------------------- */ + +img.math { + vertical-align: middle; +} + +div.body div.math p { + text-align: center; +} + +span.eqno { + float: right; +} + +span.eqno a.headerlink { + position: absolute; + z-index: 1; +} + +div.math:hover a.headerlink { + visibility: visible; +} + +/* -- printout stylesheet --------------------------------------------------- */ + +@media print { + div.document, + div.documentwrapper, + div.bodywrapper { + margin: 0 !important; + width: 100%; + } + + div.sphinxsidebar, + div.related, + div.footer, + #top-link { + display: none; + } +} \ No newline at end of file diff --git a/_static/bgfooter.png b/_static/bgfooter.png new file mode 100644 index 0000000..b7c7cad Binary files /dev/null and b/_static/bgfooter.png differ diff --git a/_static/bgtop.png b/_static/bgtop.png new file mode 100644 index 0000000..0574088 Binary files /dev/null and b/_static/bgtop.png differ diff --git a/_static/doctools.js b/_static/doctools.js new file mode 100644 index 0000000..d06a71d --- /dev/null +++ b/_static/doctools.js @@ -0,0 +1,156 @@ +/* + * doctools.js + * ~~~~~~~~~~~ + * + * Base JavaScript utilities for all Sphinx HTML documentation. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ +"use strict"; + +const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", +]); + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); + } +}; + +/** + * Small JavaScript module for the documentation. + */ +const Documentation = { + init: () => { + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); + }, + + /** + * i18n support + */ + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", + + // gettext and ngettext don't access this so that the functions + // can safely be bound to a different name (_ = Documentation.gettext) + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } + }, + + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? 
singular : plural; + }, + + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; + }, + + /** + * helper function to focus on search bar + */ + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); + }, + + /** + * Initialise the domain index toggle buttons + */ + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); + } + }; + + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); + }, + + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.altKey || event.ctrlKey || event.metaKey) return; + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + 
event.preventDefault(); + } + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); + } + break; + } + } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } + }); + }, +}; + +// quick alias for translations +const _ = Documentation.gettext; + +_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js new file mode 100644 index 0000000..e21c068 --- /dev/null +++ b/_static/documentation_options.js @@ -0,0 +1,13 @@ +const DOCUMENTATION_OPTIONS = { + VERSION: '0.1', + LANGUAGE: 'en', + COLLAPSE_INDEX: false, + BUILDER: 'html', + FILE_SUFFIX: '.html', + LINK_SUFFIX: '.html', + HAS_SOURCE: true, + SOURCELINK_SUFFIX: '.txt', + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + ENABLE_SEARCH_SHORTCUTS: true, +}; \ No newline at end of file diff --git a/_static/file.png b/_static/file.png new file mode 100644 index 0000000..a858a41 Binary files /dev/null and b/_static/file.png differ diff --git a/_static/language_data.js b/_static/language_data.js new file mode 100644 index 0000000..250f566 --- /dev/null +++ b/_static/language_data.js @@ -0,0 +1,199 @@ +/* + * language_data.js + * ~~~~~~~~~~~~~~~~ + * + * This script contains the language-specific data used by searchtools.js, + * namely the list of stopwords, stemmer, scorer and splitter. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. 
+ * + */ + +var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; + + +/* Non-minified version is copied as a separate JS file, if available */ + +/** + * Porter Stemmer + */ +var Stemmer = function() { + + var step2list = { + ational: 'ate', + tional: 'tion', + enci: 'ence', + anci: 'ance', + izer: 'ize', + bli: 'ble', + alli: 'al', + entli: 'ent', + eli: 'e', + ousli: 'ous', + ization: 'ize', + ation: 'ate', + ator: 'ate', + alism: 'al', + iveness: 'ive', + fulness: 'ful', + ousness: 'ous', + aliti: 'al', + iviti: 'ive', + biliti: 'ble', + logi: 'log' + }; + + var step3list = { + icate: 'ic', + ative: '', + alize: 'al', + iciti: 'ic', + ical: 'ic', + ful: '', + ness: '' + }; + + var c = "[^aeiou]"; // consonant + var v = "[aeiouy]"; // vowel + var C = c + "[^aeiouy]*"; // consonant sequence + var V = v + "[aeiou]*"; // vowel sequence + + var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0 + var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1 + var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1 + var s_v = "^(" + C + ")?" 
+ v; // vowel in stem + + this.stemWord = function (w) { + var stem; + var suffix; + var firstch; + var origword = w; + + if (w.length < 3) + return w; + + var re; + var re2; + var re3; + var re4; + + firstch = w.substr(0,1); + if (firstch == "y") + w = firstch.toUpperCase() + w.substr(1); + + // Step 1a + re = /^(.+?)(ss|i)es$/; + re2 = /^(.+?)([^s])s$/; + + if (re.test(w)) + w = w.replace(re,"$1$2"); + else if (re2.test(w)) + w = w.replace(re2,"$1$2"); + + // Step 1b + re = /^(.+?)eed$/; + re2 = /^(.+?)(ed|ing)$/; + if (re.test(w)) { + var fp = re.exec(w); + re = new RegExp(mgr0); + if (re.test(fp[1])) { + re = /.$/; + w = w.replace(re,""); + } + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1]; + re2 = new RegExp(s_v); + if (re2.test(stem)) { + w = stem; + re2 = /(at|bl|iz)$/; + re3 = new RegExp("([^aeiouylsz])\\1$"); + re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re2.test(w)) + w = w + "e"; + else if (re3.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + else if (re4.test(w)) + w = w + "e"; + } + } + + // Step 1c + re = /^(.+?)y$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(s_v); + if (re.test(stem)) + w = stem + "i"; + } + + // Step 2 + re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step2list[suffix]; + } + + // Step 3 + re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + suffix = fp[2]; + re = new RegExp(mgr0); + if (re.test(stem)) + w = stem + step3list[suffix]; + } + + // Step 4 + re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; + re2 = /^(.+?)(s|t)(ion)$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + if 
(re.test(stem)) + w = stem; + } + else if (re2.test(w)) { + var fp = re2.exec(w); + stem = fp[1] + fp[2]; + re2 = new RegExp(mgr1); + if (re2.test(stem)) + w = stem; + } + + // Step 5 + re = /^(.+?)e$/; + if (re.test(w)) { + var fp = re.exec(w); + stem = fp[1]; + re = new RegExp(mgr1); + re2 = new RegExp(meq1); + re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); + if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) + w = stem; + } + re = /ll$/; + re2 = new RegExp(mgr1); + if (re.test(w) && re2.test(w)) { + re = /.$/; + w = w.replace(re,""); + } + + // and turn initial Y back to y + if (firstch == "y") + w = firstch.toLowerCase() + w.substr(1); + return w; + } +} + diff --git a/_static/minus.png b/_static/minus.png new file mode 100644 index 0000000..d96755f Binary files /dev/null and b/_static/minus.png differ diff --git a/_static/plus.png b/_static/plus.png new file mode 100644 index 0000000..7107cec Binary files /dev/null and b/_static/plus.png differ diff --git a/_static/pygments.css b/_static/pygments.css new file mode 100644 index 0000000..6110e9f --- /dev/null +++ b/_static/pygments.css @@ -0,0 +1,84 @@ +pre { line-height: 125%; } +td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; } +td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; } +.highlight .hll { background-color: #ffffcc } +.highlight { background: #f8f8f8; } +.highlight .c { color: #8f5902; font-style: italic } /* Comment */ +.highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */ +.highlight .g { color: #000000 } /* Generic */ +.highlight .k { color: #204a87; font-weight: bold } /* Keyword */ +.highlight .l { color: #000000 } /* Literal */ +.highlight .n { 
color: #000000 } /* Name */ +.highlight .o { color: #ce5c00; font-weight: bold } /* Operator */ +.highlight .x { color: #000000 } /* Other */ +.highlight .p { color: #000000; font-weight: bold } /* Punctuation */ +.highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #8f5902; font-style: italic } /* Comment.Preproc */ +.highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */ +.highlight .gd { color: #a40000 } /* Generic.Deleted */ +.highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */ +.highlight .ges { color: #000000; font-weight: bold; font-style: italic } /* Generic.EmphStrong */ +.highlight .gr { color: #ef2929 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #000000; font-style: italic } /* Generic.Output */ +.highlight .gp { color: #8f5902 } /* Generic.Prompt */ +.highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */ +.highlight .kc { color: #204a87; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #204a87; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #204a87; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #204a87; font-weight: bold } /* Keyword.Pseudo */ +.highlight .kr { color: #204a87; font-weight: bold } /* Keyword.Reserved */ +.highlight .kt { color: #204a87; font-weight: bold } /* Keyword.Type */ +.highlight .ld { color: #000000 } /* Literal.Date */ +.highlight .m { 
color: #0000cf; font-weight: bold } /* Literal.Number */ +.highlight .s { color: #4e9a06 } /* Literal.String */ +.highlight .na { color: #c4a000 } /* Name.Attribute */ +.highlight .nb { color: #204a87 } /* Name.Builtin */ +.highlight .nc { color: #000000 } /* Name.Class */ +.highlight .no { color: #000000 } /* Name.Constant */ +.highlight .nd { color: #5c35cc; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #ce5c00 } /* Name.Entity */ +.highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */ +.highlight .nf { color: #000000 } /* Name.Function */ +.highlight .nl { color: #f57900 } /* Name.Label */ +.highlight .nn { color: #000000 } /* Name.Namespace */ +.highlight .nx { color: #000000 } /* Name.Other */ +.highlight .py { color: #000000 } /* Name.Property */ +.highlight .nt { color: #204a87; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #000000 } /* Name.Variable */ +.highlight .ow { color: #204a87; font-weight: bold } /* Operator.Word */ +.highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */ +.highlight .w { color: #f8f8f8 } /* Text.Whitespace */ +.highlight .mb { color: #0000cf; font-weight: bold } /* Literal.Number.Bin */ +.highlight .mf { color: #0000cf; font-weight: bold } /* Literal.Number.Float */ +.highlight .mh { color: #0000cf; font-weight: bold } /* Literal.Number.Hex */ +.highlight .mi { color: #0000cf; font-weight: bold } /* Literal.Number.Integer */ +.highlight .mo { color: #0000cf; font-weight: bold } /* Literal.Number.Oct */ +.highlight .sa { color: #4e9a06 } /* Literal.String.Affix */ +.highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */ +.highlight .sc { color: #4e9a06 } /* Literal.String.Char */ +.highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */ +.highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4e9a06 } /* Literal.String.Double */ +.highlight .se { color: #4e9a06 } /* Literal.String.Escape */ 
+.highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */ +.highlight .si { color: #4e9a06 } /* Literal.String.Interpol */ +.highlight .sx { color: #4e9a06 } /* Literal.String.Other */ +.highlight .sr { color: #4e9a06 } /* Literal.String.Regex */ +.highlight .s1 { color: #4e9a06 } /* Literal.String.Single */ +.highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */ +.highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #000000 } /* Name.Function.Magic */ +.highlight .vc { color: #000000 } /* Name.Variable.Class */ +.highlight .vg { color: #000000 } /* Name.Variable.Global */ +.highlight .vi { color: #000000 } /* Name.Variable.Instance */ +.highlight .vm { color: #000000 } /* Name.Variable.Magic */ +.highlight .il { color: #0000cf; font-weight: bold } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/_static/searchtools.js b/_static/searchtools.js new file mode 100644 index 0000000..7918c3f --- /dev/null +++ b/_static/searchtools.js @@ -0,0 +1,574 @@ +/* + * searchtools.js + * ~~~~~~~~~~~~~~~~ + * + * Sphinx JavaScript utilities for the full-text search. + * + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ +"use strict"; + +/** + * Simple result scoring code. + */ +if (typeof Scorer === "undefined") { + var Scorer = { + // Implement the following function to further tweak the score for each result + // The function takes a result array [docname, title, anchor, descr, score, filename] + // and returns the new score. 
+ /* + score: result => { + const [docname, title, anchor, descr, score, filename] = result + return score + }, + */ + + // query matches the full name of an object + objNameMatch: 11, + // or matches in the last dotted part of the object name + objPartialMatch: 6, + // Additive scores depending on the priority of the object + objPrio: { + 0: 15, // used to be importantResults + 1: 5, // used to be objectResults + 2: -5, // used to be unimportantResults + }, + // Used when the priority is not in the mapping. + objPrioDefault: 0, + + // query found in title + title: 15, + partialTitle: 7, + // query found in terms + term: 5, + partialTerm: 2, + }; +} + +const _removeChildren = (element) => { + while (element && element.lastChild) element.removeChild(element.lastChild); +}; + +/** + * See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping + */ +const _escapeRegExp = (string) => + string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + +const _displayItem = (item, searchTerms, highlightTerms) => { + const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; + const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; + const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; + const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; + + const [docName, title, anchor, descr, score, _filename] = item; + + let listItem = document.createElement("li"); + let requestUrl; + let linkUrl; + if (docBuilder === "dirhtml") { + // dirhtml builder + let dirname = docName + "/"; + if (dirname.match(/\/index\/$/)) + dirname = dirname.substring(0, dirname.length - 6); + else if (dirname === "index/") dirname = ""; + requestUrl = contentRoot + dirname; + linkUrl = requestUrl; + } else { + // normal html builders + requestUrl = contentRoot + docName + docFileSuffix; + linkUrl = docName + docLinkSuffix; + } + let linkEl = 
listItem.appendChild(document.createElement("a")); + linkEl.href = linkUrl + anchor; + linkEl.dataset.score = score; + linkEl.innerHTML = title; + if (descr) { + listItem.appendChild(document.createElement("span")).innerHTML = + " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } + else if (showSearchSummary) + fetch(requestUrl) + .then((responseData) => responseData.text()) + .then((data) => { + if (data) + listItem.appendChild( + Search.makeSearchSummary(data, searchTerms) + ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + }); + Search.output.appendChild(listItem); +}; +const _finishSearch = (resultCount) => { + Search.stopPulse(); + Search.title.innerText = _("Search Results"); + if (!resultCount) + Search.status.innerText = Documentation.gettext( + "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." + ); + else + Search.status.innerText = _( + `Search finished, found ${resultCount} page(s) matching the search query.` + ); +}; +const _displayNextItem = ( + results, + resultCount, + searchTerms, + highlightTerms, +) => { + // results left, load the summary and display it + // this is intended to be dynamic (don't sub resultsCount) + if (results.length) { + _displayItem(results.pop(), searchTerms, highlightTerms); + setTimeout( + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), + 5 + ); + } + // search finished, update title and status message + else _finishSearch(resultCount); +}; + +/** + * Default splitQuery function. Can be overridden in ``sphinx.search`` with a + * custom function per language. 
+ * + * The regular expression works by splitting the string on consecutive characters + * that are not Unicode letters, numbers, underscores, or emoji characters. + * This is the same as ``\W+`` in Python, preserving the surrogate pair area. + */ +if (typeof splitQuery === "undefined") { + var splitQuery = (query) => query + .split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu) + .filter(term => term) // remove remaining empty strings +} + +/** + * Search Module + */ +const Search = { + _index: null, + _queued_query: null, + _pulse_status: -1, + + htmlToText: (htmlString) => { + const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); + htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); + const docContent = htmlElement.querySelector('[role="main"]'); + if (docContent !== undefined) return docContent.textContent; + console.warn( + "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." 
+ ); + return ""; + }, + + init: () => { + const query = new URLSearchParams(window.location.search).get("q"); + document + .querySelectorAll('input[name="q"]') + .forEach((el) => (el.value = query)); + if (query) Search.performSearch(query); + }, + + loadIndex: (url) => + (document.body.appendChild(document.createElement("script")).src = url), + + setIndex: (index) => { + Search._index = index; + if (Search._queued_query !== null) { + const query = Search._queued_query; + Search._queued_query = null; + Search.query(query); + } + }, + + hasIndex: () => Search._index !== null, + + deferQuery: (query) => (Search._queued_query = query), + + stopPulse: () => (Search._pulse_status = -1), + + startPulse: () => { + if (Search._pulse_status >= 0) return; + + const pulse = () => { + Search._pulse_status = (Search._pulse_status + 1) % 4; + Search.dots.innerText = ".".repeat(Search._pulse_status); + if (Search._pulse_status >= 0) window.setTimeout(pulse, 500); + }; + pulse(); + }, + + /** + * perform a search for something (or wait until index is loaded) + */ + performSearch: (query) => { + // create the required interface elements + const searchText = document.createElement("h2"); + searchText.textContent = _("Searching"); + const searchSummary = document.createElement("p"); + searchSummary.classList.add("search-summary"); + searchSummary.innerText = ""; + const searchList = document.createElement("ul"); + searchList.classList.add("search"); + + const out = document.getElementById("search-results"); + Search.title = out.appendChild(searchText); + Search.dots = Search.title.appendChild(document.createElement("span")); + Search.status = out.appendChild(searchSummary); + Search.output = out.appendChild(searchList); + + const searchProgress = document.getElementById("search-progress"); + // Some themes don't use the search progress node + if (searchProgress) { + searchProgress.innerText = _("Preparing search..."); + } + Search.startPulse(); + + // index already loaded, the 
browser was quick! + if (Search.hasIndex()) Search.query(query); + else Search.deferQuery(query); + }, + + /** + * execute search (requires search index to be loaded) + */ + query: (query) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // stem the search terms and add them to the correct list + const stemmer = new Stemmer(); + const searchTerms = new Set(); + const excludedTerms = new Set(); + const highlightTerms = new Set(); + const objectTerms = new Set(splitQuery(query.toLowerCase().trim())); + splitQuery(query.trim()).forEach((queryTerm) => { + const queryTermLower = queryTerm.toLowerCase(); + + // maybe skip this "word" + // stopwords array is from language_data.js + if ( + stopwords.indexOf(queryTermLower) !== -1 || + queryTerm.match(/^\d+$/) + ) + return; + + // stem the word + let word = stemmer.stemWord(queryTermLower); + // select the correct list + if (word[0] === "-") excludedTerms.add(word.substr(1)); + else { + searchTerms.add(word); + highlightTerms.add(queryTermLower); + } + }); + + if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js + localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" ")) + } + + // console.debug("SEARCH: searching for:"); + // console.info("required: ", [...searchTerms]); + // console.info("excluded: ", [...excludedTerms]); + + // array of [docname, title, anchor, descr, score, filename] + let results = []; + _removeChildren(document.getElementById("search-progress")); + + const queryLower = query.toLowerCase(); + for (const [title, foundTitles] of Object.entries(allTitles)) { + if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { + for (const [file, id] of foundTitles) { + let score = Math.round(100 * queryLower.length / title.length) + results.push([ + docNames[file], + 
titles[file] !== title ? `${titles[file]} > ${title}` : title, + id !== null ? "#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // search for explicit entries in index directives + for (const [entry, foundEntries] of Object.entries(indexEntries)) { + if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { + for (const [file, id] of foundEntries) { + let score = Math.round(100 * queryLower.length / entry.length) + results.push([ + docNames[file], + titles[file], + id ? "#" + id : "", + null, + score, + filenames[file], + ]); + } + } + } + + // lookup as object + objectTerms.forEach((term) => + results.push(...Search.performObjectSearch(term, objectTerms)) + ); + + // lookup as search terms in fulltext + results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + + // let the scorer override scores with a custom scoring function + if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); + + // now sort the results by score (in opposite order of appearance, since the + // display function below uses pop() to retrieve items) and then + // alphabetically + results.sort((a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 
1 : -1; + }); + + // remove duplicate search results + // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept + let seen = new Set(); + results = results.reverse().reduce((acc, result) => { + let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(','); + if (!seen.has(resultStr)) { + acc.push(result); + seen.add(resultStr); + } + return acc; + }, []); + + results = results.reverse(); + + // for debugging + //Search.lastresults = results.slice(); // a copy + // console.info("search results:", Search.lastresults); + + // print the results + _displayNextItem(results, results.length, searchTerms, highlightTerms); + }, + + /** + * search for object names + */ + performObjectSearch: (object, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const objects = Search._index.objects; + const objNames = Search._index.objnames; + const titles = Search._index.titles; + + const results = []; + + const objectSearchCallback = (prefix, match) => { + const name = match[4] + const fullname = (prefix ? prefix + "." : "") + name; + const fullnameLower = fullname.toLowerCase(); + if (fullnameLower.indexOf(object) < 0) return; + + let score = 0; + const parts = fullnameLower.split("."); + + // check for different match types: exact matches of full name or + // "last name" (i.e. 
last dotted part) + if (fullnameLower === object || parts.slice(-1)[0] === object) + score += Scorer.objNameMatch; + else if (parts.slice(-1)[0].indexOf(object) > -1) + score += Scorer.objPartialMatch; // matches in last name + + const objName = objNames[match[1]][2]; + const title = titles[match[0]]; + + // If more than one term searched for, we require other words to be + // found in the name/title/description + const otherTerms = new Set(objectTerms); + otherTerms.delete(object); + if (otherTerms.size > 0) { + const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase(); + if ( + [...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0) + ) + return; + } + + let anchor = match[3]; + if (anchor === "") anchor = fullname; + else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname; + + const descr = objName + _(", in ") + title; + + // add custom score for some objects according to scorer + if (Scorer.objPrio.hasOwnProperty(match[2])) + score += Scorer.objPrio[match[2]]; + else score += Scorer.objPrioDefault; + + results.push([ + docNames[match[0]], + fullname, + "#" + anchor, + descr, + score, + filenames[match[0]], + ]); + }; + Object.keys(objects).forEach((prefix) => + objects[prefix].forEach((array) => + objectSearchCallback(prefix, array) + ) + ); + return results; + }, + + /** + * search for full-text terms in the index + */ + performTermsSearch: (searchTerms, excludedTerms) => { + // prepare search + const terms = Search._index.terms; + const titleTerms = Search._index.titleterms; + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + + const scoreMap = new Map(); + const fileMap = new Map(); + + // perform the search on the required terms + searchTerms.forEach((word) => { + const files = []; + const arr = [ + { files: terms[word], score: Scorer.term }, + { files: titleTerms[word], score: Scorer.title }, + ]; + // add support for partial matches + 
if (word.length > 2) { + const escapedWord = _escapeRegExp(word); + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord) && !terms[word]) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord) && !titleTerms[word]) + arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); + }); + } + + // no match but word was a required one + if (arr.every((record) => record.files === undefined)) return; + + // found search word in contents + arr.forEach((record) => { + if (record.files === undefined) return; + + let recordFiles = record.files; + if (recordFiles.length === undefined) recordFiles = [recordFiles]; + files.push(...recordFiles); + + // set score for the word in each file + recordFiles.forEach((file) => { + if (!scoreMap.has(file)) scoreMap.set(file, {}); + scoreMap.get(file)[word] = record.score; + }); + }); + + // create the mapping + files.forEach((file) => { + if (fileMap.has(file) && fileMap.get(file).indexOf(word) === -1) + fileMap.get(file).push(word); + else fileMap.set(file, [word]); + }); + }); + + // now check if the files don't contain excluded terms + const results = []; + for (const [file, wordList] of fileMap) { + // check if all requirements are matched + + // as search terms with length < 3 are discarded + const filteredTermCount = [...searchTerms].filter( + (term) => term.length > 2 + ).length; + if ( + wordList.length !== searchTerms.size && + wordList.length !== filteredTermCount + ) + continue; + + // ensure that none of the excluded terms is in the search result + if ( + [...excludedTerms].some( + (term) => + terms[term] === file || + titleTerms[term] === file || + (terms[term] || []).includes(file) || + (titleTerms[term] || []).includes(file) + ) + ) + break; + + // select one (max) score for the file. 
+ const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w])); + // add result to the result list + results.push([ + docNames[file], + titles[file], + "", + null, + score, + filenames[file], + ]); + } + return results; + }, + + /** + * helper function to return a node containing the + * search summary for a given text. keywords is a list + * of stemmed words. + */ + makeSearchSummary: (htmlText, keywords) => { + const text = Search.htmlToText(htmlText); + if (text === "") return null; + + const textLower = text.toLowerCase(); + const actualStartPosition = [...keywords] + .map((k) => textLower.indexOf(k.toLowerCase())) + .filter((i) => i > -1) + .slice(-1)[0]; + const startWithContext = Math.max(actualStartPosition - 120, 0); + + const top = startWithContext === 0 ? "" : "..."; + const tail = startWithContext + 240 < text.length ? "..." : ""; + + let summary = document.createElement("p"); + summary.classList.add("context"); + summary.textContent = top + text.substr(startWithContext, 240).trim() + tail; + + return summary; + }, +}; + +_ready(Search.init); diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js new file mode 100644 index 0000000..8a96c69 --- /dev/null +++ b/_static/sphinx_highlight.js @@ -0,0 +1,154 @@ +/* Highlighting utilities for Sphinx HTML documentation. */ +"use strict"; + +const SPHINX_HIGHLIGHT_ENABLED = true + +/** + * highlight a given string on a node by wrapping it in + * span elements with the given class name. 
+ */ +const _highlight = (node, addItems, text, className) => { + if (node.nodeType === Node.TEXT_NODE) { + const val = node.nodeValue; + const parent = node.parentNode; + const pos = val.toLowerCase().indexOf(text); + if ( + pos >= 0 && + !parent.classList.contains(className) && + !parent.classList.contains("nohighlight") + ) { + let span; + + const closestNode = parent.closest("body, svg, foreignObject"); + const isInSVG = closestNode && closestNode.matches("svg"); + if (isInSVG) { + span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); + } else { + span = document.createElement("span"); + span.classList.add(className); + } + + span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); + parent.insertBefore( + span, + parent.insertBefore( + rest, + node.nextSibling + ) + ); + node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. + */ + _highlight(rest, addItems, text, className); + + if (isInSVG) { + const rect = document.createElementNS( + "http://www.w3.org/2000/svg", + "rect" + ); + const bbox = parent.getBBox(); + rect.x.baseVal.value = bbox.x; + rect.y.baseVal.value = bbox.y; + rect.width.baseVal.value = bbox.width; + rect.height.baseVal.value = bbox.height; + rect.setAttribute("class", className); + addItems.push({ parent: parent, target: rect }); + } + } + } else if (node.matches && !node.matches("button, select, textarea")) { + node.childNodes.forEach((el) => _highlight(el, addItems, text, className)); + } +}; +const _highlightText = (thisNode, text, className) => { + let addItems = []; + _highlight(thisNode, addItems, text, className); + addItems.forEach((obj) => + obj.parent.insertAdjacentElement("beforebegin", obj.target) + ); +}; + +/** + * Small JavaScript module for the documentation. 
+ */ +const SphinxHighlight = { + + /** + * highlight the search words provided in localstorage in the text + */ + highlightSearchWords: () => { + if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight + + // get and clear terms from localstorage + const url = new URL(window.location); + const highlight = + localStorage.getItem("sphinx_highlight_terms") + || url.searchParams.get("highlight") + || ""; + localStorage.removeItem("sphinx_highlight_terms") + url.searchParams.delete("highlight"); + window.history.replaceState({}, "", url); + + // get individual terms from highlight string + const terms = highlight.toLowerCase().split(/\s+/).filter(x => x); + if (terms.length === 0) return; // nothing to do + + // There should never be more than one element matching "div.body" + const divBody = document.querySelectorAll("div.body"); + const body = divBody.length ? divBody[0] : document.querySelector("body"); + window.setTimeout(() => { + terms.forEach((term) => _highlightText(body, term, "highlighted")); + }, 10); + + const searchBox = document.getElementById("searchbox"); + if (searchBox === null) return; + searchBox.appendChild( + document + .createRange() + .createContextualFragment( + '" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + 
if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. + */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/cpg.html b/cpg.html new file mode 100644 index 0000000..894cd57 --- /dev/null +++ b/cpg.html @@ -0,0 +1,95 @@ + + + + + + + + CpG Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

CpG Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/dust.html b/dust.html new file mode 100644 index 0000000..2706af4 --- /dev/null +++ b/dust.html @@ -0,0 +1,95 @@ + + + + + + + + DustMasker Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

DustMasker Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/eponine.html b/eponine.html new file mode 100644 index 0000000..50e28a5 --- /dev/null +++ b/eponine.html @@ -0,0 +1,95 @@ + + + + + + + + Eponine Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Eponine Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/genblast.html b/genblast.html new file mode 100644 index 0000000..0615f59 --- /dev/null +++ b/genblast.html @@ -0,0 +1,95 @@ + + + + + + + + Genblast Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Genblast Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/genindex.html b/genindex.html new file mode 100644 index 0000000..9a05235 --- /dev/null +++ b/genindex.html @@ -0,0 +1,94 @@ + + + + + + + Index — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ + +

Index

+ +
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 0000000..aee7143 --- /dev/null +++ b/index.html @@ -0,0 +1,122 @@ + + + + + + + + Contents — ensembl-anno 0.1 documentation + + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Anno tool kit

+
+

Contents

+

Check out installation section for further information on how +to install the project.

+ +
+

Indices and tables

+ +
+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/install.html b/install.html new file mode 100644 index 0000000..7bad679 --- /dev/null +++ b/install.html @@ -0,0 +1,143 @@ + + + + + + + + API Setup and installation — ensembl-anno 0.1 documentation + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

API Setup and installation

+
+

Requirements

+

An Ensembl API checkout including:

+ +
+

Software

+
    +
  1. Python 3.8+

  2. +
  3. Bioperl 1.6.9+

  4. +
+
+
+

Python Modules

+
    +
  1. argschema

  2. +
+
+
+
+

Installation

+

Directly from GitHub:

+
1git clone https://github.com/Ensembl/ensembl-analysis -b experimental/gbiab
+2git clone https://github.com/Ensembl/ensembl-production
+3git clone https://github.com/Ensembl/ensembl-hive
+4git clone https://github.com/Ensembl/ensembl-taxonomy
+5git clone https://github.com/Ensembl/ensembl-orm
+
+
+
+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/license.html b/license.html new file mode 100644 index 0000000..c4696e1 --- /dev/null +++ b/license.html @@ -0,0 +1,280 @@ + + + + + + + + License — ensembl-anno 0.1 documentation + + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

License

+
+

Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/

+
    +
  1. Definitions.

    +

    “License” shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document.

    +

    “Licensor” shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License.

    +

    “Legal Entity” shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +“control” means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity.

    +

    “You” (or “Your”) shall mean an individual or Legal Entity +exercising permissions granted by this License.

    +

    “Source” form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files.

    +

    “Object” form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types.

    +

    “Work” shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below).

    +

    “Derivative Works” shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof.

    +

    “Contribution” shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, “submitted” +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as “Not a Contribution.”

    +

    “Contributor” shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work.

    +
  2. +
  3. Grant of Copyright License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form.

  4. +
  5. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed.

  6. +
  7. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions:

    +
      +
    1. You must give any other recipients of the Work or +Derivative Works a copy of this License; and

    2. +
    3. You must cause any modified files to carry prominent notices +stating that You changed the files; and

    4. +
    5. You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and

    6. +
    7. If the Work includes a “NOTICE” text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License.

    8. +
    +

    You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License.

    +
  8. +
  9. Submission of Contributions. Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions.

  10. +
  11. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file.

  12. +
  13. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License.

  14. +
  15. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages.

  16. +
  17. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability.

  18. +
+

END OF TERMS AND CONDITIONS

+

APPENDIX: How to apply the Apache License to your work.

+
+

To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets “{}” +replaced with your own identifying information. (Don’t include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same “printed page” as the copyright notice for easier +identification within third-party archives.

+
+

Copyright [yyyy] [name of copyright owner]

+

Licensed under the Apache License, Version 2.0 (the “License”); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at

+
+
+

Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an “AS IS” BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.

+
+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/minimap.html b/minimap.html new file mode 100644 index 0000000..9d55a8c --- /dev/null +++ b/minimap.html @@ -0,0 +1,95 @@ + + + + + + + + Minimap2 Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Minimap2 Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/objects.inv b/objects.inv new file mode 100644 index 0000000..f7bc3af Binary files /dev/null and b/objects.inv differ diff --git a/py-modindex.html b/py-modindex.html new file mode 100644 index 0000000..c7e6859 --- /dev/null +++ b/py-modindex.html @@ -0,0 +1,185 @@ + + + + + + + Python Module Index — ensembl-anno 0.1 documentation + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/red.html b/red.html new file mode 100644 index 0000000..bf16623 --- /dev/null +++ b/red.html @@ -0,0 +1,95 @@ + + + + + + + + Red Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Red Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/repeatmasker.html b/repeatmasker.html new file mode 100644 index 0000000..739b31b --- /dev/null +++ b/repeatmasker.html @@ -0,0 +1,95 @@ + + + + + + + + Repeatmasker Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Repeatmasker Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/scallop.html b/scallop.html new file mode 100644 index 0000000..97726f6 --- /dev/null +++ b/scallop.html @@ -0,0 +1,95 @@ + + + + + + + + Scallop Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Scallop Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/search.html b/search.html new file mode 100644 index 0000000..8ded71c --- /dev/null +++ b/search.html @@ -0,0 +1,123 @@ + + + + + + + Search — ensembl-anno 0.1 documentation + + + + + + + + + + + + + + + + + +
+
+
+ +
+
+
+ +

Search

+ + + + +

+ Searching for multiple words only shows matches that contain + all words. +

+ + +
+ + + +
+ + + +
+ +
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/searchindex.js b/searchindex.js new file mode 100644 index 0000000..c36b8e9 --- /dev/null +++ b/searchindex.js @@ -0,0 +1 @@ +Search.setIndex({"docnames": ["cpg", "dust", "eponine", "genblast", "index", "install", "license", "minimap", "red", "repeatmasker", "scallop", "star", "stringtie", "trf", "trnascan"], "filenames": ["cpg.rst", "dust.rst", "eponine.rst", "genblast.rst", "index.rst", "install.rst", "license.rst", "minimap.rst", "red.rst", "repeatmasker.rst", "scallop.rst", "star.rst", "stringtie.rst", "trf.rst", "trnascan.rst"], "titles": ["CpG Module Documentation", "DustMasker Module Documentation", "Eponine Module Documentation", "Genblast Module Documentation", "Contents", "API Setup and installation", "License", "Minimap2 Module Documentation", "Red Module Documentation", "Repeatmasker Module Documentation", "Scallop Module Documentation", "STAR Module Documentation", "Stringtie Module Documentation", "TRF Module Documentation", "tRNAscan-SE Module Documentation"], "terms": {"index": [], "modul": 4, "search": 4, "page": [4, 6], "anno": 4, "tool": 4, "kit": 4, "check": 4, "out": [4, 6], "instal": 4, "section": [4, 6], "further": 4, "inform": [4, 6], "how": [4, 6], "project": 4, "api": 4, "setup": 4, "licens": 4, "an": [5, 6], "ensembl": 5, "checkout": 5, "includ": [5, 6], "product": [5, 6], "analysi": 5, "dev": 5, "hive_mast": 5, "branch": 5, "taxonomi": 5, "orm": 5, "3": 5, "8": 5, "bioperl": 5, "1": [5, 6], "6": 5, "9": [5, 6], "argschema": 5, "directli": 5, "from": [5, 6], "github": 5, "git": 5, "clone": 5, "http": [5, 6], "com": 5, "b": 5, "experiment": 5, "gbiab": 5, "hive": 5, "apach": 6, "version": 6, "2": 6, "0": 6, "januari": 6, "2004": 6, "www": 6, "org": 6, "definit": 6, "shall": 6, "mean": 6, "term": 6, "condit": 6, "us": 6, "reproduct": 6, "distribut": 6, "defin": 6, "through": 6, "thi": 6, "document": 6, "licensor": 6, "copyright": 6, "owner": 6, "entiti": 6, "author": 6, "i": 6, "grant": 6, 
"legal": 6, "union": 6, "act": 6, "all": 6, "other": 6, "control": 6, "ar": 6, "under": 6, "common": 6, "For": 6, "purpos": 6, "power": 6, "direct": 6, "indirect": 6, "caus": 6, "manag": 6, "whether": 6, "contract": 6, "otherwis": 6, "ii": 6, "ownership": 6, "fifti": 6, "percent": 6, "50": 6, "more": 6, "outstand": 6, "share": 6, "iii": 6, "benefici": 6, "you": 6, "your": 6, "individu": 6, "exercis": 6, "permiss": 6, "sourc": 6, "form": 6, "prefer": 6, "make": 6, "modif": 6, "limit": 6, "softwar": 6, "code": 6, "configur": 6, "file": 6, "object": 6, "ani": 6, "result": 6, "mechan": 6, "transform": 6, "translat": 6, "compil": 6, "gener": 6, "convers": 6, "media": 6, "type": 6, "work": 6, "authorship": 6, "made": 6, "avail": 6, "indic": 6, "notic": 6, "attach": 6, "exampl": 6, "provid": 6, "appendix": 6, "below": 6, "deriv": 6, "base": 6, "which": 6, "editori": 6, "revis": 6, "annot": 6, "elabor": 6, "repres": 6, "whole": 6, "origin": 6, "remain": 6, "separ": 6, "mere": 6, "link": 6, "bind": 6, "name": 6, "interfac": 6, "thereof": 6, "contribut": 6, "addit": 6, "intention": 6, "submit": 6, "inclus": 6, "behalf": 6, "electron": 6, "verbal": 6, "written": 6, "commun": 6, "sent": 6, "its": 6, "mail": 6, "list": 6, "system": 6, "issu": 6, "track": 6, "discuss": 6, "improv": 6, "exclud": 6, "conspicu": 6, "mark": 6, "design": 6, "write": 6, "Not": 6, "contributor": 6, "whom": 6, "ha": 6, "been": 6, "receiv": 6, "subsequ": 6, "incorpor": 6, "within": 6, "subject": 6, "each": 6, "herebi": 6, "perpetu": 6, "worldwid": 6, "non": 6, "exclus": 6, "charg": 6, "royalti": 6, "free": 6, "irrevoc": 6, "reproduc": 6, "prepar": 6, "publicli": 6, "displai": 6, "perform": 6, "sublicens": 6, "patent": 6, "except": 6, "state": 6, "have": 6, "offer": 6, "sell": 6, "import": 6, "transfer": 6, "where": 6, "appli": 6, "onli": 6, "those": 6, "claim": 6, "necessarili": 6, "infring": 6, "": 6, "alon": 6, "combin": 6, "wa": 6, "If": 6, "institut": 6, "litig": 6, "against": 6, "cross": 6, 
"counterclaim": 6, "lawsuit": 6, "alleg": 6, "constitut": 6, "contributori": 6, "termin": 6, "date": 6, "redistribut": 6, "mai": 6, "copi": 6, "medium": 6, "without": 6, "meet": 6, "follow": 6, "must": 6, "give": 6, "recipi": 6, "modifi": 6, "carri": 6, "promin": 6, "chang": 6, "retain": 6, "trademark": 6, "attribut": 6, "do": 6, "pertain": 6, "part": 6, "text": 6, "readabl": 6, "contain": 6, "least": 6, "one": 6, "place": 6, "along": 6, "wherev": 6, "third": 6, "parti": 6, "normal": 6, "appear": 6, "The": 6, "content": 6, "add": 6, "own": 6, "alongsid": 6, "addendum": 6, "cannot": 6, "constru": 6, "statement": 6, "differ": 6, "compli": 6, "submiss": 6, "unless": 6, "explicitli": 6, "notwithstand": 6, "abov": 6, "noth": 6, "herein": 6, "supersed": 6, "agreement": 6, "execut": 6, "regard": 6, "doe": 6, "trade": 6, "servic": 6, "requir": [4, 6], "reason": 6, "customari": 6, "describ": 6, "disclaim": 6, "warranti": 6, "applic": 6, "law": 6, "agre": 6, "AS": 6, "basi": 6, "OR": 6, "OF": 6, "kind": 6, "either": 6, "express": 6, "impli": 6, "titl": 6, "merchant": 6, "fit": 6, "FOR": 6, "A": 6, "particular": 6, "sole": 6, "respons": 6, "determin": 6, "appropri": 6, "assum": 6, "risk": 6, "associ": 6, "liabil": 6, "In": 6, "event": 6, "theori": 6, "tort": 6, "neglig": 6, "deliber": 6, "grossli": 6, "liabl": 6, "damag": 6, "special": 6, "incident": 6, "consequenti": 6, "charact": 6, "aris": 6, "inabl": 6, "loss": 6, "goodwil": 6, "stoppag": 6, "comput": 6, "failur": 6, "malfunct": 6, "commerci": 6, "even": 6, "advis": 6, "possibl": 6, "accept": 6, "while": 6, "choos": 6, "fee": 6, "support": 6, "indemn": 6, "oblig": 6, "right": 6, "consist": 6, "howev": 6, "indemnifi": 6, "defend": 6, "hold": 6, "harmless": 6, "incur": 6, "assert": 6, "end": 6, "AND": 6, "To": 6, "boilerpl": 6, "field": 6, "enclos": 6, "bracket": 6, "replac": 6, "identifi": 6, "don": 6, "t": 6, "should": 6, "comment": 6, "syntax": 6, "format": 6, "we": 6, "also": 6, "recommend": 6, "class": 6, "descript": 
6, "same": 6, "print": 6, "easier": 6, "identif": 6, "archiv": 6, "yyyi": 6, "complianc": 6, "obtain": 6, "see": 6, "specif": 6, "languag": 6, "govern": 6, "function": [], "run": [], "assembl": [], "short": [], "read": [], "data": [], "pertea": [], "m": [], "gm": [], "antonescu": [], "cm": [], "tc": [], "mendel": [], "jt": [], "salzberg": [], "sl": [], "enabl": [], "reconstruct": [], "transcriptom": [], "rna": [], "seq": [], "natur": [], "biotechnologi": [], "2015": [], "doi": [], "10": [], "1038": [], "nbt": [], "3122": [], "stringti": [], "python": [], "run_stringti": [], "fast": [], "highli": [], "effici": [], "align": [], "potenti": [], "transcript": [], "It": [], "novel": [], "network": [], "flow": [], "algorithm": [], "well": [], "option": [], "de": [], "novo": [], "assembli": [], "step": [], "quantit": [], "full": [], "length": [], "multipl": [], "splice": [], "variant": [], "gene": [], "locu": [], "transcriptomic_annot": [], "output_dir": [], "path": [], "stringtie_bin": [], "posixpath": [], "num_thread": [], "int": [], "none": [], "param": [], "directori": [], "number": [], "thread": [], "No": [], "inputschema": [], "strsequenceorset": [], "mani": [], "bool": [], "fals": [], "context": [], "dict": [], "load_onli": [], "dump_onli": [], "partial": [], "unknown": [], "str": [], "input": [], "argument": [], "expect": [], "set": [], "discrimin": [], "can": [], "recogn": [], "structur": [], "composit": [], "featur": [], "island": [], "promot": [], "region": [], "first": [], "donor": [], "site": [], "davuluri": [], "rv": [], "gross": [], "zhang": [], "mq": [], "exon": [], "human": [], "genom": [], "nat": [], "genet": [], "2001": [], "29": [], "4": [], "412": [], "417": [], "pmid": [], "11726928": [], "simple_feature_annot": [], "run_cpg": [], "genome_fil": [], "pathlik": [], "cpg_bin": [], "cpg_lh": [], "cpg_min_length": [], "400": [], "cpg_min_gc_cont": [], "cpg_min_o": [], "float": [], "slice": [], "min": [], "gc": [], "frequenc": [], "percentag": [], "ratio": 
[], "observ": [], "cpgo": [], "e": [], "program": [], "mask": [], "low": [], "complex": [], "new": [], "dust": [], "morguli": [], "gertz": [], "em": [], "schaffer": [], "aa": [], "agarwala": [], "r": [], "symmetr": [], "implement": [], "dna": [], "sequenc": [], "repeat_annot": [], "run_dust": [], "dust_bin": [], "mutiprocess": [], "probabilist": [], "method": [], "detect": [], "start": [], "tss": [], "mammalian": [], "good": [], "excel": [], "posit": [], "accuraci": [], "down": [], "ta": [], "hubbard": [], "tj": [], "locat": [], "re": [], "2002": [], "mar": [], "12": [], "458": [], "61": [], "1101": [], "gr": [], "216102": [], "11875034": [], "pmcid": [], "pmc155284": [], "run_eponin": [], "java_bin": [], "java": [], "eponine_bin": [], "hp": [], "user": [], "ensw": [], "c8": [], "mar21": [], "sandybridg": [], "linuxbrew": [], "opt": [], "libexec": [], "scan": [], "jar": [], "eponine_threshold": [], "999": [], "homolog": [], "databas": [], "One": [], "kei": [], "flexibl": [], "handl": [], "compar": [], "task": [], "accur": [], "when": [], "undergon": [], "signific": [], "evolutionari": [], "capabl": [], "valuabl": [], "resourc": [], "research": [], "studi": [], "evolut": [], "famili": [], "across": [], "divers": [], "speci": [], "wide": [], "variou": [], "analys": [], "standalon": [], "command": [], "line": [], "bioinformat": [], "pipelin": [], "often": [], "reli": [], "sensit": [], "homologi": [], "insight": [], "relationship": [], "conserv": [], "organ": [], "she": [], "chu": [], "j": [], "uyar": [], "wang": [], "k": [], "chen": [], "n": [], "2011": [], "genblasta": [], "blast": [], "21": [], "5": [], "936": [], "949": [], "protein_annot": [], "run_genblast": [], "masked_genom": [], "protein_dataset": [], "max_intron_length": [], "genblast_timeout_sec": [], "10800": [], "genblast_bin": [], "convert2blastmask_bin": [], "convert2blastmask": [], "makeblastdb_bin": [], "makeblastdb": [], "protein_set": [], "uniprot": [], "orthodb": [], "protein": [], "dataset": [], 
"time": [], "timeout": [], "sec": [], "maximum": [], "intron": [], "genblast_timeout": [], "second": [], "cmsearch": [], "eponin": [], "genblast": [], "minimap2": [], "run_minimap2": [], "red": [], "run_r": [], "repeatmask": [], "run_repeatmask": [], "scallop": [], "run_scallop": [], "star": [], "run_star": [], "subsample_transcriptomic_data": [], "trf": [], "run_trf": [], "trnascan": [], "se": [], "run_trnascan": [], "pairwis": [], "nucleotid": [], "versatil": [], "strategi": [], "quickli": [], "find": [], "approxim": [], "match": [], "between": [], "allow": [], "long": [], "refer": [], "li": [], "h": [], "2018": [], "34": [], "18": [], "3094": [], "3100": [], "minimap": [], "long_read_fastq_dir": [], "minimap2_bin": [], "paftools_bin": [], "paftool": [], "100000": [], "default": [], "pacbio": [], "size": [], "repeat": [], "label": [], "train": [], "itself": [], "automat": [], "entir": [], "girgi": [], "z": [], "intellig": [], "rapid": [], "scale": [], "bmc": [], "16": [], "227": [], "1186": [], "s12859": [], "015": [], "0654": [], "red_bin": [], "paramet": [], "return": [], "screen": [], "interspers": [], "smit": [], "afa": [], "hublei": [], "green": [], "p": [], "open": [], "repeatmasker_bin": [], "librari": [], "repeatmasker_engin": [], "rmblast": [], "store": [], "final": [], "gtf": [], "repeatmasker_output": [], "repeatmasker_path": [], "custom": [], "output": [], "high": [], "quantif": [], "larg": [], "precis": [], "estim": [], "abund": [], "approach": [], "quantifi": [], "level": [], "shao": [], "kingsford": [], "c": [], "phase": [], "preserv": [], "graph": [], "decomposit": [], "biotechnol": [], "2017": [], "dec": [], "35": [], "1167": [], "1169": [], "4020": [], "epub": [], "nov": [], "13": [], "29131147": [], "pmc5722698": [], "scallop_path": [], "stringtie_path": [], "main_output_dir": [], "dobin": [], "davi": [], "ca": [], "schlesing": [], "f": [], "et": [], "al": [], "ultrafast": [], "univers": [], "2013": [], "15": [], "1093": [], "bts635": [], 
"short_read_fastq_dir": [], "delete_pre_trim_fastq": [], "trim_fastq": [], "max_reads_per_sampl": [], "star_bin": [], "samtools_bin": [], "samtool": [], "trim_galore_bin": [], "trim_galor": [], "delet": [], "fastq": [], "after": [], "trim": [], "trimgalor": [], "max": [], "per": [], "sampl": [], "unlimit": [], "fastq_file_list": [], "subsampl": [], "pair": [], "process": [], "tandem": [], "finder": [], "benson": [], "g": [], "analyz": [], "nucleic": [], "acid": [], "1999": [], "27": [], "573": [], "580": [], "nar": [], "trf_bin": [], "match_scor": [], "mismatch_scor": [], "delta": [], "7": [], "pm": [], "80": [], "pi": [], "minscor": [], "40": [], "maxperiod": [], "500": [], "weight": [], "mismatch": [], "penalti": [], "indel": [], "probabl": [], "minimum": [], "score": [], "report": [], "period": [], "99": [], "100": [], "less": [], "than": [], "gigabas": [], "tm": [], "eddi": [], "sr": [], "1997": [], "25": [], "955": [], "64": [], "9023104": [], "snc_rna_annot": [], "trnascan_bin": [], "trnascan_filt": [], "eukhighconfidencefilt": [], "filter": [], "cpg": [], "dustmask": [], "scallop_bin": [], "prlimit_bin": [], "prlimit": [], "memory_limit": [], "42949672960": [], "memori": [], "1024": []}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"welcom": [], "ensembl": [], "anno": [], "": [], "document": [0, 1, 2, 3, 7, 8, 9, 10, 11, 12, 13, 14], "indic": 4, "tabl": 4, "content": 4, "index": 4, "api": 5, "setup": 5, "instal": 5, "requir": 5, "softwar": 5, "python": 5, "modul": [0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 12, 13, 14], "licens": 6, "stringti": 12, "cmsearch": [], "cpg": 0, "dustmask": 1, "eponin": 2, "genblast": 3, "minimap2": 7, "red": 8, "repeatmask": 9, "scallop": 10, "star": 11, "trf": 13, "trnascan": 14, "se": 14}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 
4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 60}, "alltitles": {"CpG Module Documentation": [[0, "cpg-module-documentation"]], "DustMasker Module Documentation": [[1, "dustmasker-module-documentation"]], "Eponine Module Documentation": [[2, "eponine-module-documentation"]], "Genblast Module Documentation": [[3, "genblast-module-documentation"]], "Contents": [[4, "contents"]], "Index": [[4, null]], "Indices and tables": [[4, "indices-and-tables"]], "API Setup and installation": [[5, "api-setup-and-installation"]], "Requirements": [[5, "requirements"]], "Software": [[5, "software"]], "Python Modules": [[5, "python-modules"]], "Installation": [[5, "installation"]], "License": [[6, "license"]], "Minimap2 Module Documentation": [[7, "minimap2-module-documentation"]], "Red Module Documentation": [[8, "red-module-documentation"]], "Repeatmasker Module Documentation": [[9, "repeatmasker-module-documentation"]], "Scallop Module Documentation": [[10, "scallop-module-documentation"]], "STAR Module Documentation": [[11, "star-module-documentation"]], "Stringtie Module Documentation": [[12, "stringtie-module-documentation"]], "TRF Module Documentation": [[13, "trf-module-documentation"]], "tRNAscan-SE Module Documentation": [[14, "trnascan-se-module-documentation"]]}, "indexentries": {}}) \ No newline at end of file diff --git a/star.html b/star.html new file mode 100644 index 0000000..ec03331 --- /dev/null +++ b/star.html @@ -0,0 +1,95 @@ + + + + + + + + STAR Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

STAR Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/stringtie.html b/stringtie.html new file mode 100644 index 0000000..d8bf948 --- /dev/null +++ b/stringtie.html @@ -0,0 +1,95 @@ + + + + + + + + Stringtie Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

Stringtie Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/trf.html b/trf.html new file mode 100644 index 0000000..9636f16 --- /dev/null +++ b/trf.html @@ -0,0 +1,95 @@ + + + + + + + + TRF Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

TRF Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file diff --git a/trnascan.html b/trnascan.html new file mode 100644 index 0000000..5137cb5 --- /dev/null +++ b/trnascan.html @@ -0,0 +1,95 @@ + + + + + + + + tRNAscan-SE Module Documentation — ensembl-anno 0.1 documentation + + + + + + + + + + + +
+
+
+ +
+
+
+ +
+

tRNAscan-SE Module Documentation

+
+ + +
+
+
+
+
+ +
+
+
+ + + + + \ No newline at end of file