From e641cf910568d204a421e65625ec000255407571 Mon Sep 17 00:00:00 2001 From: Luis Yanes Date: Mon, 12 Apr 2021 14:48:27 +0100 Subject: [PATCH] Issue 401 daijin class2 (#402) Coordinate changes to the configuration files with daijin - The parameter for the assembler 'class' has changed in the configuration file to 'class2' but this was not picked up by changes to daijin. - Removes a noisy print command in the BED12 parser. - Update the tutorial and daijin's CLI. - Update daijin.yaml in accord to Daijin's class2 program loading. Closes #401 --- Mikado/daijin/__init__.py | 2 +- Mikado/daijin/assemble.smk | 32 +++++++++++++++---------------- Mikado/parsers/bed12.py | 1 - docs/Tutorial/Daijin_tutorial.rst | 20 +++++++++---------- docs/Tutorial/daijin.yaml | 4 ++-- docs/Usage/Daijin.rst | 2 +- 6 files changed, 30 insertions(+), 31 deletions(-) diff --git a/Mikado/daijin/__init__.py b/Mikado/daijin/__init__.py index 251d96b57..f8a94b5dd 100644 --- a/Mikado/daijin/__init__.py +++ b/Mikado/daijin/__init__.py @@ -189,7 +189,7 @@ def create_config_parser(): required=False, default=[], nargs="*", help="Aligner(s) to use for long reads. Choices: %(choices)s") parser.add_argument("-as", "--assemblers", dest="asm_methods", required=False, - choices=["class", "cufflinks", "stringtie", "trinity", "scallop"], + choices=["class2", "cufflinks", "stringtie", "trinity", "scallop"], default=[], nargs="*", help="Assembler(s) to use for the analysis. Choices: %(choices)s") mikado = parser.add_argument_group("Options related to the Mikado phase of the pipeline.") # scoring = parser.add_argument_group("Options related to the scoring system") diff --git a/Mikado/daijin/assemble.smk b/Mikado/daijin/assemble.smk index beffe95e6..d6054fd6b 100644 --- a/Mikado/daijin/assemble.smk +++ b/Mikado/daijin/assemble.smk @@ -168,7 +168,7 @@ def makeAsmRunArray(assembler): CUFFLINKS_RUNS = makeAsmRunArray("cufflinks") TRINITY_RUNS = makeAsmRunArray("trinity") STRINGTIE_RUNS = makeAsmRunArray("stringtie") -CLASS_RUNS = makeAsmRunArray("class") +CLASS_RUNS = makeAsmRunArray("class2") SCALLOP_RUNS = makeAsmRunArray("scallop") SAMPLE_STR = ",".join(SAMPLES) @@ -181,12 +181,12 @@ for a in ALIGN_RUNS : PORTCULLIS_IN[a] = os.path.join(ALIGN_DIR, "output", a + ".sorted.bam") aln_abrv = {"tophat":"tph", "star":"sta", "gsnap":"gsp", "hisat":"hst", "gmap":"gmp"} -asm_abrv = {"cufflinks":"cuf", "stringtie":"stn", "class":"cls", "trinity":"trn", "scallop": "scl"} +asm_abrv = {"cufflinks":"cuf", "stringtie":"stn", "class2":"cls", "trinity":"trn", "scallop": "scl"} GTF_ASSEMBLY_METHODS = [] GFF_ASSEMBLY_METHODS = [] for asm in ASSEMBLY_METHODS: - if asm in ("cufflinks", "stringtie", "class", "scallop"): + if asm in ("cufflinks", "stringtie", "class2", "scallop"): GTF_ASSEMBLY_METHODS.append(asm) elif asm == "trinity": GFF_ASSEMBLY_METHODS.append(asm) @@ -447,7 +447,7 @@ def gmap_intron_lengths(command, MAX_INTRON): # Rules localrules: mikado_cfg, tophat_all, gsnap_all, gsnap_index, star_all, hisat_all, cufflinks_all, trinity_all, - stringtie_all, class_all, lr_gmap_all, lr_star_all, clean, align_all, lreads_all, asm_all, all + stringtie_all, class2_all, lr_gmap_all, lr_star_all, clean, align_all, lreads_all, asm_all, all rule all: input: @@ -868,28 +868,28 @@ rule stringtie_all: output: os.path.join(ASM_DIR, "stringtie.done") shell: "touch {output}" -rule asm_class: +rule asm_class2: input: bam=os.path.join(ALIGN_DIR, "output", "{alrun}.sorted.bam"), align=rules.align_all.output, ref=rules.uncompress_genome.output.genome output: - link=os.path.join(ASM_DIR, "output", r"class-{run2,\d+}-{alrun}.gtf"), - gtf=os.path.join(ASM_DIR, "class", r"class-{run2,\d+}-{alrun}", r"class-{run2}-{alrun}.gtf") + link=os.path.join(ASM_DIR, "output", r"class2-{run2,\d+}-{alrun}.gtf"), + gtf=os.path.join(ASM_DIR, "class2", r"class2-{run2,\d+}-{alrun}", r"class2-{run2}-{alrun}.gtf") params: - outdir=os.path.join(ASM_DIR, "class", "class-{run2}-{alrun}"), - load=loadPre(config, "class"), - extra=lambda wildcards: config["asm_methods"]["class"][int(wildcards.run2)], - link_src=os.path.join("..", "class", "class-{run2}-{alrun}", "class-{run2}-{alrun}.gtf") - log: os.path.join(ASM_DIR, "logs", "class", "class-{run2}-{alrun}.log") + outdir=os.path.join(ASM_DIR, "class2", "class2-{run2}-{alrun}"), + load=loadPre(config, "class2"), + extra=lambda wildcards: config["asm_methods"]["class2"][int(wildcards.run2)], + link_src=os.path.join("..", "class2", "class2-{run2}-{alrun}", "class2-{run2}-{alrun}.gtf") + log: os.path.join(ASM_DIR, "logs", "class2", "class2-{run2}-{alrun}.log") threads: THREADS - message: "Using class to assemble (run {wildcards.run2}): {input.bam}" + message: "Using class2 to assemble (run {wildcards.run2}): {input.bam}" shell: "{params.load} {CLASS} --clean --force -c \"{params.extra}\" -p {threads} {input.bam} > {output.gtf} \ 2> {log} && ln -sf {params.link_src} {output.link} && touch -h {output.link}" -rule class_all: - input: expand(os.path.join(ASM_DIR, "output", "class-{run2}-{alrun}.gtf"), run2=CLASS_RUNS, alrun=ALIGN_RUNS) - output: os.path.join(ASM_DIR, "class.done") +rule class2_all: + input: expand(os.path.join(ASM_DIR, "output", "class2-{run2}-{alrun}.gtf"), run2=CLASS_RUNS, alrun=ALIGN_RUNS) + output: os.path.join(ASM_DIR, "class2.done") shell: "touch {output}" rule lr_gmap: diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py index d9482f4c6..2220519f0 100644 --- a/Mikado/parsers/bed12.py +++ b/Mikado/parsers/bed12.py @@ -503,7 +503,6 @@ def _parse_attributes(self, attributes): """ self.attribute_order = [] - print("Parsing", attributes) infolist = self._attribute_pattern.findall(attributes.rstrip().rstrip(";")) diff --git a/docs/Tutorial/Daijin_tutorial.rst b/docs/Tutorial/Daijin_tutorial.rst index 07c804105..96cabf9d6 100644 --- a/docs/Tutorial/Daijin_tutorial.rst +++ b/docs/Tutorial/Daijin_tutorial.rst @@ -199,7 +199,7 @@ Now we will configure Daijin for the run:: -m permissive --sample-sheet sample_sheet.tsv \ --flank 500 -i 50 26000 --threads 2 \ --genome Reference/Drosophila_melanogaster.BDGP6.dna.toplevel.fa \ - -al hisat -as class stringtie -od Dmelanogaster --name Dmelanogaster \ + -al hisat -as class2 stringtie -od Dmelanogaster --name Dmelanogaster \ -o daijin.toml --prot-db Reference/Aedes_aegypti.fasta; This will create three files in the working directory: @@ -211,7 +211,7 @@ This will create three files in the working directory: .. code-block:: yaml blast: 'source blast-2.3.0' - class: 'source class-2.12' + class2: 'source class-2.12' cufflinks: '' gmap: '' hisat: 'source HISAT-2.0.4' @@ -256,15 +256,15 @@ You can also ask Daijin to display the steps to be executed, inclusive of their When Daijin is finished, have a look inside the folder Dmelanogaster/3-assemblies/output/; you will find the following GTF files: -* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662533-0.gtf -* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662534-0.gtf +* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662533-0.gtf +* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662534-0.gtf * Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662533-0.gtf * Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662534-0.gtf These are standard `GTF files `_ reporting the assembled transcripts for each method. We can have a feel for how they compare with our reference annotation by, again, using ``mikado util stats``. Conveniently, Daijin has already performed this analysis for us, and the files will be present in the same folder: -* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662533-0.gtf.stats -* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662534-0.gtf.stats +* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662533-0.gtf.stats +* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662534-0.gtf.stats * Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662533-0.gtf.stats * Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662534-0.gtf.stats @@ -273,9 +273,9 @@ Daijin has also created a summary of these statistics in Dmelanogaster/3-assembl +------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+ | File | genes | monoexonic_genes | transcripts | transcripts_per_gene | transcript_len_mean | monoexonic_transcripts | exons | exons_per_transcript | exon_len_mean | +==========================================+=========+====================+===============+========================+=======================+==========================+=========+========================+=================+ -| class-0-hisat-ERR1662533-0.gtf.stats | 11535 | 852 | 13958 | 1.2 | 1429.86 | 852 | 57554 | 4.12 | 346.77 | +| class2-0-hisat-ERR1662533-0.gtf.stats | 11535 | 852 | 13958 | 1.2 | 1429.86 | 852 | 57554 | 4.12 | 346.77 | +------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+ -| class-0-hisat-ERR1662534-0.gtf.stats | 11113 | 884 | 13341 | 1.19 | 1448.07 | 884 | 55123 | 4.13 | 350.47 | +| class2-0-hisat-ERR1662534-0.gtf.stats | 11113 | 884 | 13341 | 1.19 | 1448.07 | 884 | 55123 | 4.13 | 350.47 | +------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+ | stringtie-0-hisat-ERR1662533-0.gtf.stats | 15490 | 6252 | 18995 | 1.22 | 1668.17 | 6446 | 63798 | 3.36 | 496.67 | +------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+ @@ -292,8 +292,8 @@ We can assess the similarity of these assemblies to the reference annotation by mkdir -p Comparisons; mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf --index -l Reference/index.log; # Compare the CLASS2 assemblies against the reference - mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class_ERR1662533.log -o Comparisons/class_ERR1662533 -p Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662533-0.gtf; - mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class_ERR1662534.log -o Comparisons/class_ERR1662534 -p Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662534-0.gtf; + mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class2_ERR1662533.log -o Comparisons/class2_ERR1662533 -p Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662533-0.gtf; + mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class2_ERR1662534.log -o Comparisons/class2_ERR1662534 -p Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662534-0.gtf; # Compare the StringTie assemblies against the reference: mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/stringtie_ERR1662534.log -o Comparisons/stringtie_ERR1662534 -p Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662534-0.gtf; mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/stringtie_ERR1662533.log -o Comparisons/stringtie_ERR1662533 -p Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662533-0.gtf; diff --git a/docs/Tutorial/daijin.yaml b/docs/Tutorial/daijin.yaml index 21318c2c0..1372ccd95 100644 --- a/docs/Tutorial/daijin.yaml +++ b/docs/Tutorial/daijin.yaml @@ -6,7 +6,7 @@ align_methods: hisat: - '' asm_methods: - class: + class2: - '' stringtie: - '' @@ -23,7 +23,7 @@ load: # Commands to use to load/select the versions of the programs to use. Leave an empty # string if no loading is necessary. blast: 'source blast+-2.3.0' - class: 'source class-2.12' + class2: 'source class-2.12' cufflinks: '' gmap: '' hisat: 'source HISAT-2.0.4' diff --git a/docs/Usage/Daijin.rst b/docs/Usage/Daijin.rst index 272f9b6a3..97c4dc9d0 100644 --- a/docs/Usage/Daijin.rst +++ b/docs/Usage/Daijin.rst @@ -159,7 +159,7 @@ Daijin can be configured so to run each assembler and/or aligner multiple times, - '' - "-k 10" asm_methods: - class: + class2: - '' - "--set-cover" stringtie: