Skip to content

Commit

Permalink
Issue 401 daijin class2 (#402)
Browse files Browse the repository at this point in the history
Coordinate changes to the configuration files with daijin
- The parameter for the assembler 'class' has changed in the configuration file to 'class2' but this was not picked up by changes to daijin.
- Removes a noisy print command in the BED12 parser.
- Update the tutorial and daijin's CLI.
- Update daijin.yaml in accord to Daijin's class2 program loading.

Closes #401
  • Loading branch information
ljyanesm authored Apr 12, 2021
1 parent b0d1edc commit e641cf9
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 31 deletions.
2 changes: 1 addition & 1 deletion Mikado/daijin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ def create_config_parser():
required=False, default=[], nargs="*",
help="Aligner(s) to use for long reads. Choices: %(choices)s")
parser.add_argument("-as", "--assemblers", dest="asm_methods", required=False,
choices=["class", "cufflinks", "stringtie", "trinity", "scallop"],
choices=["class2", "cufflinks", "stringtie", "trinity", "scallop"],
default=[], nargs="*", help="Assembler(s) to use for the analysis. Choices: %(choices)s")
mikado = parser.add_argument_group("Options related to the Mikado phase of the pipeline.")
# scoring = parser.add_argument_group("Options related to the scoring system")
Expand Down
32 changes: 16 additions & 16 deletions Mikado/daijin/assemble.smk
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def makeAsmRunArray(assembler):
CUFFLINKS_RUNS = makeAsmRunArray("cufflinks")
TRINITY_RUNS = makeAsmRunArray("trinity")
STRINGTIE_RUNS = makeAsmRunArray("stringtie")
CLASS_RUNS = makeAsmRunArray("class")
CLASS_RUNS = makeAsmRunArray("class2")
SCALLOP_RUNS = makeAsmRunArray("scallop")

SAMPLE_STR = ",".join(SAMPLES)
Expand All @@ -181,12 +181,12 @@ for a in ALIGN_RUNS :
PORTCULLIS_IN[a] = os.path.join(ALIGN_DIR, "output", a + ".sorted.bam")

aln_abrv = {"tophat":"tph", "star":"sta", "gsnap":"gsp", "hisat":"hst", "gmap":"gmp"}
asm_abrv = {"cufflinks":"cuf", "stringtie":"stn", "class":"cls", "trinity":"trn", "scallop": "scl"}
asm_abrv = {"cufflinks":"cuf", "stringtie":"stn", "class2":"cls", "trinity":"trn", "scallop": "scl"}

GTF_ASSEMBLY_METHODS = []
GFF_ASSEMBLY_METHODS = []
for asm in ASSEMBLY_METHODS:
if asm in ("cufflinks", "stringtie", "class", "scallop"):
if asm in ("cufflinks", "stringtie", "class2", "scallop"):
GTF_ASSEMBLY_METHODS.append(asm)
elif asm == "trinity":
GFF_ASSEMBLY_METHODS.append(asm)
Expand Down Expand Up @@ -447,7 +447,7 @@ def gmap_intron_lengths(command, MAX_INTRON):
# Rules

localrules: mikado_cfg, tophat_all, gsnap_all, gsnap_index, star_all, hisat_all, cufflinks_all, trinity_all,
stringtie_all, class_all, lr_gmap_all, lr_star_all, clean, align_all, lreads_all, asm_all, all
stringtie_all, class2_all, lr_gmap_all, lr_star_all, clean, align_all, lreads_all, asm_all, all

rule all:
input:
Expand Down Expand Up @@ -868,28 +868,28 @@ rule stringtie_all:
output: os.path.join(ASM_DIR, "stringtie.done")
shell: "touch {output}"

rule asm_class:
rule asm_class2:
input:
bam=os.path.join(ALIGN_DIR, "output", "{alrun}.sorted.bam"),
align=rules.align_all.output,
ref=rules.uncompress_genome.output.genome
output:
link=os.path.join(ASM_DIR, "output", r"class-{run2,\d+}-{alrun}.gtf"),
gtf=os.path.join(ASM_DIR, "class", r"class-{run2,\d+}-{alrun}", r"class-{run2}-{alrun}.gtf")
link=os.path.join(ASM_DIR, "output", r"class2-{run2,\d+}-{alrun}.gtf"),
gtf=os.path.join(ASM_DIR, "class2", r"class2-{run2,\d+}-{alrun}", r"class2-{run2}-{alrun}.gtf")
params:
outdir=os.path.join(ASM_DIR, "class", "class-{run2}-{alrun}"),
load=loadPre(config, "class"),
extra=lambda wildcards: config["asm_methods"]["class"][int(wildcards.run2)],
link_src=os.path.join("..", "class", "class-{run2}-{alrun}", "class-{run2}-{alrun}.gtf")
log: os.path.join(ASM_DIR, "logs", "class", "class-{run2}-{alrun}.log")
outdir=os.path.join(ASM_DIR, "class2", "class2-{run2}-{alrun}"),
load=loadPre(config, "class2"),
extra=lambda wildcards: config["asm_methods"]["class2"][int(wildcards.run2)],
link_src=os.path.join("..", "class2", "class2-{run2}-{alrun}", "class2-{run2}-{alrun}.gtf")
log: os.path.join(ASM_DIR, "logs", "class2", "class2-{run2}-{alrun}.log")
threads: THREADS
message: "Using class to assemble (run {wildcards.run2}): {input.bam}"
message: "Using class2 to assemble (run {wildcards.run2}): {input.bam}"
shell: "{params.load} {CLASS} --clean --force -c \"{params.extra}\" -p {threads} {input.bam} > {output.gtf} \
2> {log} && ln -sf {params.link_src} {output.link} && touch -h {output.link}"

rule class_all:
input: expand(os.path.join(ASM_DIR, "output", "class-{run2}-{alrun}.gtf"), run2=CLASS_RUNS, alrun=ALIGN_RUNS)
output: os.path.join(ASM_DIR, "class.done")
rule class2_all:
input: expand(os.path.join(ASM_DIR, "output", "class2-{run2}-{alrun}.gtf"), run2=CLASS_RUNS, alrun=ALIGN_RUNS)
output: os.path.join(ASM_DIR, "class2.done")
shell: "touch {output}"

rule lr_gmap:
Expand Down
1 change: 0 additions & 1 deletion Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,6 @@ def _parse_attributes(self, attributes):
"""

self.attribute_order = []
print("Parsing", attributes)

infolist = self._attribute_pattern.findall(attributes.rstrip().rstrip(";"))

Expand Down
20 changes: 10 additions & 10 deletions docs/Tutorial/Daijin_tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ Now we will configure Daijin for the run::
-m permissive --sample-sheet sample_sheet.tsv \
--flank 500 -i 50 26000 --threads 2 \
--genome Reference/Drosophila_melanogaster.BDGP6.dna.toplevel.fa \
-al hisat -as class stringtie -od Dmelanogaster --name Dmelanogaster \
-al hisat -as class2 stringtie -od Dmelanogaster --name Dmelanogaster \
-o daijin.toml --prot-db Reference/Aedes_aegypti.fasta;

This will create three files in the working directory:
Expand All @@ -211,7 +211,7 @@ This will create three files in the working directory:
.. code-block:: yaml
blast: 'source blast-2.3.0'
class: 'source class-2.12'
class2: 'source class-2.12'
cufflinks: ''
gmap: ''
hisat: 'source HISAT-2.0.4'
Expand Down Expand Up @@ -256,15 +256,15 @@ You can also ask Daijin to display the steps to be executed, inclusive of their

When Daijin is finished, have a look inside the folder Dmelanogaster/3-assemblies/output/; you will find the following GTF files:

* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662533-0.gtf
* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662534-0.gtf
* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662533-0.gtf
* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662534-0.gtf
* Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662533-0.gtf
* Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662534-0.gtf

These are standard `GTF files <http://www.ensembl.org/info/website/upload/gff.html>`_ reporting the assembled transcripts for each method. We can have a feel for how they compare with our reference annotation by, again, using ``mikado util stats``. Conveniently, Daijin has already performed this analysis for us, and the files will be present in the same folder:

* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662533-0.gtf.stats
* Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662534-0.gtf.stats
* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662533-0.gtf.stats
* Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662534-0.gtf.stats
* Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662533-0.gtf.stats
* Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662534-0.gtf.stats

Expand All @@ -273,9 +273,9 @@ Daijin has also created a summary of these statistics in Dmelanogaster/3-assembl
+------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+
| File | genes | monoexonic_genes | transcripts | transcripts_per_gene | transcript_len_mean | monoexonic_transcripts | exons | exons_per_transcript | exon_len_mean |
+==========================================+=========+====================+===============+========================+=======================+==========================+=========+========================+=================+
| class-0-hisat-ERR1662533-0.gtf.stats | 11535 | 852 | 13958 | 1.2 | 1429.86 | 852 | 57554 | 4.12 | 346.77 |
| class2-0-hisat-ERR1662533-0.gtf.stats | 11535 | 852 | 13958 | 1.2 | 1429.86 | 852 | 57554 | 4.12 | 346.77 |
+------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+
| class-0-hisat-ERR1662534-0.gtf.stats | 11113 | 884 | 13341 | 1.19 | 1448.07 | 884 | 55123 | 4.13 | 350.47 |
| class2-0-hisat-ERR1662534-0.gtf.stats | 11113 | 884 | 13341 | 1.19 | 1448.07 | 884 | 55123 | 4.13 | 350.47 |
+------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+
| stringtie-0-hisat-ERR1662533-0.gtf.stats | 15490 | 6252 | 18995 | 1.22 | 1668.17 | 6446 | 63798 | 3.36 | 496.67 |
+------------------------------------------+---------+--------------------+---------------+------------------------+-----------------------+--------------------------+---------+------------------------+-----------------+
Expand All @@ -292,8 +292,8 @@ We can assess the similarity of these assemblies to the reference annotation by
mkdir -p Comparisons;
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf --index -l Reference/index.log;
# Compare the CLASS2 assemblies against the reference
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class_ERR1662533.log -o Comparisons/class_ERR1662533 -p Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662533-0.gtf;
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class_ERR1662534.log -o Comparisons/class_ERR1662534 -p Dmelanogaster/3-assemblies/output/class-0-hisat-ERR1662534-0.gtf;
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class2_ERR1662533.log -o Comparisons/class2_ERR1662533 -p Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662533-0.gtf;
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/class2_ERR1662534.log -o Comparisons/class2_ERR1662534 -p Dmelanogaster/3-assemblies/output/class2-0-hisat-ERR1662534-0.gtf;
# Compare the StringTie assemblies against the reference:
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/stringtie_ERR1662534.log -o Comparisons/stringtie_ERR1662534 -p Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662534-0.gtf;
mikado compare -r Reference/Drosophila_melanogaster.BDGP6.89.gtf -l Comparisons/stringtie_ERR1662533.log -o Comparisons/stringtie_ERR1662533 -p Dmelanogaster/3-assemblies/output/stringtie-0-hisat-ERR1662533-0.gtf;
Expand Down
4 changes: 2 additions & 2 deletions docs/Tutorial/daijin.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ align_methods:
hisat:
- ''
asm_methods:
class:
class2:
- ''
stringtie:
- ''
Expand All @@ -23,7 +23,7 @@ load:
# Commands to use to load/select the versions of the programs to use. Leave an empty
# string if no loading is necessary.
blast: 'source blast+-2.3.0'
class: 'source class-2.12'
class2: 'source class-2.12'
cufflinks: ''
gmap: ''
hisat: 'source HISAT-2.0.4'
Expand Down
2 changes: 1 addition & 1 deletion docs/Usage/Daijin.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ Daijin can be configured so to run each assembler and/or aligner multiple times,
- ''
- "-k 10"
asm_methods:
class:
class2:
- ''
- "--set-cover"
stringtie:
Expand Down

0 comments on commit e641cf9

Please sign in to comment.