-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSnakefile
88 lines (77 loc) · 2.98 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
configfile: "./config/config.yaml"
# reference files paths
annotation_path = config["annotation_path"]
STAR_index_path = config["STAR_index_path"]
gtf_path = config["gtf_path"]
salmon_index_path = config["salmon_index_path"]
fa_path = config["fa_path"]
# Read samples.tsv to extract Samples, fq1, fq2
Samples = []
fq1 = []
fq2 = []
type = []
with open(config["samples"]) as file:
for line in file:
l = line.strip().split(',')
if len(l) == 4:
Samples.append(l[0])
fq1.append(l[1])
fq2.append(l[2])
type.append(l[3])
# Assume the full sample name is within the fq1 and fq2 file name
# Get the suffix of the seq file name excluding the sample name, used in starAlign.smk and salmonQuant.smk
fq1_suffix = fq1[0].replace(Samples[0], "")
fq2_suffix = fq2[0].replace(Samples[0], "")
###### Get wildcards for Reads (accomodate 3 patterns: .fastq.gz, .fq.gz, .fastq)
# Define the patterns with possible suffixes
patterns = ["data/{Read}.fastq.gz", "data/{Read}.fastq", "data/{Read}.fq.gz", "data/{Read}.fq"]
# Use glob_wildcards to find matching files
Reads = set()
for pattern in patterns:
Reads.update(glob_wildcards(pattern).Read)
# Convert set to list
Reads = list(Reads)
###### Accomodate .bam files
if all(i == "bam" for i in type):
include: "rules/bam_to_fastq.rmk",
input1 = [
expand("data/{Sample}_sorted.bam", Sample=Samples),
expand("data/{Sample}_R1.fastq.gz", Sample=Samples),
expand("data/{Sample}_R2.fastq.gz", Sample=Samples),
]
else:
input1 = []
###### Perform fastQC based on the file type
if ".fastq.gz" in fq1_suffix:
include: "rules/preAlignQC_fastq_gz.smk",
elif ".fq.gz" in fq1_suffix:
include: "rules/preAlignQC_fq_gz.smk",
elif ".fastq" in fq1_suffix:
include: "rules/preAlignQC_fastq.smk",
else:
include: "rules/preAlignQC_fq.smk",
include: "rules/starAlign.smk",
include: "rules/salmonQuant.smk",
include: "rules/countMatrix.smk",
include: "rules/rsem.smk",
input2 = [
expand("results/fastqc_results/{Read}_fastqc.zip", Read=Reads),
expand("results/fastqc_results/{Read}_fastqc.html", Read=Reads),
expand("results/STAR_results/{Sample}/{Sample}_Log.final.out", Sample=Samples),
"results/star_wide_countMatrix.csv",
"results/star_wide_countMatrix.Rds",
expand("results/salmon_results/{Sample}/quant.sf", Sample=Samples),
"results/salmon_wide_TPM_Matrix.csv",
"results/salmon_wide_TPM_Matrix.Rds",
multiext("rsem_ref/human_gencode", ".chrlist", ".n2g.idx.fa", ".transcripts.fa", ".grp", ".seq", ".idx.fa", ".ti"),
expand("results/rsem_results/{Sample}/{Sample}.genes.results", Sample=Samples),
expand("results/rsem_results/{Sample}/{Sample}.isoforms.results", Sample=Samples),
"results/rsem_geneLevel_wide_TPM_Matrix.csv",
"results/rsem_geneLevel_wide_TPM_Matrix.Rds",
"results/rsem_isoformLevel_wide_TPM_Matrix.csv",
"results/rsem_isoformLevel_wide_TPM_Matrix.Rds",
]
All_inputs = input1 + input2
rule all:
input:
All_inputs