-
Notifications
You must be signed in to change notification settings - Fork 1
/
RNA-seq_pipeline.txt
49 lines (34 loc) · 2.02 KB
/
RNA-seq_pipeline.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# G-AVENIR bioinformatics workshop: RNA-seq analysis pipeline
#1 Download the reference genome of An arabiensis from Vectorbase
wget https://vectorbase.org/common/downloads/Current_Release/AarabiensisDONGOLA2021/fasta/data/VectorBase-57_AarabiensisDONGOLA2021_Genome.fasta
#2 indexing the reference genome with subhead-build
subread-buildindex VectorBase-57_AarabiensisDONGOLA2021_Genome.fasta -o VectorBase-57_AarabiensisDONGOLA2021_Genome_index
#3 Download the gff file
wget https://vectorbase.org/common/downloads/Current_Release/AarabiensisDONGOLA2021/gff/data/VectorBase-57_AarabiensisDONGOLA2021.gff
#4 convert the gff file to gtf using gffread
gffread -T VectorBase-57_AarabiensisDONGOLA2021.gff -o VectorBase-57_AarabiensisDONGOLA2021.gtf
#5 quality control with fastqc. Several samples one or several samples could be provided as input to fastqc
fastqc D01_S6_L001_R1_001.fastq.gz
#6 quality control filtering using 'fastp'. This command line below with trim bad quality reads and adapters
# the steps 6-10 could be run in one bash script
fastp \
-i D01_S6_L001_R1_001.fastq.gz -I D01_S6_L001_R2_001.fastq.gz \
-o D01_S6_L001_R1_001_fp.fastq.gz -O D01_S6_L001_R2_001_fp.fastq.gz \
-h D01_S6_L001_fp.html -l 25
#7 Alignment
subjunc -T 4 -M 5 -d 50 -D 1500 -r D01_S6_L001_R1_001_fp.fastq.gz -R D01_S6_L001_R2_001_fp.fastq.gz -i VectorBase-57_AarabiensisDONGOLA2021_Genome_index -o D01_S6_L001.bam
#8 To get the summary statistics about the alignment
samtools flagstatD01_S6_L001.bam > D01_S6_L001.bam.summary.txt
#9 filter bam to remove MAPQ<10 (-q 10) and unmapped reads (-F 4)
samtools view -b -q 10 -F 4 D01_S6_L001.bam -o D01_S6_L001_mq10.bam -@ 16
#10 sorting the filtered *bam file
samtools sort D01_S6_L001_mq10.bam -o D01_S6_L001_mq10.srt.bam -@16
#11 FeatureCount (part of the subread package)
featureCounts \
-T 16 \
-s 2 \
-p -B -C -Q 10 \
--largestOverlap --minOverlap 1 \
-t exon -g transcript_id \
-a VectorBase-57_AarabiensisDONGOLA2021.gtf -o Aarab.antisense.count.txt \
*D01_S6_L001_mq10.srt.bam