-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1-genome_assembly_and_annotation.txt
118 lines (93 loc) · 6.22 KB
/
1-genome_assembly_and_annotation.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# 1.assembly
## 01.kmer_analysis
pre=Kmer_21
ls survey_*.fastq.gz | awk '{print "gzip -dc "$0 }' > generate.file
/home/hwc/anaconda3/bin/jellyfish count -t 4 -C -m 19 -s 1G -g generate.file -G 2 -o $pre
/home/hwc/anaconda3/bin/jellyfish histo -v -o $pre.histo $pre -t 4 -h 10000
/home/hwc/anaconda3/bin/jellyfish stats $pre -o $pre.stat
Rscript /home/hwc/software/genomescope/genomescope.R Kmer_21.histo 19 150 ./ 100000
## 02.assembly_FALCON
fc_run fc_run.cfg
fc_unzip.py fc_unzip.cfg
## 03.assembly_polish
### Create a file to record the location information of the second-generation sequence
realpath LGH_L4_104104.R1.fastq LGH_L4_104104.R2.fastq > sgs.fofn
### 从NextPolish目录下复制配置文件
cp ~/opt/biosoft/NextPolish/doc/run.cfg run2.cfg
### Modifying a configuration file
[General]
job_type = local
job_prefix = nextPolish
task = default
rewrite = 1212
rerun = 3
parallel_jobs = 6
multithread_jobs = 10
genome = raw_genome.fasta
genome_size = auto
workdir = ./01_rundir
polish_options = -p {multithread_jobs}
[sgs_option]
sgs_fofn = ./sgs.fofn
sgs_options = -max_depth 100 -bwa
[lgs_option]
lgs_fofn = ./lgs.fofn
lgs_options = -min_read_len 1k -max_depth 100
lgs_minimap2_options = -x map-pb -t {multithread_jobs}
### run nextPolish
nextPolish run2.cfg
### run allhic
k=20
genome=~/NextPolish/01_rundir/purged.fa
fq1=~/BDHC200000129-1A_1.fq
fq2=~/BDHC200000129-1A_2.fq
ln -s $genome ./draft.asm.fasta
bwa index draft.asm.fasta
samtools faidx draft.asm.fasta
bwa aln -t 20 draft.asm.fasta $fq1 > sample_R1.sai
bwa aln -t 20 draft.asm.fasta $fq2 > sample_R2.sai
bwa sampe draft.asm.fasta sample_R1.sai sample_R2.sai $fq1 $fq2 > sample.bwa_aln.sam
PreprocessSAMs.pl sample.bwa_aln.sam draft.asm.fasta HINDIII
filterBAM_forHiC.pl sample.bwa_aln.REduced.paired_only.bam sample.clean.sam
samtools view -H sample.bwa_aln.bam >header
cat header sample.clean.sam | samtools view -bS - > sample.clean.bam
samtools view -bt draft.asm.fasta.fai sample.clean.sam > sample.clean.bam
ALLHiC_partition -b sample.clean.bam -r draft.asm.fasta -e AAGCTT -k $k
allhic extract sample.clean.bam draft.asm.fasta --RE AAGCTT
for K in `seq 1 $k`
do
allhic optimize sample.clean.counts_AAGCTT.*g${K}.txt sample.clean.clm
done
ALLHiC_build draft.asm.fasta
seqkit fx2tab -l -i -n groups.asm.fasta > len.txt
grep 'sample.clean.counts_AAGCTT' len.txt > chrn.list
ALLHiC_plot sample.clean.bam groups.agp chrn.list 500k pdf
# 2.Gene Structure Annotation
## 01.Repeat prediction and non-coding RNA annotation
### Repeat annotation
/pub/software/RepeatModeler/BuildDatabase -name sesame genome.fasta
/pub/software/RepeatModeler/RepeatModeler -database sesame -pa 20 -LTRStruct
/pub/software/RepeatMasker/RepeatMasker -pa 20 -qq -lib sesame-families.fa test_data/genome.fasta >repeatmasker.log 2>&1
### non-coding RNA annotation
tRNAscan-SE -o tRNA.out -f tRNA.ss -m tRNA.stats ~/genome.fasta
cmsearch --cut_ga --nohmmonly --rfam --noali --cpu 8 --tblout rfam_out.tab /opt/biosoft/infernal-1.1.3/Rfam.cm ~/genome.fasta > rfam_out.txt
## 02.Gene prediction
/opt/biosoft/geta-2.4.5/bin/geta.pl --RM_species Viridiplantae --genome /home/hwc/data/GeneStructureAnnotation/genome.allhic.fasta -1 /home/hwc/data/GeneStructureAnnotation/transcriptomes/lengguohua2_w_L1_394X94.R1.fastq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/lengguohua3_w_L1_183A83.R1.fastq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/STJ1_FRAS202088039-1a_1.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/STJ2_FRAS202088040-1r_1.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/STJ3_FRAS202088041-1r_1.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJS1_FRAS202088036-1r_1.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJS2_FRAS202088037-1r_1.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJS3_FRAS202088038-1r_1.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJSY_FRAS202088042-1a_1.fq.gz -2 /home/hwc/data/GeneStructureAnnotation/transcriptomes/lengguohua2_w_L1_394X94.R2.fastq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/lengguohua3_w_L1_183A83.R2.fastq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/STJ1_FRAS202088039-1a_2.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/STJ2_FRAS202088040-1r_2.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/STJ3_FRAS202088041-1r_2.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJS1_FRAS202088036-1r_2.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJS2_FRAS202088037-1r_2.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJS3_FRAS202088038-1r_2.fq.gz,/home/hwc/data/GeneStructureAnnotation/transcriptomes/TJSY_FRAS202088042-1a_2.fq.gz --protein /home/hwc/data/GeneStructureAnnotation/Sp10.fa --augustus_species Barthea_barthei_20210107 --out_prefix out --cpu 40 --gene_prefix Barthea --pfam_db /opt/biosoft/hmmer-3.2.1/Pfam-AB.hmm
## 03.Gene functional annotation
### NR
diamond blastp --db nr --query ./proteins.fasta --out Nr.xml --outfmt 5 --sensitive --max-target-seqs 20 --evalue 1e-5 --id 10 --index-chunks 1
### COG
diamond blastp --db kog --query ./proteins.fasta --out kog.xml --outfmt 5 --sensitive --max-target-seqs 200 --evalue 1e-5 --id 10 --tmpdir /dev/shm --index-chunks 1
### eggNOG
diamond blastp --db eggnog_proteins --query ./proteins.fasta --out eggNOG.xml --outfmt 5 --sensitive --max-target-seqs 20 --evalue 1e-5 --id 10 --tmpdir /dev/shm --index-chunks 1
### Pfam
para_hmmscan.pl --outformat --cpu 4 --hmm_db Pfam-A.hmm ./proteins.fasta > Pfam.tab
### Interpro
/opt/biosoft/interproscan/interproscan.sh --output-file-base out --cpu 8 --formats TSV,XML,GFF3 --goterms --input ./proteins.fasta
### KAAS: http://www.genome.jp/kaas-bin/kaas_main web tools to submit sequence
gene_annotation_from_kaas.pl query.ko > KEGG.txt
### GO: Integrate GO annotation results in eggNOG and InterPro
go_from_eggNOG_and_interpro.pl ../04.eggNOG/eggNOG.emapper.annotations ../05.InterPro/interpro.tsv > go.annot
go_reducing_go_number_para.pl /opt/biosoft/go_class/bin/go-basic.obo go.annot 8 > go_reduced.annot
sort go_reduced.annot > go.annot; rm go_reduced.annot
gene_annotation_from_table.pl go.annot > GO.txt