-
Notifications
You must be signed in to change notification settings - Fork 49
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit d452a30
Showing
46 changed files
with
923,142 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
To run the tutorial please go to the tutorial subfolder. | ||
|
||
|
||
Introduction: | ||
|
||
The user wishes to analyze deep sequencing data mapping to a ~6 kb region on C. elegans chromosome II for known and novel miRNA genes. | ||
|
||
|
||
|
||
--------------------------------------------------------------------------------------------------------------------------------------------------- | ||
|
||
|
||
Preliminary files: | ||
|
||
cel_cluster.fa: a fasta file with the reference genome (this file is in fact a ~6 kb region of the C. elegans chromosome II). | ||
|
||
mature_ref_this_species.fa: a fasta file with the reference miRBase mature miRNAs for the species (C. elegans miRBase v.14 mature miRNAs) | ||
|
||
mature_ref_other_species.fa: a fasta file with the reference miRBase mature miRNAs for related species (C. briggsae and D. melanogaster miRBase v.14 mature miRNAs) | ||
|
||
precursors_ref_this_species.fa: a fasta file with the reference miRBase precursor miRNAs for the species (C. elegans miRBase v.14 precursor miRNAs) | ||
|
||
reads.fa: a fasta file with the deep sequencing reads. | ||
|
||
|
||
-------------------------------------------------------------------------------------------------------------------------------------------------- | ||
|
||
|
||
Analysis: | ||
|
||
|
||
|
||
Step 1: | ||
|
||
build an index of the genome (in this case the ~6 kb region): | ||
|
||
bowtie-build cel_cluster.fa cel_cluster | ||
|
||
|
||
|
||
Step 2: | ||
|
||
process reads and map them to the genome. | ||
|
||
The -c option designates that the input file is a fasta file (for other input formats, see the README file). The -j options removes entries with | ||
non-canonical letters (letters other than a,c,g,t,u,n,A,C,G,T,U,N). The -k option clips adapters. The -l option discards reads shorter than 18 nts. | ||
The -m option collapses the reads. The -p option maps the processed reads against the previously indexed genome (cel_cluster). The -s option | ||
designates the name of the output file of processed reads and the -t option designates the name of the output file of the genome mappings. Last, | ||
-v gives verbose output to the screen. | ||
|
||
mapper.pl reads.fa -c -j -k TCGTATGCCGTCTTCTGCTTGT -l 18 -m -p cel_cluster -s reads_collapsed.fa -t reads_collapsed_vs_genome.arf -v | ||
|
||
|
||
|
||
Step 3: | ||
|
||
fast quantitation of reads mapping to known miRBase precursors. | ||
|
||
(This step is not required for identification of known and novel miRNAs in the deep sequencing data when using miRDeep2.pl.) | ||
|
||
quantifier.pl -p precursors_ref_this_species.fa -m mature_ref_this_species.fa -r reads_collapsed.fa -t cel -y 16_19 | ||
|
||
The miRNA_expressed.csv gives the read counts of the reference miRNAs in the data in tabular format. The results can also be browsed by opening | ||
expression_16_19.html with an internet browser. | ||
|
||
|
||
|
||
Step 4: | ||
|
||
identification of known and novel miRNAs in the deep sequencing data: | ||
|
||
miRDeep2.pl reads_collapsed.fa cel_cluster.fa reads_collapsed_vs_genome.arf mature_ref_this_species.fa mature_ref_other_species.fa precursors_ref_this_species.fa -t C.elegans 2> report.log | ||
|
||
|
||
|
||
Step 5: | ||
|
||
browse the results. | ||
|
||
open the results.html using an internet browser. Notice that cel-miR-37 is predicted twice, since both potential precursors excised from this locus | ||
can fold into hairpins. However, the annotated hairpin scores much higher than the non-annotated one (miRDeep2 score 6.1e+4 vs. -0.2). | ||
|
||
|
||
|
||
|
||
|
||
-------------------------------------------------------------------------------------------------------------------------------------------- |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,263 @@ | ||
#!/usr/bin/perl | ||
|
||
use strict; | ||
|
||
my $usage = "\nError:\nperl bwa_sam_converter.pl mapped.sam reads.fa reads_vs_genome.arf multiple\n | ||
If output_reads.fa is not specified a file called reads_ready.fa is automatically created\n | ||
If mapped.arf is not specified a file called signature.arf is automatically created\n\n"; | ||
|
||
|
||
my $sam = shift or die $usage; | ||
my $rout = shift; | ||
my $arf = shift; | ||
my $multi =shift; | ||
|
||
open IN,"<$sam" or die "$usage"; | ||
|
||
my @line; | ||
my @edit_string; | ||
my @ref_seq; | ||
my $num; | ||
|
||
my @processed_seq; | ||
my @processed_edit; | ||
|
||
my $rev=0; | ||
|
||
my $offset; | ||
|
||
my @cigar; ## the cigar string in sam file refers just to the read sequence | ||
my $print_read; | ||
|
||
|
||
if(not $rout){ | ||
open OUT,">reads.fa" or die "cannot create file reads.fa\n"; | ||
}else{ | ||
open OUT,">$rout" or die "cannot create file $rout.fa\n"; | ||
} | ||
|
||
if(not $arf){ | ||
open ARF,">reads_vs_genome.arf" or die "cannot create file reads_vs_genome.arf\n"; | ||
}else{ | ||
open ARF,">$arf" or die "cannot create file $arf\n"; | ||
} | ||
|
||
my $count=0; | ||
|
||
my $edit; | ||
my $edit_str; | ||
my $strand; | ||
my $rev = 0; | ||
|
||
my $genome_seq; | ||
my $edit_s; | ||
|
||
my $cline; | ||
|
||
my %reads; | ||
|
||
while(<IN>){ | ||
next if(/^\@/); | ||
$cline = $_; | ||
$strand = "+"; | ||
$edit = 0; | ||
$edit_str=""; | ||
@line = split(/\t/); | ||
|
||
## next if read is not aligned | ||
next if($line[1] eq 4); | ||
|
||
## determine if read is coming from minus strand | ||
|
||
# if($line[1] ne 0){ | ||
$rev=FLAGinfo($line[1]); | ||
# }else{ | ||
# $rev =0; | ||
# } | ||
|
||
# print "$strand\n"; | ||
## set strand | ||
$strand = "-" if ($rev); | ||
# print "$strand\n"; | ||
## print ID of read to reads_ready.fa | ||
#print OUT ">$line[0]\n"; | ||
|
||
## grep edit string, this one corresponds to the genome sequence | ||
if($cline =~ m/MD:Z:(\S+)\s+/){ | ||
@edit_string = split(//,$1); | ||
} | ||
|
||
@ref_seq = split(//,$line[9]); | ||
|
||
|
||
$print_read=""; | ||
|
||
$offset = 0; | ||
|
||
@cigar = split(//,$line[5]); | ||
|
||
$num = ""; | ||
|
||
#print "i\tnum\toffset\n"; | ||
for(my $i=0; $i < scalar (@cigar) ; $i++){ | ||
|
||
if($cigar[$i] =~ m/\d/){ | ||
$num .= $cigar[$i]; | ||
|
||
}elsif($cigar[$i] =~ m/M/){ | ||
$edit_str.= 'm' x $num; | ||
$print_read .= join(/ /,@ref_seq[$offset..($num-1+$offset)]); | ||
|
||
$offset += $num; | ||
|
||
$num=""; | ||
|
||
}elsif($cigar[$i] =~ m/I/){ | ||
$offset += $num; | ||
$edit_str.= 'I'; | ||
$num=""; | ||
$edit++; | ||
|
||
}elsif($cigar[$i] =~ m/D/){ | ||
$edit_str.= 'D'; | ||
$print_read .= ('N' x $num); | ||
$num=""; | ||
|
||
## process N's in Cigar string | ||
}elsif($cigar[$i] =~ m/N/){ | ||
$num=""; | ||
}else{ | ||
} | ||
} | ||
|
||
$offset = 0; | ||
|
||
$num=""; | ||
|
||
@processed_seq = split(//,$print_read); ## right genome sequence length with N for deleted chars in read sequence | ||
@processed_edit = split(//,$edit_str); | ||
|
||
## now process edit string | ||
for(my $i=0; $i < scalar (@edit_string) ; $i++){ | ||
|
||
if($edit_string[$i] =~ m/\d/){ | ||
$num .= $edit_string[$i]; | ||
|
||
}elsif($edit_string[$i] =~ m/\^/){ ## get the deleted nt in read sequence | ||
$i++; | ||
$processed_seq[$num+$offset] = $edit_string[$i]; | ||
$edit++; | ||
|
||
$offset+= $num+1; | ||
$num=""; | ||
|
||
}elsif($edit_string[$i] =~ m/\w/){ | ||
$edit++; | ||
$offset+=$num; | ||
|
||
$processed_seq[$offset] = $edit_string[$i]; | ||
$processed_edit[$offset] = 'M'; | ||
|
||
$offset++; | ||
$num=""; | ||
}else{} | ||
} | ||
|
||
$genome_seq = join("",@processed_seq); | ||
$edit_s = join("",@processed_edit); | ||
|
||
|
||
### here reverse if not multi | ||
if(not $multi){ | ||
|
||
if($strand eq "-"){ | ||
$genome_seq = reverse($genome_seq); | ||
$genome_seq =~ tr/ACGT/TGCA/; | ||
$line[9] = reverse($line[9]); | ||
$line[9] =~ tr/ACGT/TGCA/; | ||
$edit_s = reverse($edit_s); | ||
|
||
} | ||
} | ||
|
||
if($reads{$line[0]} and $reads{$line[0]}{'mm'} > $edit ){ | ||
$reads{$line[0]}{'mm'} = $edit; | ||
$reads{$line[0]}{'seq'} = $line[9]; | ||
}else{ | ||
$reads{$line[0]}{'mm'} = $edit; | ||
$reads{$line[0]}{'seq'} = $line[9]; | ||
} | ||
|
||
#print OUT $genome_seq; | ||
#print OUT "\n"; | ||
|
||
if($line[0] !~ /_x\d+/){ | ||
$line[0].="_x1"; | ||
} | ||
|
||
print ARF "$line[0]\t",length($line[9]),"\t1\t",length($line[9]),"\t",lc $line[9],"\t$line[2]\t",length($genome_seq),"\t$line[3]\t",($line[3] -1 + (length($genome_seq))),"\t",lc $genome_seq,"\t$strand\t$edit\t$edit_s\n"; | ||
} | ||
close IN; | ||
|
||
for(keys %reads){ | ||
print OUT ">$_\n$reads{$_}{'seq'}\n"; | ||
} | ||
|
||
close OUT; | ||
|
||
|
||
sub FLAGinfo{ | ||
my $in=shift; | ||
my %bwa_codes; | ||
|
||
## read in bwa hex codes | ||
while(<DATA>){ | ||
chomp; | ||
if(/^(\d+)\s+(.+)$/){ | ||
$bwa_codes{$1} = $2; | ||
} | ||
} | ||
|
||
my $rest; | ||
|
||
$rest = $in; | ||
|
||
my @arr; | ||
|
||
##modulo operations to determine binary number of decimal number | ||
while($rest ne 0){ | ||
push(@arr,$rest%2); | ||
$rest = int($rest/2); | ||
} | ||
|
||
|
||
|
||
my $hex; | ||
my $bin; | ||
|
||
my $rev = 0; | ||
|
||
## translate binary to hexadecimal number and check if read is on minus strand | ||
for(my $i=0; $i < scalar @arr; $i++){ | ||
$bin = $arr[$i] * 2**$i; | ||
$hex = sprintf("%x", $bin); | ||
if($arr[$i] ne 0){ | ||
$rev = 1 if($hex eq 10); | ||
} | ||
} | ||
return($rev); | ||
} | ||
|
||
__DATA__ | ||
0 . | ||
1 the read is paired in sequencing | ||
2 the read is mapped in a proper pair | ||
4 the read sequence is unmapped | ||
8 the mate is unmapped | ||
10 read is mapped to minus strand (given seq in col 10 is therefore the reverse complement of the plus strand) | ||
20 strand of the mate | ||
40 the read is the first read in a pair | ||
80 the read is the second read in a pair | ||
100 the alignment is not primary | ||
200 the read fails plattform/vendor quality checks | ||
400 the read is either a PCR duplicate or an optical duplicate |
Oops, something went wrong.