forked from chasewnelson/SNPGenie
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnpgenie-1.2.pl
executable file
·27816 lines (26261 loc) · 665 KB
/
snpgenie-1.2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/perl
# SNPGenie for CLC and Geneious output
# PROGRAM: Perl program to calculate evolutionary paramaters from NGS SNP Reports
# generated from pooled DNA samples.
# Copyright (C) 2015 Chase W. Nelson
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# DATE CREATED: April 10, 2015
# AUTHOR: Chase W. Nelson
# CONTACT1: [email protected]
# CONTACT2: [email protected]
# AFFILIATION1: Austin L. Hughes lab, University of South Carolina (Columbia, SC, USA)
# AFFILIATION2: Wen-Hsiung Li lab, Academia Sinica (Taipei, Taiwan)
use strict;
#use warnings;
use IO::Handle;
use Data::Dumper;
require File::Temp;
use File::Temp qw(tempfile);
use Getopt::Long;
my $time1 = time;
my $local_time1 = localtime;
# Die if no arguments, explaining what they should be
#unless (@ARGV) {
# die "\nThis program accepts arguments:\n[1] Blah;\n[2] Blah2;\n[3] Blah3.\n\n";
#}
# GET THE TIME
# Initialize variables
my $minfreq; # the min. allele freq. to be considered (prop., not percentage)
my $snpreport;
my $fastafile;
my $gtffile;
my $sepfiles;
my $slidingwindow;
my $ratiomode;
my $sitebasedmode; # not supported or recommended; site (not codon) contexts
my $complementmode;
my $clc_mode = 0;
my $geneious_mode = 0;
my $vcf_mode = 0;
my $progress_period_count = 0;
my @snp_report_file_names_arr;
my $multi_seq_mode = 0;
my $the_fasta_file = '';
my @fasta_file_names_arr;
my $fasta_arr_size;
my $cds_file;
my $param_file_contents = "SNPGenie version 1.2 parameter log.\n\n";
# Get user input, if given. If a Boolean argument is passed, its value is 1; else undef
GetOptions( "minfreq:f" => \$minfreq, # optional floating point parameter
"snpreport:s" => \$snpreport, # optional string parameter
"fastafile:s" => \$fastafile, # optional string parameter
"gtffile:s" => \$gtffile, # optional string parameter
"sepfiles" => \$sepfiles, # optional Boolean; set to false if not given
"slidingwindow:i" => \$slidingwindow, # optional integer parameter
"ratiomode" => \$ratiomode, # optional Boolean; set to false if not given
"sitebasedmode" => \$sitebasedmode) # optional Boolean; set to false if not given
# "complementmode" => \$complementmode) # optional Boolean; set to false if not given
or die "\n## WARNING: Error in command line arguments. SNPGenie terminated.\n\n";
# N.B.: When an argument, e.g., slidingwindow, is called only as a flag, its value is 0
# When it is not called at all, it is null
# Create a directory for the results
if (-d "SNPGenie_Results") { # Can also use "./SNPGenie_Results"; use "-e" to check file
die "\n\n## WARNING:\n## The directory SNPGenie_Results already exists in the ".
"working directory.\n## Please rename or move this directory so a new one ".
"can be created.\n\n";
} else {
mkdir('SNPGenie_Results');
## Set OPTIONS given the user's INPUT and RECORD PARAMETERS
if(! $minfreq) {
$minfreq = 0;
$param_file_contents .= "MINIMUM ALLELE FREQUENCY: Default used; all SNPs included\n";
} elsif(($minfreq >= 1) || ($minfreq < 0)) {
die "\n## WARNING: The --minfreq option must be a decimal between 0 and 1\n".
"## SNPGenie terminated.\n\n";
} else {
$param_file_contents .= "MINIMUM ALLELE FREQUENCY: $minfreq\n";
}
if(! $sepfiles) {
$sepfiles = 0; # default behavior: no separate codon files for each SNP Report
$param_file_contents .= "SEPARATE FILES OUTPUT: No\n";
} else {
$sepfiles = 1;
$param_file_contents .= "SEPARATE FILES OUTPUT: Yes\n";
}
if($slidingwindow == 0) { # Called as a flag, but given no value
$slidingwindow = 9; # default behavior: nonamer peptide
$param_file_contents .= "SLIDING WINDOW LENGTH: Default used; 9 codons\n";
} elsif($slidingwindow > 0) {
$param_file_contents .= "SLIDING WINDOW LENGTH: $slidingwindow\n";
} else {
$param_file_contents .= "SLIDING WINDOW LENGTH: None\n";
}
if(! $ratiomode) {
$ratiomode = 0; # default behavior: no separate codon files for each SNP Report
$param_file_contents .= "RATIO MODE: Default used; No\n";
} else {
$ratiomode = 1;
$param_file_contents .= "RATIO MODE: Yes\n";
}
if(! $sitebasedmode) {
$sitebasedmode = 0; # default behavior: no separate codon files for each SNP Report
$param_file_contents .= "SITE-BASED MODE: Default used; No\n";
} else {
$sitebasedmode = 1;
$param_file_contents .= "SITE-BASED MODE: Default used; Yes\n";
}
# Get SNP Report name(s)
if(! $snpreport) {
@snp_report_file_names_arr = &get_txt_file_names;
my @snp_report_file_names_ADD_arr = &get_csv_file_names;
my @snp_report_file_names_ADD_VCF_arr = &get_vcf_file_names;
push(@snp_report_file_names_arr,@snp_report_file_names_ADD_arr);
push(@snp_report_file_names_arr,@snp_report_file_names_ADD_VCF_arr);
$param_file_contents .= "SNP REPORTS: Default auto-detected file(s): @snp_report_file_names_arr\n";
} else {
@snp_report_file_names_arr = ($snpreport);
$param_file_contents .= "SNP REPORTS: User submitted file: $snpreport\n";
}
#print "\n@snp_report_file_names_arr\n\n";
if (scalar (@snp_report_file_names_arr) == 0) {
die "\n\n## WARNING: There are no SNP Reports. SNPGenie terminated.\n\n";
}
if(! $fastafile) {
@fasta_file_names_arr = &get_fasta_file_names;
#print "\nWorking directory fasta files are: @fasta_file_names_arr\n\n";
$param_file_contents .= "REFERENCE FASTA FILE: Default auto-detected file(s): @fasta_file_names_arr\n";
} else {
$fasta_file_names_arr[0] = $fastafile;
$param_file_contents .= "REFERENCE FASTA FILE: User submitted file: $fastafile\n";
}
$fasta_arr_size = scalar(@fasta_file_names_arr);
#print "\nThe size of the fasta array is $fasta_arr_size\n";
if($fasta_arr_size > 1) {
$multi_seq_mode = 1;
$param_file_contents .= "MULTI-SEQUENCE MODE: Yes\n";
} elsif($fasta_arr_size == 1) {
$multi_seq_mode = 0;
$the_fasta_file = $fasta_file_names_arr[0];
$param_file_contents .= "MULTI-SEQUENCE MODE: No\n";
} else {
die "\n\n## WARNING: There are no FASTA (.fa or .fasta) files in the working directory. ".
"SNPGenie terminated.\n\n";
}
if(! $gtffile) { # default behavior
$cds_file = &get_cds_file_name;
$param_file_contents .= "GTF FILE: Default auto-detected file: $cds_file\n";
} else {
$cds_file = $gtffile;
$param_file_contents .= "GTF FILE: User submitted file: $cds_file\n";
}
#print "\n@fasta_file_names_arr\n\n";
STDOUT->autoflush(1);
chdir('SNPGenie_Results');
open(PARAM_FILE,">>SNPGenie\_parameters\.txt");
print PARAM_FILE "$param_file_contents";
close PARAM_FILE;
### NUCLEOTIDE DIVERSITY FILE
open(OUTFILE_NT_DIV,">>codon\_results\.txt");
my $ntd_headers_to_print = "file\tproduct\tsite\tcodon\tnum_overlap_ORF_nts\t".
"mean_nonsyn_diffs\tmean_syn_diffs\t";
if($sitebasedmode == 1) {
$ntd_headers_to_print .= "mean_nonsyn_diffs_site_based\tmean_syn_diffs_site_based\t";
}
$ntd_headers_to_print .= "nonsyn_sites\tsyn_sites\t";
if($sitebasedmode == 1) {
$ntd_headers_to_print .= "nonsyn_sites_site_based\tsyn_sites_site_based\t";
}
$ntd_headers_to_print .= "nonsyn_sites_ref\tsyn_sites_ref\t";
#$ntd_headers_to_print .= "piN\tpiS\t";
if($ratiomode == 1) {
$ntd_headers_to_print .= "piN/piS\t";
}
$ntd_headers_to_print .= "mean_nonsyn_diffs_vs_ref\tmean_syn_diffs_vs_ref\t".
"mean_gdiv\tmean_nonsyn_gdiv\tmean_syn_gdiv\n";
#$ntd_headers_to_print .= "mean_dN_vs_ref\tmean_dS_vs_ref\n"; # \tAverage_cov
print OUTFILE_NT_DIV "$ntd_headers_to_print";
close OUTFILE_NT_DIV;
### GENE DIVERSITY FILE
open(OUTFILE_GENE_DIV,">>site\_results\.txt");
print OUTFILE_GENE_DIV "file\tproduct\tsite\tref_nt\tmaj_nt\t".
"position_in_codon\t".
"overlapping_ORFs\tcodon_start_site\tcodon\tpi\t".
#"Polymorphic (Y=1; N=0)\t".
"gdiv\t".
"class_vs_ref\tclass\t".
"coverage\t".
"A\tC\tG\tT\n";
close OUTFILE_GENE_DIV;
### PRODUCT SUMMARY FILE
open(PRODUCT_SUMMARY,">>product\_results\.txt");
print PRODUCT_SUMMARY "file\tproduct\tmean_nonsyn_diffs\tmean_syn_diffs\t".
"mean_nonsyn_diffs_vs_ref\tmean_syn_diffs_vs_ref\t".
"nonsyn_sites\tsyn_sites\t".
"piN\tpiS\tmean_dN_vs_ref\tmean_dS_vs_ref\t".
"mean_gdiv_polymorphic\tmean_gdiv_nonsyn\tmean_gdiv_syn\n";
close PRODUCT_SUMMARY;
### POPULATION SUMMARY NONCODING RESULTS
if($multi_seq_mode == 0) {
open(POP_SUMMARY,">>population\_summary\.txt");
print POP_SUMMARY "file\tsites\tsites_coding\tsites_noncoding\t".
"pi\tpi_coding\tpi_noncoding\t".
#"mean_nonsyn_diffs\tmean_syn_diffs\t".
#"mean_nonsyn_diffs_vs_ref\tmean_syn_diffs_vs_ref\t".
"nonsyn_sites\tsyn_sites\t".
"piN\tpiS\tmean_dN_vs_ref\tmean_dS_vs_ref\t".
"mean_gdiv_polymorphic\tmean_gdiv_nonsyn\tmean_gdiv_syn\t".
"mean_gdiv\t".
"sites_polymorphic\t".
"mean_gdiv_coding_poly\t".
"sites_coding_poly\t".
"mean_gdiv_noncoding_poly\t".
"sites_noncoding_poly\n";
close POP_SUMMARY;
}
### A LOG file
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "file\tproduct\tsite\t".
"LOG\n";
close ERROR_FILE;
chdir('..');
}
# Hash for storing which product we've seen, just for error-reporting purposes
my %seen_product_hash;
my %seen_no_start_hash;
my %seen_no_stop_hash;
# Executive error switches
my $exec_errors = 0;
my $warn_5nt = 0;
my $warn_frequencies = 0;
my $SNP_report_counter = 0;
print "\n\n################################################################################".
"\n## ##".
"\n## SNPGenie Initiated! ##".
"\n## ##".
"\n################################################################################\n";
# Print LICENSE
print "\n ############################### LICENSE: #################################\n";
print " ## SNPGenie Copyright (C) 2015 Chase W. Nelson ##\n".
" ## This program comes with ABSOLUTELY NO WARRANTY; ##\n".
" ## This is free software, and you are welcome to redistribute it ##\n".
" ## under certain conditions; see LICENSE.txt. ##";
print "\n ############################################################################\n";
# GET THE TIME
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "NA\tNA\tNA\t".
"SNPGenie initiated at local time $local_time1\n";
close ERROR_FILE;
chdir('..');
# We will
# [1] determine whether COMPLEMENT ENTRIES are to be considered, and
# [2] if so, construct some way to DO EVERYTHING BELOW, but do it for the
# rev. complement SNP reports against the rev. complement FASTA with respect to the
# "-" strand records in the GTF file.
# Complement mode?
$complementmode = &determine_complement_mode($cds_file);
# Announce and initialize REVERSE COMPLEMENT MODE
my %hh_compl_position_info; # saved with respect to the + strand
#my @curr_compl_products;
my @curr_compl_products_ordered_by_start;
if($complementmode && ($multi_seq_mode == 0)) {
print "\nThere are - strand records in the GTF file. COMPLEMENT MODE activated...\n";
chdir('SNPGenie_Results');
open(PARAM_FILE,">>SNPGenie\_parameters\.txt");
print PARAM_FILE "COMPLEMENT MODE: Yes\n";
close PARAM_FILE;
chdir('..');
# Look through the GTF file for the - strand entries
# translate the start and stop sites to + strand sites using $fasta_length
#my $rev_complement_seq = &reverse_complement_from_fasta($fasta_to_open);
#my $rev_compl_seq = &reverse_complement_from_fasta($the_fasta_file);
#my $seq_length = length($rev_compl_seq);
open(GTF_FILE_AGAIN, "$cds_file") or die "\nCould not open the GTF file $cds_file - $!\n\n";
while(<GTF_FILE_AGAIN>) {
if($_ =~ /CDS\t(\d+)\t(\d+)\t\.\t\-\t\d+\tgene_id \"gene\:([\w\s\.\:']+)\"/) { # Line is - strand
my $rev_compl_start = $1; # Where the gene itself actually STOPS
my $rev_compl_stop = $2; # Where the gene itself actually STARTS
my $this_product = $3;
#my $feature_length = ($rev_compl_stop - $rev_compl_start + 1);
#my $offset = ($seq_length - $rev_compl_stop);
#my $this_start = ($offset + 1);
#my $this_start = $seq_length - $rev_compl_stop + 1;
#my $this_stop = ($this_start + $feature_length - 1);
#my $this_stop = $seq_length - $rev_compl_start + 1;
if(exists $hh_compl_position_info{$this_product}->{start}) {
$hh_compl_position_info{$this_product}->{start_2} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop_2} = $rev_compl_stop;
} else {
$hh_compl_position_info{$this_product}->{start} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop} = $rev_compl_stop;
}
} elsif($_ =~ /CDS\t(\d+)\t(\d+)\t\.\t\-\t\d+\tgene_id \"([\w\s\.\:']+ [\w\s\.\:']+)\"/) {
my $rev_compl_start = $1; # Where the gene itself actually STOPS
my $rev_compl_stop = $2; # Where the gene itself actually STARTS
my $this_product = $3;
if(exists $hh_compl_position_info{$this_product}->{start}) {
$hh_compl_position_info{$this_product}->{start_2} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop_2} = $rev_compl_stop;
} else {
$hh_compl_position_info{$this_product}->{start} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop} = $rev_compl_stop;
}
} elsif($_ =~ /CDS\t(\d+)\t(\d+)\t\.\t\-\t\d+\tgene_id \"([\w\s\.\:']+)\"/) {
my $rev_compl_start = $1; # Where the gene itself actually STOPS
my $rev_compl_stop = $2; # Where the gene itself actually STARTS
my $this_product = $3;
if(exists $hh_compl_position_info{$this_product}->{start}) {
$hh_compl_position_info{$this_product}->{start_2} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop_2} = $rev_compl_stop;
} else {
$hh_compl_position_info{$this_product}->{start} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop} = $rev_compl_stop;
}
# NOW, IN CASE transcript_id comes first
} elsif($_ =~ /CDS\t(\d+)\t(\d+)\t\.\t\-\t\d+\ttranscript_id \"[\w\s\.\:']+\"; gene_id \"gene\:([\w\s\.\:']+)\"/) {
my $rev_compl_start = $1; # Where the gene itself actually STOPS
my $rev_compl_stop = $2; # Where the gene itself actually STARTS
my $this_product = $3;
if(exists $hh_compl_position_info{$this_product}->{start}) {
$hh_compl_position_info{$this_product}->{start_2} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop_2} = $rev_compl_stop;
} else {
$hh_compl_position_info{$this_product}->{start} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop} = $rev_compl_stop;
}
} elsif($_ =~ /CDS\t(\d+)\t(\d+)\t\.\t\-\t\d+\ttranscript_id \"[\w\s\.\:']+\"; gene_id \"([\w\s\.\:']+ [\w\s\.\:']+)\"/) {
my $rev_compl_start = $1; # Where the gene itself actually STOPS
my $rev_compl_stop = $2; # Where the gene itself actually STARTS
my $this_product = $3;
if(exists $hh_compl_position_info{$this_product}->{start}) {
$hh_compl_position_info{$this_product}->{start_2} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop_2} = $rev_compl_stop;
} else {
$hh_compl_position_info{$this_product}->{start} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop} = $rev_compl_stop;
}
} elsif($_ =~ /CDS\t(\d+)\t(\d+)\t\.\t\-\t\d+\ttranscript_id \"[\w\s\.\:']+\"; gene_id \"([\w\s\.\:']+)\"/) {
my $rev_compl_start = $1; # Where the gene itself actually STOPS
my $rev_compl_stop = $2; # Where the gene itself actually STARTS
my $this_product = $3;
if(exists $hh_compl_position_info{$this_product}->{start}) {
$hh_compl_position_info{$this_product}->{start_2} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop_2} = $rev_compl_stop;
} else {
$hh_compl_position_info{$this_product}->{start} = $rev_compl_start;
$hh_compl_position_info{$this_product}->{stop} = $rev_compl_stop;
}
}
}
close GTF_FILE_AGAIN;
#@curr_compl_products = sort(keys %hh_compl_position_info);
@curr_compl_products_ordered_by_start = sort { $hh_compl_position_info{$a}->{start} <=> $hh_compl_position_info{$b}->{start} } keys %hh_compl_position_info;
#print "\nproduct\tstart\tstop\n";
#foreach (@curr_compl_products_ordered_by_start) {
# print "$_\t" . $hh_compl_position_info{$_}->{start} . "\t" . $hh_compl_position_info{$_}->{stop} . "\n";
#}
} else {
#print "\nThere are NO - strand records in the GTF file. COMPLEMENT MODE NOT activated...\n";
chdir('SNPGenie_Results');
open(PARAM_FILE,">>SNPGenie\_parameters\.txt");
print PARAM_FILE "COMPLEMENT MODE: No\n";
close PARAM_FILE;
chdir('..');
}
# PROCESS THE SNP REPORTS
foreach my $curr_snp_report_name (@snp_report_file_names_arr) {
my $file_nm = $curr_snp_report_name;
#my $curr_newline = &detect_newline_char($curr_snp_report_name);
#$/ = $curr_newline;
my %h_nc_results;
my $seen_percent_warning = 0;
if($multi_seq_mode == 1) {
print "\nThere are $fasta_arr_size FASTA files in the working directory. MULTI-SEQUENCE MODE activated...\n";
} elsif ($multi_seq_mode == 0) {
print "\nThere is $fasta_arr_size FASTA file in the working directory. ONE-SEQUENCE MODE activated...\n";
}
if($minfreq > 0) {
print "\nYour MIN. MINOR ALLELE FREQ. is $minfreq. All variants falling below this frequency will be ignored...\n";
} else {
print "\nYou have not selected a MIN. MINOR ALLELE FREQ. All variants in the SNP Report will be included...\n";
}
print "\n\n########################### CURRENTLY PROCESSING: ###########################\n".
"$file_nm... ";
#print "\n\n$_\n\n";
# Generate new file name prefix
my $new_file_prefix;
if($curr_snp_report_name =~/\.txt/) {
$new_file_prefix = $`;
} elsif($curr_snp_report_name =~/\.csv/) {
$new_file_prefix = $`;
} else {
$new_file_prefix = "inFile";
}
#print "\nPrefix for $curr_snp_report_name: $new_file_prefix\n\n";
# At first, created the tempfile before getting its headers; we've reversed the
# order so we can determine the format of the SNP Report (e.g., Geneious or CLC)
# before creating the tempfile. Should not create problems.
my @header_names_arr = &get_header_names($curr_snp_report_name,$curr_snp_report_name);
#print "@header_names_arr";
# CHECK FOR NON-CLC SNP REPORT, and reset the MODE if so
foreach (@header_names_arr) {
if($_ eq 'Minimum' || $_ eq 'Polymorphism Type' || $_ eq 'Change') {
# Switch to GENEIOUS mode
$geneious_mode = 1;
$clc_mode = 0;
$vcf_mode = 0;
#print "\n\nWe are in GENEIOUS MODE!\n\n";
} elsif($_ eq 'Reference Position' || $_ eq 'Overlapping annotations') {
# We remain in CLC mode
$geneious_mode = 0;
$clc_mode = 1;
$vcf_mode = 0;
#print "\n\nWe are in CLC MODE!\n\n";
} elsif($curr_snp_report_name =~ /\.vcf/) {
# Switch to VCF mode
$geneious_mode = 0;
$clc_mode = 0;
$vcf_mode = 1;
}
}
if($geneious_mode == 1 && $clc_mode == 0 && $vcf_mode == 0) {
print "GENEIOUS format detected\n";
} elsif($geneious_mode == 0 && $clc_mode == 1 && $vcf_mode == 0) {
print "CLC GENOMICS WORKBENCH format detected\n";
} elsif($geneious_mode == 0 && $clc_mode == 0 && $vcf_mode == 1) {
print "VCF format detected\n";
} else {
die "## WARNING: Conflicting SNP Report formats detected. Please contact author. ".
"## SNPGenie TERMINATED.\n\n";
}
# Create tempfiles
my $temp_snp_report_TEMPLATE = "temp_snp_report_XXXX";
# Remember, the following automatically OPENS the file, too.
my ($TEMP_SNP_REPORT_HANDLE,$temp_snp_report_name) =
tempfile($temp_snp_report_TEMPLATE, SUFFIX => ".txt", UNLINK => 1);
#print "\n\n\nTEMP FILE: $temp_snp_report_name\n\n\n";
# POPULATE the temporary file based on FORMAT MODE
if($clc_mode == 1) {
&populate_tempfile_clc($curr_snp_report_name,$temp_snp_report_name);
$curr_snp_report_name = $temp_snp_report_name;
} elsif($geneious_mode == 1) {
# (1) snpgenie_prep_geneious;
# (2) snpgnie_geneious_to_clc;
&populate_tempfile_geneious($curr_snp_report_name,$temp_snp_report_name);
$curr_snp_report_name = $temp_snp_report_name;
} elsif($vcf_mode == 1) {
&populate_tempfile_vcf($curr_snp_report_name,$temp_snp_report_name,$cds_file);
$curr_snp_report_name = $temp_snp_report_name;
}
# Includes the automatic deletion of the tempfile afterwards.
$/ = "\n";
my @new_header_names_arr = &get_header_names($curr_snp_report_name,$file_nm);
@header_names_arr = @new_header_names_arr;
#print "\n\nHEADER:\n@header_names_arr\n\n";
my $index_ref_pos;
my $index_type;
#my $index_len;
my $index_ref;
my $index_allele;
my $index_count;
my $index_cov;
my $index_freq;
my $index_over_annot;
#my $index_cod_reg_chg;
#my $index_ami_aci_chg;
my $seen_index_ref_pos = 0;
my $seen_index_type = 0;
#my $seen_index_len = 0;
my $seen_index_ref = 0;
my $seen_index_allele = 0;
my $seen_index_count = 0;
my $seen_index_cov = 0;
my $seen_index_freq = 0;
my $seen_index_over_annot = 0;
#my $seen_index_cod_reg_chg = 0;
#my $seen_index_ami_aci_chg = 0;
#print "\nref_pos: $index_ref_pos length: $index_len ref: $index_ref allele: $index_allele ".
# "count: $index_count cov: $index_cov freq: $index_freq over_annot: $index_over_annot ".
# "cod_reg_chg: $index_cod_reg_chg amino acid chg: $index_ami_aci_chg\n";
#print "\n\n$_\n\n";
# Determine the index of each column -- this is CLC
for (my $i=0; $i<scalar(@header_names_arr); $i++) {
if ($header_names_arr[$i] =~ /Reference Position/) {
$index_ref_pos = $i;
$seen_index_ref_pos = 1;
} elsif ($header_names_arr[$i] =~ /Type/) {
$index_type = $i;
$seen_index_type = 1;
# } elsif ($header_names_arr[$i] =~ /Length/) {
# $index_len = $i;
# $seen_index_len = 1;
} elsif ($header_names_arr[$i] =~ /Reference/) { # Since this comes AFTER
# "Reference Position, we're fine
$index_ref = $i;
$seen_index_ref = 1;
} elsif ($header_names_arr[$i] =~ /Allele/) {
$index_allele = $i;
$seen_index_allele = 1;
} elsif ($header_names_arr[$i] =~ /Count/) {
$index_count = $i;
$seen_index_count = 1;
} elsif ($header_names_arr[$i] =~ /Coverage/) {
$index_cov = $i;
$seen_index_cov = 1;
} elsif ($header_names_arr[$i] =~ /Frequency/) {
$index_freq = $i;
$seen_index_freq = 1;
} elsif ($header_names_arr[$i] =~ /Overlapping annotations/) {
$index_over_annot = $i;
$seen_index_over_annot = 1;
# } elsif ($header_names_arr[$i] =~ /Coding region change/) {
# $index_cod_reg_chg = $i;
# $seen_index_cod_reg_chg = 1;
# } elsif ($header_names_arr[$i] =~ /Amino acid change/) {
# $index_ami_aci_chg = $i;
# $seen_index_ami_aci_chg = 1;
}
}
if($seen_index_ref_pos == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Reference Position\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Reference Position\". SNPGenie terminated\n\n";
} elsif($seen_index_type == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Type\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Type\". SNPGenie terminated\n\n";
# } elsif($seen_index_len == 0) {
# die "\n\n## WARNING: $curr_snp_report_name does not contain the column header \"Length\". SNPGenie terminated\n\n";
} elsif($seen_index_ref == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Reference\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Reference\". SNPGenie terminated\n\n";
} elsif($seen_index_allele == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Allele\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Allele\". SNPGenie terminated\n\n";
} elsif($seen_index_count == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Count\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Count\". SNPGenie terminated\n\n";
} elsif($seen_index_cov == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Coverage\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Coverage\". SNPGenie terminated\n\n";
} elsif($seen_index_freq == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Frequency\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Frequency\". SNPGenie terminated\n\n";
} elsif($seen_index_over_annot == 0) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"Does not contain the column header \"Overlapping annotations\". SNPGenie terminated\n";
close ERROR_FILE;
chdir('..');
#unlink $curr_snp_report_name;
die "\n\n## WARNING: $file_nm does not contain the column header \"Overlapping annotations\". SNPGenie terminated\n\n";
# } elsif($seen_index_cod_reg_chg == 0) {
# die "\n\n## WARNING: $curr_snp_report_name does not contain the column header \"Coding region change\". SNPGenie terminated\n\n";
# } elsif($seen_index_ami_aci_chg == 0) {
# die "\n\n## WARNING: $curr_snp_report_name does not contain the column header \"Amino acid change\". SNPGenie terminated\n\n";
}
#print "\n\n$_\n\n";
my @product_names_arr = &get_product_names($curr_snp_report_name,$index_over_annot);
#print "\n\nThe file $curr_snp_report_name has the following products: @product_names_arr\n";
my @product_names_to_add_arr = &get_product_names_from_gtf($cds_file);
# This part is technically unnecessary, but allows additional checks for consistency
my %final_product_names_hash;
foreach (@product_names_arr) {
$final_product_names_hash{$_} = 1;
}
foreach (@product_names_to_add_arr) {
$final_product_names_hash{$_} = 1;
}
@product_names_arr = sort(keys %final_product_names_hash);
#print "\n\nAll my product names are: @product_names_arr\n\n";
# Now we have a specific file we're looking at ($curr_snp_report_name), and we
# know all the names of products within that file (stored in @product_names_arr),
# and we know all the names of fasta files in this directory (stored in
# @fasta_file_names_arr). So we have to identify the products in their names,
# taking special care with the HA's -- HA, HA1, HA2
# Create a hash with the product names as keys, and the fasta files they refer to
# as values
my %product_to_fasta_file;
# MULTI-SEQUENCE MODE
if($multi_seq_mode == 1) {
foreach my $curr_product (@product_names_arr) {
#print "\tCurrently processing product: $curr_product...\n";
# The fasta file name must begin with "PRODUCTNAME_"
foreach my $curr_fasta_file_name (@fasta_file_names_arr) {
#print "\n\nMy current fasta file is: $curr_fasta_file_name\n\n";
#product - file prefix
#HA - HA
#HA1 - HA
#HA2 - HA
#PB1 - PB1
#NP - NP
#PA - PA
#PB2 - PB2
#NA - NA
# Find what prefix the current fasta file has, e.g., "NEP" in "NEP_1918.fa"
my $fasta_file_contains;
# To allow for primes (') in the name
if ($curr_fasta_file_name =~/^([\w\s']+?)_/) { # includes newline
$fasta_file_contains = $1;
#print "\n\nFasta file $curr_fasta_file_name contains: $1\n\n";
}
# Find out if the file prefix is a fit for the product and, if so, store it
if ($curr_product eq $fasta_file_contains) {
$product_to_fasta_file{$curr_product} = $curr_fasta_file_name;
} elsif ($curr_product =~/$fasta_file_contains/) {
$product_to_fasta_file{$curr_product} = $curr_fasta_file_name;
}
}
}
#print ".";
foreach (keys %product_to_fasta_file) {
print "\n–The product $_ refers to the file $product_to_fasta_file{$_}\n";
}
print "\n";
} else { # $multi_seq_mode == 0, ONE-SEQUENCE MODE, fasta is same for all
foreach my $curr_product (@product_names_arr) {
$product_to_fasta_file{$curr_product} = $the_fasta_file;
}
print "\n-All products are found in the same sequence file: $the_fasta_file\n\n";
}
# Now we have, for the current SNP Report we're examining, the fasta files to which
# each of the products refer, in the hash %products_to_fasta_file
# Next, let's store the contents of the SNP Report in a hash of hashes, with the
# outer key as the product, followed by and making sure to store the name of the
# associated fasta file
my %hh_product_position_info; # SEE EXAMPLE
# Example:
#my %hh_example = (
# 'HA' => {
# 'fasta' => 'theFile.txt',
# 'start' => 666,
# 'stop' =>,
# 158 => {
# 'fasta' => $product_to_fasta_file{'HA'},
# 'ref' => 'C',
# 'cov' => 3002,
# 'A' => 0,
# 'C' => 0,
# 'G' => 0,
# 'T' => 0
# },
# 160 => {
# 'A' => 0,
# 'C' => 0,
# 'G' => 0,
# 'T' => 0
# }
# },
# 'NA' => {
# 158 => {
# 'A' => 0,
# 'C' => 0,
# 'G' => 0,
# 'T' => 0
# },
# 160 => {
# 'A' => 0,
# 'C' => 0,
# 'G' => 0,
# 'T' => 0
# }
# }
#);
# For site-by-site plain nucleotide diversity of all sites in the FASTA.
# No partitioning into synonymous and nonsynonymous.
my %hh_nc_position_info;
##### DATA STORAGE ##################################################################
# Open current SNP Report to store data for each line
my $line = 0;
#open (INFILE, $curr_snp_report_name);
while (<$TEMP_SNP_REPORT_HANDLE>) {
if($line == 0) {
$line++;
} else {
#chomp;
# CHOMP for 3 operating systems
if($_ =~ /\r\n?/) {
$_ =~ s/\r\n//;
} elsif($_ =~ /\r$/) {
$_ =~ s/\r//;
} elsif($_ =~ /\n$/) {
$_ =~ s/\n//;
}
#if ($_ =~/(.+)\r$/) {
# $_ = $1;
# print "\nYes, in $curr_snp_report_name we have an ending \\r\n";
#}
my @line_arr = split(/\t/,$_,-1);
#$line++;
#if ($line_arr[$index_over_annot] eq '') {
# print "\nIn $curr_snp_report_name, line $line, line_arr[index_over_annot] ".
# "is ne ''";
#}
# LESSON: even when the first line has no entry for Overlapping annotations,
# that datum is still (ne '').
# Check that we're dealing with a single nucleotide variant, or
# SNV in the "Type" column, and that we have a specific product, meaning
# the "Overlapping annotations" column is not blank
#if(($line_arr[$index_type] eq 'SNV' || $line_arr[$index_type] eq 'MNV') && ($line_arr[$index_over_annot] =~/CDS/)) {
if(($line_arr[$index_type] eq 'SNV') && ($line_arr[$index_count] > 0) && ($line_arr[$index_freq] > 0)) {
# Store data in variables to save room on screen and verify
my $position = $line_arr[$index_ref_pos];
my $coverage = $line_arr[$index_cov];
my $count = $line_arr[$index_count];
my $reference_nt = $line_arr[$index_ref];
my $variant_nt = $line_arr[$index_allele];
my $frequency = $line_arr[$index_freq];
my $var_prop = ($count/$coverage);
#my $var_prop = ($line_arr[$index_freq])/100;
my $ref_prop = (1-$var_prop);
my $ref_prop_key = "$reference_nt\_prop";
my $var_prop_key = "$variant_nt\_prop";
#print "\nSNV site $position\. ref_prop_key: ref_prop_key: $ref_prop_key | ref_prop: $ref_prop | var_prop_key: $var_prop_key | var_prop: $var_prop";
if(length($reference_nt) == length($variant_nt)) {
# Now, syn. and nonsyn. nucleotide diversity for coding variants
if($line_arr[$index_over_annot] =~/CDS/) {
# was if(($line_arr[$index_type] eq 'SNV') && ($line_arr[$index_over_annot] ne ''))
# Get the product name(s) for this line
my $product_name;
my $mature_peptide_name;
my $over_annot = $line_arr[$index_over_annot];
#print "\n\nover_annot is: $over_annot\n\n";
# Lines ALWAYS contain a product name
#if ($over_annot =~/CDS: (\w+)/) {
# $product_name = $1;
#}
my @peptide_coord_arr;
my @product_coord_arr;
# ORIGINAL HERE
# Lines SOMETIMES contain a mature peptide name
#if ($over_annot =~/Mature peptide: (\w+)/) {
# $product_name = $1;
# #@peptide_coord_arr = &get_product_coordinates($product_name);
#} elsif ($over_annot =~/CDS: (\w+)/) {
# $product_name = $1;
#}
# EXPLANATION: &get_product_names is used to get @product_names_arr,
# which is looped through for every $curr_product. &get_product_names
# still had the if/elsif structure. If the SNP report had a Mature peptide
# line for HA1 or HA2, but no HA by itself, HA did not exist as one of the
# $curr_product's. This would then not have been used as a key to store
# anything in the %product_to_fasta_file hash. Thus there would be no
# fasta file corresponding to HA when defining "my $fasta_file" below,
# and no nucleotides at all, therefore no sites at all. The codons were
# indeed blank in the results file.
# EXPERIMENTAL
if ($over_annot =~/Mature peptide: ([\w\s\.']+)/) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\tNA\tNA\t".
"\"Mature peptide\" annotation must take into account ".
"MNV records for CLC SNP Reports\n";
close ERROR_FILE;
chdir('..');
print "\n## WARNING: \"Mature peptide\" annotation must take into account\n".
"### MNV records for CLC SNP Reports.\n";
$mature_peptide_name = $1;
@peptide_coord_arr = &get_product_coordinates($mature_peptide_name,$curr_snp_report_name);
}
if ($over_annot =~/CDS: ([\w\s\.']+)/) {
$product_name = $1;
@product_coord_arr = &get_product_coordinates($product_name,$curr_snp_report_name);
#print "\n\n$product_name product_coord arr: @product_coord_arr\n\n";
}
# In some files, we have such atrocities as:
# ORF: gag
# ORF: gag, ORF: pol
# ORF: Nef, ORF: Env, ORF: Rev
#if ($over_annot =~/ORF: (\w+)/) {
# $product_name = $1;
# @product_coord_arr = &get_product_coordinates($product_name);
#}
#
#print "The array: @product_coord_arr";
#elsif ($over_annot =~/Gene: (\w+)/) { # CHANGED THIS TO IGNORE GENE-ONLY ANNOTATIONS: WANT CDS
# $product_name = $1;
# @product_coord_arr = &get_product_coordinates($product_name);
#}
if($seen_percent_warning == 0 && $frequency < 0.01) {
chdir('SNPGenie_Results');
open(ERROR_FILE,">>SNPGenie\_LOG\.txt");
print ERROR_FILE "$file_nm\t$product_name\t$position\t".
"There is a variant frequency <0.01%. If this ".
"was unexpected, make sure that the values in the ".
"\"Frequency\" column are percentages, not ".
"decimals.\n";
close ERROR_FILE;
chdir('..');
print "\n## WARNING There is a variant frequency <0.01%. If this ".
"was unexpected, make sure that the \n## values in the ".
"\"Frequency\" column are percentages, not ".
"decimals.\n";
$seen_percent_warning = 1;
}
my $fasta_file = $product_to_fasta_file{$product_name}; # SHOULDN'T MATTER
#my @product_coord_arr = &get_product_coordinates($product_name);
my $product_start = $product_coord_arr[0];
my $product_stop = $product_coord_arr[1];
my $product_start_2;
my $product_stop_2;
if ($product_coord_arr[2]) {
$product_start_2 = $product_coord_arr[2];
$product_stop_2 = $product_coord_arr[3];
#print "\nYes, there's a second sequence and it goes from $product_start_2 to $product_stop_2\n";
}
#my $A_count = 0;