-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpfam_make_fastas.R
75 lines (67 loc) · 2.75 KB
/
pfam_make_fastas.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# Make FASTA files from alignments.
# Setting the working directory.
# setwd("~/not_backed_up/alignments")
# Import libraries.
library(stringr)
# library(plyr)
# Load image. This image should contain the alignment summaries used to subset the data.
load("/extra/ljkosinski/Scripts/RateShift/pfam_summary_9-13-19.RData")
# Variables for subsetting the data.
pfam.seqs.max <- 300 # This is a less than or equal to.
pfam.seqs.min <- 200 # This is greater than.
#pfam.sites <- 100
# Should the output be in FASTA or Phylip format?
# I am not sure I have the Phylip format output working properly.
phylip.format <- FALSE
# Pfam directory
pfam.dir <- "/extra/ljkosinski/Alignments/Pfam300/"
# Function for generating FASTA files.
# Currently, the column name for the Pfam ID must be titled "PfamUID",
# the species column must be titled "NewickSpeciesName", and
# the alignment column must be titled "AlignedPeptide".
# Can also be set to generate Phylip files instead.
generate.fasta <- function(pfam.id, pfam.alignments, phylip = FALSE){
full.seq <- ""
df.subset <- pfam.alignments[pfam.alignments$PfamUID == pfam.id, ]
df.length <- length(df.subset[, 2])
current.seq <- ""
if (phylip == FALSE){
for (i in 1:df.length) {
current.seq <- paste(">", df.subset$NewickSpeciesName[i], "|", df.subset$UID[i], "-", df.subset$PfamUID[i],
"\n", df.subset$AlignedPeptide[i], "\n", sep = "")
full.seq <- paste(full.seq, current.seq, sep = "")
}
} else if (phylip == TRUE){
header <- paste(df.length, " ", str_length(df.subset$AlignedPeptide[1]), "\n", sep = "")
full.seq <- paste(full.seq, header, sep = "")
for (i in 1:df.length) {
current.seq <- paste(df.subset$NewickSpeciesName[i], "|", df.subset$UID[i], "-", df.subset$PfamUID[i], "\t", df.subset$AlignedPeptide[i], "\n", sep = "")
full.seq <- paste(full.seq, current.seq, sep = "")
}
} else {
print("Format not set properly!")
}
return(full.seq)
}
# Outputting FASTA files based on the appropriate sites/seqs parameters.
pfam.subset <- pfam.summary[
pfam.summary$Sequences <= pfam.seqs.max
& pfam.summary$Sequences > pfam.seqs.min
#& pfam.summary$Sites <= pfam.sites
, ]
#print(length(pfam.subset$PfamUID))
current.pfam <- ""
current.alignment <- ""
for (i in 1:length(pfam.subset$PfamUID)) {
if (phylip.format == FALSE){
file.type <- ".fasta"
} else {
file.type <- ".phylip"
}
current.pfam <- paste(pfam.dir, pfam.subset$PfamUID[i], file.type, sep = "")
print(current.pfam)
current.alignment <- generate.fasta(pfam.subset$PfamUID[i], align.data, phylip.format)
write(current.alignment, file = current.pfam)
current.pfam <- ""
current.alignment <- ""
}