-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_CDS_AA.py
72 lines (58 loc) · 2.71 KB
/
get_CDS_AA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation
# Prompt user for GFF and genome FASTA file paths
gff_file = input("Please enter the path to the GFF file: ")
genome_fasta = input("Please enter the path to the genome FASTA file: ")
# Load genome sequence
genome_dict = SeqIO.to_dict(SeqIO.parse(genome_fasta, "fasta"))
# Print scaffold names in the genome FASTA file
print("Scaffold names in the genome FASTA:")
for scaffold in genome_dict.keys():
print(scaffold)
# Lists to store CDS sequences and proteins
cds_sequences = []
protein_sequences = []
# Parsing the GFF file manually
with open(gff_file) as gff_handle:
for line in gff_handle:
if line.startswith("#"):
continue # Skip header lines
parts = line.strip().split("\t")
if len(parts) < 9:
continue # Skip malformed lines
seq_id = parts[0]
feature_type = parts[2]
start = int(parts[3])
end = int(parts[4])
strand = parts[6]
attributes = parts[8]
# Extract the CDS if the feature type is CDS
if feature_type == "CDS":
# Parse the ID from the attributes (assuming "ID=" exists)
cds_id = "unknown"
for attribute in attributes.split(";"):
if attribute.startswith("ID="):
cds_id = attribute.split("=")[1]
# Extract the relevant sequence from the genome
if seq_id in genome_dict:
feature_seq = genome_dict[seq_id].seq[start-1:end] # Adjust to 0-based index
if strand == "-":
feature_seq = feature_seq.reverse_complement()
# Store the CDS sequence
cds_record = SeqRecord(feature_seq, id=cds_id, description="CDS sequence")
cds_sequences.append(cds_record)
# Translate the CDS to a protein sequence
protein_seq = feature_seq.translate(to_stop=True)
protein_record = SeqRecord(protein_seq, id=cds_id, description="Protein sequence")
protein_sequences.append(protein_record)
# Output file names
cds_output = input("Please enter the output file name for CDS sequences (e.g., cds_sequences.fasta): ")
protein_output = input("Please enter the output file name for protein sequences (e.g., protein_sequences.fasta): ")
# Save the CDS sequences to a FASTA file
with open(cds_output, "w") as cds_outfile:
SeqIO.write(cds_sequences, cds_outfile, "fasta")
# Save the protein sequences to a FASTA file
with open(protein_output, "w") as protein_outfile:
SeqIO.write(protein_sequences, protein_outfile, "fasta")
print("CDS and protein sequences have been extracted and saved.")