12
12
13
13
14
14
class TabularProtein (StoredProtein ):
15
+ """A protein object that is stored in a tabular format.
16
+
17
+ Attributes:
18
+ id (str): id of the protein
19
+ sequence (str): sequence of the protein
20
+ props (dict[str, Any]): properties of the protein
21
+ representations (Iterable[TabularProtein]): representations of the protein
22
+ """
15
23
16
24
def __init__ (
17
25
self ,
@@ -21,12 +29,14 @@ def __init__(
21
29
props : dict [str , Any ] | None = None ,
22
30
representations : Iterable ["TabularProtein" ] | None = None ,
23
31
) -> None :
24
- """
25
- Create a new protein instance.
26
-
27
- :param parent: parent protein
28
- :param protein_id: identifier of the protein
29
- :param sequence: sequence of the protein
32
+ """Create a new protein instance.
33
+
34
+ Args:
35
+ protein_id (str): identifier of the protein
36
+ sequence (str): sequence of the protein
37
+ parent (TabularProtein): parent protein
38
+ props (dict[str, Any]): properties of the protein
39
+ representations (Iterable[TabularProtein]): representations of the protein
30
40
"""
31
41
self ._parent = parent
32
42
self ._id = protein_id
@@ -36,28 +46,42 @@ def __init__(
36
46
37
47
@property
def id(self) -> str:
    """Identifier of the protein.

    Returns:
        str: the value stored in ``_id`` when the instance was created
    """
    identifier = self._id
    return identifier
40
51
41
52
@property
42
53
def sequence (self ) -> str | None :
54
+ """Get the sequence of the protein."""
43
55
return self ._sequence
44
56
45
57
@property
46
58
def props (self ) -> dict [str , Any ] | None :
59
+ """Get the properties of the protein."""
47
60
return self ._props
48
61
49
62
def as_pdb (self ) -> str | None :
63
+ """Return the protein as a PDB file."""
50
64
return self ._props ["pdb" ] if "pdb" in self ._props else None
51
65
52
66
def as_fasta (self ) -> str | None :
67
+ """Return the protein as a FASTA file."""
53
68
return self ._props ["fasta" ] if "fasta" in self ._props else None
54
69
55
70
@property
def representations(self) -> Iterable["TabularProtein"]:
    """Representations of the protein.

    Returns:
        Iterable[TabularProtein]: the object stored in ``_representations``
    """
    stored_representations = self._representations
    return stored_representations
58
74
59
75
60
76
class TabularProteinStorage (ProteinStorage , PandasDataTable ):
77
+ """A storage class for proteins stored in a tabular format.
78
+
79
+ Attributes:
80
+ sequenceCol (str): name of the column that contains all protein sequences
81
+ proteinSeqProvider (Callable): function that provides protein
82
+ sequenceProp (str): name of the property that contains all protein sequences
83
+ proteins (Iterable[TabularProtein]): all proteins in the store
84
+ """
61
85
62
86
def __init__ (
63
87
self ,
@@ -75,6 +99,23 @@ def __init__(
75
99
store_format : str = "pkl" ,
76
100
parallel_generator : ParallelGenerator | None = None ,
77
101
):
102
+ """Create a new protein storage instance.
103
+
104
+ Args:
105
+ name (str): name of the storage
106
+ df (pd.DataFrame): data frame containing the proteins
107
+ sequence_col (str): name of the column that contains all protein sequences
108
+ sequence_provider (Callable): function that provides protein
109
+ store_dir (str): directory to store the data
110
+ overwrite (bool): overwrite the existing data
111
+ index_cols (list[str]): columns to use as index
112
+ n_jobs (int): number of parallel jobs
113
+ chunk_size (int): size of the chunks
114
+ protein_col (str): name of the column that contains the protein ids
115
+ random_state (int): random state
116
+ store_format (str): format to store the data
117
+ parallel_generator (ParallelGenerator): parallel generator
118
+ """
78
119
super ().__init__ (
79
120
name ,
80
121
df if df is not None else pd .DataFrame (columns = [sequence_col ,
@@ -158,16 +199,32 @@ def getPCMInfo(self) -> tuple[dict[str, str], dict]:
158
199
159
200
@property
def sequenceProp(self) -> str:
    """Name of the property that contains the protein sequences.

    Returns:
        str: the value stored in ``_sequenceCol``
    """
    sequence_column = self._sequenceCol
    return sequence_column
162
204
163
205
def add_protein(self, protein: "TabularProtein", raise_on_existing=True):
    """Add a protein to the store.

    The protein's id is used as the single entry id and every property of the
    protein is passed to `addEntries` as a one-element list.

    Args:
        protein (TabularProtein): protein to add
        raise_on_existing (bool):
            raise an exception if the protein already exists in the store
            (forwarded unchanged to `addEntries`)
    """
    # `props` may be `None`; treat that as "no properties". Iterating the
    # dict directly would yield keys only, so `.items()` is required to get
    # the (name, value) pairs.
    # NOTE(review): only `protein.props` is forwarded; `protein.sequence` is
    # not passed separately -- confirm the sequence is expected inside `props`.
    props = protein.props or {}
    self.addEntries(
        [protein.id],
        {prop: [val] for prop, val in props.items()},
        raise_on_existing,
    )
169
218
170
219
def _make_proteins_from_chunk (self , df : pd .DataFrame ) -> list [TabularProtein ]:
220
+ """Create a list of proteins from a chunk of the data frame.
221
+
222
+ Args:
223
+ df (pd.DataFrame): chunk of the data frame
224
+
225
+ Returns:
226
+ list[TabularProtein]: list of proteins
227
+ """
171
228
ids = df [self .idProp ].values
172
229
sequences = df [self .sequenceProp ].values
173
230
props = df .columns .difference ([self .idProp , self .sequenceProp ])
@@ -182,12 +239,28 @@ def _make_proteins_from_chunk(self, df: pd.DataFrame) -> list[TabularProtein]:
182
239
183
240
@property
def proteins(self) -> list["TabularProtein"]:
    """All proteins in the store.

    Iterates the store with a chunk size equal to ``len(self)`` and converts
    every chunk with `_make_proteins_from_chunk`.

    Returns:
        list[TabularProtein]: proteins of all chunks, in iteration order
    """
    return [
        protein
        for chunk in self.iterChunks(len(self))
        for protein in self._make_proteins_from_chunk(chunk)
    ]
189
251
190
252
def getProtein (self , protein_id : str ) -> TabularProtein :
253
+ """Get a protein from the store using its name.
254
+
255
+ Args:
256
+ protein_id (str): name of the protein to search
257
+
258
+ Returns:
259
+ TabularProtein: instance of `Protein`
260
+
261
+ Raises:
262
+ ValueError: if the protein is not found
263
+ """
191
264
df = self .getDF ()
192
265
protein = df [df [self .idProp ] == protein_id ]
193
266
if protein .empty :
0 commit comments