Skip to content

Commit

Permalink
Update links to fixed Papyrus++ dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
OlivierBeq committed Apr 7, 2023
1 parent 86733d0 commit cf9bdbf
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 15 deletions.
2 changes: 1 addition & 1 deletion src/papyrus_scripts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
from .utils.mol_reader import MolSupplier
from .utils import IO, UniprotMatch

__version__ = '1.0.0'
__version__ = '1.0.1'
17 changes: 12 additions & 5 deletions src/papyrus_scripts/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def _chunked_keep_quality(chunks: Union[PandasTextFileReader, Iterator], min_qua
yield filtered_chunk


def process_group(group):
def process_group(group, additional_columns: Optional[List[str]] = None):
"""Aggregate data from one group accordingly"""
if (group.values[0] == group.values).all(): # If all values are equal, return first record
group['pchembl_value_Mean'] = group['pchembl_value']
Expand All @@ -93,13 +93,20 @@ def process_group(group):
group['pchembl_value_Median'] = group['pchembl_value']
group['pchembl_value_MAD'] = np.NaN
return group.iloc[:1, :]
# Lambda: Return one value if all are the same
listvals = lambda x: ';'.join(set(str(y) for y in x)) if (x.values[0] == x.values).all() else ';'.join(
str(y) for y in x)
# Lambda: Return all values every time
listallvals = lambda x: ';'.join(str(y) for y in x)
# Aggregation rules
mappings = {'source': 'first', 'CID': listvals, 'AID': listvals,
'type_IC50': listallvals, 'type_EC50': listallvals, 'type_KD': listallvals,
'type_Ki': listallvals, 'type_other': listallvals, 'relation': listvals,
'type_IC50': listvals, 'type_EC50': listvals, 'type_KD': listvals,
'type_Ki': listvals, 'type_other': listvals, 'relation': listvals,
'pchembl_value': listallvals}
# Consider other columns
if additional_columns is not None:
for column in additional_columns:
mappings[column] = listvals
return pd.concat([group.groupby('Activity_ID').aggregate(mappings).reset_index(),
group.groupby('Activity_ID')['pchembl_value'].aggregate(pchembl_value_Mean='mean',
pchembl_value_StdDev='std',
Expand All @@ -110,9 +117,9 @@ def process_group(group):
).reset_index(drop=True)], axis=1)


def process_groups(groups):
def process_groups(groups, additional_columns: Optional[List[str]] = None):
"""Aggregate data from multiple groups"""
return pd.concat([process_group(group) for group in groups])
return pd.concat([process_group(group, additional_columns) for group in groups])


def keep_source(data: Union[pd.DataFrame, PandasTextFileReader, Iterator], source: Union[List[str], str] = 'all', njobs: int = 1,
Expand Down
18 changes: 9 additions & 9 deletions src/papyrus_scripts/utils/links.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@
],
"papyrus++": {
"name": "05.4++_combined_set_without_stereochemistry.tsv.xz",
"url": "https://drive.google.com/uc?id=1Z3Y7BTrUxgui1E6e1qYbG-YiCEPf_95L&confirm=t",
"size": 96413812,
"sha256": "e37aef941739ec2524a13a25d515812e91b90f2241bf4c1185096ed5e5a01999"
"url": "https://drive.google.com/uc?id=1ES8V3Pbw3xVpDIjs9J2RL2jeTHr_YTm0&confirm=t",
"size": 40278204,
"sha256": "42dcbe76b33ad541f6c54673eccffa15af64785cf844938c0f73518dfdf4404b"
},
"2D_papyrus": {
"name": "05.4_combined_set_without_stereochemistry.tsv.xz",
Expand Down Expand Up @@ -132,9 +132,9 @@
}],
"papyrus++": {
"name": "05.5++_combined_set_without_stereochemistry.tsv.xz",
"url": "https://zenodo.org/record/7019874/files/05.5%2B%2B_combined_set_without_stereochemistry.tsv.xz?download=1",
"size": 116092692,
"sha256": "ad15ac5535c6640f8dc3e501b1c7fd0cb39b7bbcd025e4506ec5efc82ec0a266"
"url": "https://drive.google.com/uc?id=11QDDruvyf8OhVuGf5V38etvorL8SPqcp&confirm=t",
"size": 41357608,
"sha256": "8ecaea9533f3c475dca6d335f30dd1b4abb259fa77b7441548dd15879e1afa58"
},
"2D_papyrus": {
"name": "05.5_combined_set_without_stereochemistry.tsv.xz",
Expand Down Expand Up @@ -224,9 +224,9 @@
},
"papyrus++": {
"name": "05.6++_combined_set_without_stereochemistry.tsv.xz",
"url": "https://zenodo.org/record/7377161/files/05.6%2B%2B_combined_set_without_stereochemistry.tsv.xz?download=1",
"size": 104203892,
"sha256": "3499c9364a3a9044090ccdf44f21346546945f729c79925de0906194eec9a0b2"
"url": "https://drive.google.com/uc?id=1KCzcAJBvn_EcYjyBrOTmxzWFKGoazJp2&confirm=t",
"size": 31085780,
"sha256": "7518019c3ba287cd4cd0ff29425fe9da8a4760d891d22ed1abb33da4920cf96a"
},
"2D_papyrus": {
"name": "05.6_combined_set_without_stereochemistry.tsv.xz",
Expand Down

0 comments on commit cf9bdbf

Please sign in to comment.