diff --git a/src/papyrus_scripts/__init__.py b/src/papyrus_scripts/__init__.py index bfb035e..d522244 100644 --- a/src/papyrus_scripts/__init__.py +++ b/src/papyrus_scripts/__init__.py @@ -16,4 +16,4 @@ from .utils.mol_reader import MolSupplier from .utils import IO, UniprotMatch -__version__ = '1.0.0' +__version__ = '1.0.1' diff --git a/src/papyrus_scripts/preprocess.py b/src/papyrus_scripts/preprocess.py index 8642bd2..d15bb0a 100644 --- a/src/papyrus_scripts/preprocess.py +++ b/src/papyrus_scripts/preprocess.py @@ -83,7 +83,7 @@ def _chunked_keep_quality(chunks: Union[PandasTextFileReader, Iterator], min_qua yield filtered_chunk -def process_group(group): +def process_group(group, additional_columns: Optional[List[str]] = None): """Aggregate data from one group accordingly""" if (group.values[0] == group.values).all(): # If all values are equal, return first record group['pchembl_value_Mean'] = group['pchembl_value'] @@ -93,13 +93,20 @@ def process_group(group): group['pchembl_value_Median'] = group['pchembl_value'] group['pchembl_value_MAD'] = np.NaN return group.iloc[:1, :] + # Lambda: Return one value if all are the same listvals = lambda x: ';'.join(set(str(y) for y in x)) if (x.values[0] == x.values).all() else ';'.join( str(y) for y in x) + # Lambda: Return all values everytime listallvals = lambda x: ';'.join(str(y) for y in x) + # Aggregation rules mappings = {'source': 'first', 'CID': listvals, 'AID': listvals, - 'type_IC50': listallvals, 'type_EC50': listallvals, 'type_KD': listallvals, - 'type_Ki': listallvals, 'type_other': listallvals, 'relation': listvals, + 'type_IC50': listvals, 'type_EC50': listvals, 'type_KD': listvals, + 'type_Ki': listvals, 'type_other': listvals, 'relation': listvals, 'pchembl_value': listallvals} + # Consider other columns + if additional_columns is not None: + for column in additional_columns: + mappings[column] = listvals return pd.concat([group.groupby('Activity_ID').aggregate(mappings).reset_index(), group.groupby('Activity_ID')['pchembl_value'].aggregate(pchembl_value_Mean='mean', pchembl_value_StdDev='std', @@ -110,9 +117,9 @@ def process_group(group): ).reset_index(drop=True)], axis=1) -def process_groups(groups): +def process_groups(groups, additional_columns: Optional[List[str]] = None): """Aggregate data from multiple groups""" - return pd.concat([process_group(group) for group in groups]) + return pd.concat([process_group(group, additional_columns) for group in groups]) def keep_source(data: Union[pd.DataFrame, PandasTextFileReader, Iterator], source: Union[List[str], str] = 'all', njobs: int = 1, diff --git a/src/papyrus_scripts/utils/links.json b/src/papyrus_scripts/utils/links.json index ab9eb96..a39fb6a 100644 --- a/src/papyrus_scripts/utils/links.json +++ b/src/papyrus_scripts/utils/links.json @@ -28,9 +28,9 @@ ], "papyrus++": { "name": "05.4++_combined_set_without_stereochemistry.tsv.xz", - "url": "https://drive.google.com/uc?id=1Z3Y7BTrUxgui1E6e1qYbG-YiCEPf_95L&confirm=t", - "size": 96413812, - "sha256": "e37aef941739ec2524a13a25d515812e91b90f2241bf4c1185096ed5e5a01999" + "url": "https://drive.google.com/uc?id=1ES8V3Pbw3xVpDIjs9J2RL2jeTHr_YTm0&confirm=t", + "size": 40278204, + "sha256": "42dcbe76b33ad541f6c54673eccffa15af64785cf844938c0f73518dfdf4404b" }, "2D_papyrus": { "name": "05.4_combined_set_without_stereochemistry.tsv.xz", @@ -132,9 +132,9 @@ }], "papyrus++": { "name": "05.5++_combined_set_without_stereochemistry.tsv.xz", - "url": "https://zenodo.org/record/7019874/files/05.5%2B%2B_combined_set_without_stereochemistry.tsv.xz?download=1", - "size": 116092692, - "sha256": "ad15ac5535c6640f8dc3e501b1c7fd0cb39b7bbcd025e4506ec5efc82ec0a266" + "url": "https://drive.google.com/uc?id=11QDDruvyf8OhVuGf5V38etvorL8SPqcp&confirm=t", + "size": 41357608, + "sha256": "8ecaea9533f3c475dca6d335f30dd1b4abb259fa77b7441548dd15879e1afa58" }, "2D_papyrus": { "name": "05.5_combined_set_without_stereochemistry.tsv.xz", @@ -224,9 +224,9 @@ }, "papyrus++": { "name": "05.6++_combined_set_without_stereochemistry.tsv.xz", - "url": "https://zenodo.org/record/7377161/files/05.6%2B%2B_combined_set_without_stereochemistry.tsv.xz?download=1", - "size": 104203892, - "sha256": "3499c9364a3a9044090ccdf44f21346546945f729c79925de0906194eec9a0b2" + "url": "https://drive.google.com/uc?id=1KCzcAJBvn_EcYjyBrOTmxzWFKGoazJp2&confirm=t", + "size": 31085780, + "sha256": "7518019c3ba287cd4cd0ff29425fe9da8a4760d891d22ed1abb33da4920cf96a" }, "2D_papyrus": { "name": "05.6_combined_set_without_stereochemistry.tsv.xz",