Skip to content

Commit

Permalink
filter prefixes - option to require all prefixes (#379)
Browse files Browse the repository at this point in the history
* nothing commit

* committing so i can install in koza

* add to filter_out_prefix as well

* add extra tests for new options

* revert to two methods, but keep require_all_prefixes option

* bump version

* uncomment tests (#2)

* match filter out and filter return

* fix lint issues i think

* bumped version too far

* Remove unnecessary Args: from filter_prefixes
  • Loading branch information
glass-ships authored Jun 1, 2023
1 parent 90ca6a5 commit 1891bb6
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 28 deletions.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
venv/
.venv
venv
sssom/__pycache__/
.idea
sssom.egg-info
Expand All @@ -24,4 +25,4 @@ schema/sssom.schema.json
schema/sssom.yaml
schema/sssom_datamodel.py
sssom/internal_context.py
sssom/sssom_datamodel.py
sssom/sssom_datamodel.py
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@
SSSOM (Simple Standard for Sharing Ontology Mappings) is a TSV and RDF/OWL standard for ontology mappings

```
WARNING: the export formats (json, rdf) of sssom-py are not yet finalised! Please expect changes in future releases!
WARNING:
The export formats (json, rdf) of sssom-py are not yet finalised!
Please expect changes in future releases!
```

See https://github.com/OBOFoundry/SSSOM
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
##########################
[metadata]
name = sssom
version = 0.3.30
version = 0.3.31
description = Operations on SSSOM mapping tables
long_description = file: README.md
long_description_content_type = text/markdown
Expand Down
42 changes: 22 additions & 20 deletions sssom/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -1170,55 +1170,57 @@ def get_prefixes_used_in_metadata(meta: MetadataType) -> List[str]:


def filter_out_prefixes(
    df: pd.DataFrame,
    filter_prefixes: List[str],
    features: list = KEY_FEATURES,
    require_all_prefixes: bool = False,
) -> pd.DataFrame:
    """Drop rows whose CURIEs use the prefixes listed in ``filter_prefixes``.

    For each row, the prefixes of the CURIEs found in the ``features`` columns
    are collected. With ``require_all_prefixes=False`` (the default) a row is
    dropped as soon as any one of the filter prefixes occurs in it; with
    ``require_all_prefixes=True`` it is dropped only when every filter prefix
    occurs in it.

    Note: when ``filter_prefixes`` is empty and ``require_all_prefixes`` is
    True, ``all()`` over an empty sequence is True, so every row is dropped.

    :param df: Pandas DataFrame of SSSOM mappings
    :param filter_prefixes: List of prefixes to filter out
    :param features: List of dataframe column names to consider
    :param require_all_prefixes: If True, all prefixes must be present in a
        row for that row to be filtered out
    :return: Pandas DataFrame holding the rows that were not filtered out; if
        no row is left, an empty DataFrame whose columns are ``features``
    """
    filter_prefix_set = set(filter_prefixes)
    selection = all if require_all_prefixes else any
    rows = []

    for _, row in df.iterrows():
        # Ignore missing (None) cells, mirroring filter_prefixes(), so that
        # get_prefix_from_curie() is only ever handed an actual value.
        prefixes = {
            get_prefix_from_curie(curie) for curie in row[features] if curie is not None
        }
        # Keep the row unless the filter prefixes match it (any/all of them,
        # depending on require_all_prefixes).
        if not selection(prefix in prefixes for prefix in filter_prefix_set):
            rows.append(row)

    return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)


def filter_prefixes(
    df: pd.DataFrame,
    filter_prefixes: List[str],
    features: list = KEY_FEATURES,
    require_all_prefixes: bool = True,
) -> pd.DataFrame:
    """Keep only rows whose CURIEs use the prefixes listed in ``filter_prefixes``.

    For each row, the prefixes of the non-None CURIEs found in the
    ``features`` columns are collected. With ``require_all_prefixes=True``
    (the default) a row is kept only when every one of those prefixes is in
    ``filter_prefixes``; with ``require_all_prefixes=False`` it is kept when
    at least one of them is.

    Note: a row with no non-None value in ``features`` yields an empty prefix
    set, so it is kept when ``require_all_prefixes`` is True (``all()`` over
    nothing is True) and dropped when it is False.

    :param df: Pandas DataFrame of SSSOM mappings
    :param filter_prefixes: List of prefixes to retain
    :param features: List of dataframe column names to consider
    :param require_all_prefixes: If True, every prefix used in a row must be
        in ``filter_prefixes`` for the row to be kept; if False, one matching
        prefix is enough
    :return: Pandas DataFrame holding the rows that were kept; if no row is
        kept, an empty DataFrame whose columns are ``features``
    """
    filter_prefix_set = set(filter_prefixes)
    selection = all if require_all_prefixes else any
    rows = []

    for _, row in df.iterrows():
        prefixes = {
            get_prefix_from_curie(curie) for curie in row[features] if curie is not None
        }
        # Note the direction of the test: the row's prefixes are checked
        # against the filter list, not the other way round.
        if selection(prefix in filter_prefix_set for prefix in prefixes):
            rows.append(row)

    return pd.DataFrame(rows) if rows else pd.DataFrame(columns=features)


# TODO this is not used anywhere
Expand Down
38 changes: 34 additions & 4 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,24 +34,54 @@ def test_broken_predicate_list(self):
iri_list.extend(p_iri)
self.assertEqual(3, len(iri_list))

def test_filter_prefixes_any(self):
    """Check filter_prefixes with require_all_prefixes=False on prefixes x and y."""
    wanted_prefixes = ["x", "y"]
    result = filter_prefixes(
        self.msdf.df,
        wanted_prefixes,
        self.features,
        require_all_prefixes=False,
    )
    # Expected row count for the fixture when a single matching prefix suffices.
    self.assertEqual(len(result), 136)

def test_filter_prefixes_all(self):
    """Check filter_prefixes with require_all_prefixes=True on prefixes x and y."""
    wanted_prefixes = ["x", "y"]
    result = filter_prefixes(
        self.msdf.df,
        wanted_prefixes,
        self.features,
        require_all_prefixes=True,
    )
    # Expected row count for the fixture when every row prefix must match.
    self.assertEqual(len(result), 40)

def test_filter_out_prefixes_any(self):
    """Check filter_out_prefixes with require_all_prefixes=False on prefixes x and y."""
    unwanted_prefixes = ["x", "y"]
    result = filter_out_prefixes(
        self.msdf.df,
        unwanted_prefixes,
        self.features,
        require_all_prefixes=False,
    )
    # Expected number of rows left when one matching prefix removes a row.
    self.assertEqual(len(result), 5)

def test_filter_out_prefixes_all(self):
    """Check filter_out_prefixes with require_all_prefixes=True on prefixes x and y."""
    unwanted_prefixes = ["x", "y"]
    result = filter_out_prefixes(
        self.msdf.df,
        unwanted_prefixes,
        self.features,
        require_all_prefixes=True,
    )
    # Expected number of rows left when a row is removed only if it contains
    # every listed prefix.
    self.assertEqual(len(result), 101)

def test_remove_mappings(self):
"""Test remove mappings."""
prefix_filter_list = ["x", "y"]
Expand Down

0 comments on commit 1891bb6

Please sign in to comment.