Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RE2022-275: add filtering to heatmap endpoint #587

Merged
merged 6 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 2 additions & 35 deletions src/service/data_products/genome_attributes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
SELECTION_ID_PREFIX,
)
from src.service.data_products.table_models import TableAttributes
from src.service.filtering.filtering_processing import get_filters
from src.service.filtering.filtering_processing import get_filters, FILTER_STRATEGY_TEXT
from src.service.http_bearer import KBaseHTTPBearer
from src.service.processing import SubsetSpecification
from src.service.routes_common import PATH_VALIDATOR_COLLECTION_ID
Expand All @@ -63,40 +63,7 @@
```
GET <host>/collections/GTBD/data_products/genome_attribs/?filter_Completeness=[80,90]
```

The filter criteria depends on the type of the column and its filter strategy.

```
Type Strategy Filter criteria
------ -------- ---------------
string fulltext arbitrary string
string prefix arbitrary string
date range (see below)
int range (see below)
float range (see below)
```

Full text searches tokenize, stem, and normalize the input and removes stop words.
Prefix searches tokenize and lower case the input and match the beginning of words in the
data being searched.

Range criteria takes the form of a low and high limit to apply to the data. At least one of the
two limits must be provided. A comma separated the limits. Square brackets on either side
of the limits denote the limit is inclusive; parentheses or no character denote that the limit
is exclusive. For example:

```
1, numbers greater than 1
[1, numbers greater or equal to 1
,6) numbers less than 6
,6] numbers less than or equal to six
1,6 numbers greater than 1 and less than six
[1,6] numbers between 1 and 6, inclusive
```

Note that the OpenAPI UI does not allow entering arbitrary query parameters and therefore is
not usable for column filtering operations.
"""
""" + FILTER_STRATEGY_TEXT


class GenomeAttribsSpec(common_models.DataProductSpec):
Expand Down
107 changes: 100 additions & 7 deletions src/service/data_products/heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from fastapi import APIRouter, Depends, Request, Query, Path, Response

import src.common.storage.collection_and_field_names as names
from src.common.product_models import columnar_attribs_common_models as col_models
from src.common.product_models.common_models import FIELD_MATCH_STATE, FIELD_SELECTION_STATE
from src.common.product_models import heatmap_common_models as heatmap_models
from src.service import app_state, kb_auth, models
Expand Down Expand Up @@ -38,7 +39,9 @@
get_load_version_and_processes,
get_missing_ids,
)
from src.service.filtering.filtering_processing import get_filters, FILTER_STRATEGY_TEXT
from src.service.filtering.filters import FilterSet
from src.service.filtering.generic_view import get_generic_view_name
from src.service.http_bearer import KBaseHTTPBearer
from src.service.processing import SubsetSpecification
from src.service.routes_common import PATH_VALIDATOR_COLLECTION_ID
Expand All @@ -47,6 +50,10 @@

_OPT_AUTH = KBaseHTTPBearer(optional=True)

# Default string columns present in heatmap row data but not existing in the HeatMapMeta
# _ID_COLS are exposed for filtering with the IDENTITY filter strategy and _NGRAM_COLS with the
# NGRAM filter strategy (see _get_heatmap_columns). Both are filtered by column name rather than
# by numeric column ID.
_ID_COLS = [names.FLD_KBASE_ID]
_NGRAM_COLS = [names.FLD_KB_DISPLAY_NAME]


def _bools_to_ints(list_: list):
return [int(item) if isinstance(item, bool) else item for item in list_]
Expand Down Expand Up @@ -85,6 +92,32 @@
# This class needs to be pickleable so only create the data product spec on demand
# and don't make it part of the state

def _get_filtering_text(self) -> str:
    """
    Build the filtering section of the heatmap endpoint description.

    The heatmap-specific part explains the `filter_<column id>` and `filter_<column name>`
    query parameter formats, using this data product's ID in the example URLs; the shared
    FILTER_STRATEGY_TEXT is appended to it.
    """
    product_id = self._id
    id_col = _ID_COLS[0]
    ngram_col = _NGRAM_COLS[0]
    heatmap_filter_text = f"""

**FILTERING:**

The returned data can be filtered by column content by adding query parameters of the format
```
filter_<column id>=<filter criteria>
```
For example:
```
GET <host>/collections/PMI/data_products/{product_id}/?filter_1=[0,2]
GET <host>/collections/PMI/data_products/{product_id}/?filter_49=true
```

For metadata columns such as '{id_col}' and '{ngram_col}', the filter format shifts to utilizing the
column name rather than the column ID.
```
filter_<column name>=<filter criteria>
```
For example:
```
GET <host>/collections/PMI/data_products/{product_id}/?filter_kbase_id=69278_1006_1
```
"""
    return heatmap_filter_text + FILTER_STRATEGY_TEXT

def _create_router(self) -> APIRouter:
router = APIRouter(tags=[self._api_category], prefix=f"/{self._id}")
router.add_api_route(
Expand All @@ -108,6 +141,7 @@
+ "permissions are required.\n\n"
+ "When creating selections from genome attributes, use the "
+ f"`{names.FLD_KBASE_ID}` field values as input."
+ self._get_filtering_text()
)
router.add_api_route(
"/cell/{cell_id}",
Expand Down Expand Up @@ -210,6 +244,17 @@
storage = app_state.get_app_state(r).arangostorage
_, load_ver = await get_load_version(
storage, collection_id, self._id, load_ver_override, user)

return await self._get_heatmap_meta(storage, collection_id, load_ver, load_ver_override)

Check warning on line 248 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L248

Added line #L248 was not covered by tests

async def _get_heatmap_meta(
    self,
    storage: ArangoStorage,
    collection_id: str,
    load_ver: str,
    load_ver_override: bool
) -> heatmap_models.HeatMapMeta:
    """
    Load the heatmap metadata for a collection and load version.

    Fetches the singleton document from this data product's metadata collection
    (`self._colname_meta`) via `get_collection_singleton_from_db`, strips it with
    `remove_collection_keys`, and builds a `HeatMapMeta` from the remaining fields.

    `load_ver_override` is coerced with `bool()` before being passed on, so any truthy value
    counts as an override.
    """
    meta_doc = await get_collection_singleton_from_db(
        storage, self._colname_meta, collection_id, load_ver, bool(load_ver_override))
    meta_fields = remove_collection_keys(meta_doc)
    return heatmap_models.HeatMapMeta(**meta_fields)
Expand All @@ -233,6 +278,47 @@
)
return heatmap_models.CellDetail(**remove_collection_keys(doc))

def _append_col(
self,
columns: list[col_models.AttributesColumn],
col_list: list[str],
column_type: col_models.ColumnType,
filter_strategy: col_models.FilterStrategy) -> None:
# create AttributesColumn objects from a list of column names and add them, in place, to the given columns list
# Every created column uses its name from `col_list` as its key and shares the same `column_type` and
# `filter_strategy`. Nothing is returned; the caller observes the change through `columns`.
for col_name in col_list:
columns.append(col_models.AttributesColumn(

Check warning on line 289 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L288-L289

Added lines #L288 - L289 were not covered by tests
key=col_name,
type=column_type,
filter_strategy=filter_strategy,
))

async def _get_heatmap_columns(
self,
storage: ArangoStorage,
coll_id: str,
load_ver: str,
load_ver_override: bool
) -> list[col_models.AttributesColumn]:
# Retrieve a list of AttributesColumn objects derived from the ColumnInformation objects within HeatMapMeta.
# Additionally, include columns that exist in the heatmap row data but are not present in HeatMapMeta.
# The returned list is what get_heatmap hands to get_filters as the set of filterable columns.

# load the heatmap metadata for the requested collection / load version
column_meta = await self._get_heatmap_meta(storage, coll_id, load_ver, load_ver_override)

Check warning on line 305 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L305

Added line #L305 was not covered by tests

# flatten every column of every category, in metadata order, converting each one with
# transfer_col_heatmap_to_attribs
columns = [heatmap_models.transfer_col_heatmap_to_attribs(col)

Check warning on line 307 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L307

Added line #L307 was not covered by tests
for category in column_meta.categories for col in category.columns]

# append columns existing in the heatmap row data but not in the HeatMapMeta
# (ID columns use the IDENTITY filter strategy, display name columns use NGRAM; both are STRING typed)
self._append_col(columns, _ID_COLS, col_models.ColumnType.STRING, col_models.FilterStrategy.IDENTITY)
self._append_col(columns, _NGRAM_COLS, col_models.ColumnType.STRING, col_models.FilterStrategy.NGRAM)

Check warning on line 312 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L311-L312

Added lines #L311 - L312 were not covered by tests

return columns

Check warning on line 314 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L314

Added line #L314 was not covered by tests

def _trans_field_func(self, field_name: str) -> str:
# Transforms the field name into a valid column name extracted from the filter query.
# For instance, converts a query field name '1' into a valid column name 'col_1_val'.

return heatmap_models.form_heatmap_cell_val_key(field_name) if field_name.isdigit() else field_name

Check warning on line 320 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L320

Added line #L320 was not covered by tests

async def get_heatmap(
self,
r: Request,
Expand Down Expand Up @@ -270,19 +356,26 @@
)
if status_only:
return self._response(dp_match=dp_match, dp_sel=dp_sel)
filters = FilterSet(
collection_id,
load_ver,
collection=self._colname_data,
columns = await self._get_heatmap_columns(appstate.arangostorage, collection_id, load_ver, load_ver_override)
filters = await get_filters(

Check warning on line 360 in src/service/data_products/heatmap.py

View check run for this annotation

Codecov / codecov/patch

src/service/data_products/heatmap.py#L359-L360

Added lines #L359 - L360 were not covered by tests
r,
arango_coll=self._colname_data,
coll_id=collection_id,
load_ver=load_ver,
load_ver_override=load_ver_override,
data_product=self._id,
columns=columns,
view_name=get_generic_view_name(self._id),
count=count,
sort_on=names.FLD_KB_DISPLAY_NAME,
sort_desc=False,
match_spec=SubsetSpecification(
subset_process=dp_match, mark_only=match_mark, prefix=MATCH_ID_PREFIX),
selection_spec=SubsetSpecification(
subset_process=dp_sel, mark_only=selection_mark, prefix=SELECTION_ID_PREFIX),
sort_on=names.FLD_KB_DISPLAY_NAME,
sort_descending=False,
start_after=start_after,
limit=limit,
trans_field_func=self._trans_field_func
)
return await self._query(
appstate.arangostorage, filters, match_proc=dp_match, selection_proc=dp_sel)
Expand Down Expand Up @@ -339,7 +432,7 @@
filters: FilterSet,
match_proc: models.DataProductProcess | None,
selection_proc: models.DataProductProcess | None,
) -> heatmap_models.HeatMap:
) -> Response:
data = []
await query_simple_collection_list(
store,
Expand Down
42 changes: 41 additions & 1 deletion src/service/filtering/filtering_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,47 @@
from src.service.processing import SubsetSpecification

# Prefix that marks a query parameter as a column filter, e.g. `filter_<column id>=<criteria>`.
FILTER_PREFIX = "filter_"

# Shared endpoint documentation describing the filter criteria accepted per column type and
# filter strategy. It is appended to the descriptions of the data product endpoints that
# support column filtering.
FILTER_STRATEGY_TEXT = """

The filter criteria depend on the type of the column and its filter strategy.

```
Type Strategy Filter criteria
------ -------- ---------------
string fulltext arbitrary string
string prefix arbitrary string
string ngram arbitrary string
date range (see below)
int range (see below)
float range (see below)
boolean true or false
```

Full text searches tokenize, stem, and normalize the input and remove stop words.
Prefix searches tokenize and lower case the input and match the beginning of words in the
data being searched.
N-gram matches documents whose attribute value has an n-gram similarity higher than
the specified threshold (set to 1.0) compared to the target value. The similarity is calculated
by counting how long the longest sequence of matching n-grams is, divided by the target’s total n-gram count.
The minimum input length is 3.

Range criteria take the form of a low and high limit to apply to the data. At least one of the
two limits must be provided. A comma separates the limits. Square brackets on either side
of the limits denote the limit is inclusive; parentheses or no character denote that the limit
is exclusive. For example:

```
1, numbers greater than 1
[1, numbers greater than or equal to 1
,6) numbers less than 6
,6] numbers less than or equal to six
1,6 numbers greater than 1 and less than six
[1,6] numbers between 1 and 6, inclusive
```

Note that the OpenAPI UI does not allow entering arbitrary query parameters and therefore is
not usable for column filtering operations.
"""

def _get_filter_map(r: Request) -> dict[str, str]:
"""
Expand Down
Loading