From fe4129a5b242ae641fdc18a41abfd30f16bb2b48 Mon Sep 17 00:00:00 2001
From: Alessio Siniscalchi
Date: Wed, 23 Oct 2024 18:00:01 +0200
Subject: [PATCH 1/3] add prototype function to support search typeahead

---
 cads_catalogue_api_service/search_utils.py | 59 ++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/cads_catalogue_api_service/search_utils.py b/cads_catalogue_api_service/search_utils.py
index a263c8a..77c7dd9 100644
--- a/cads_catalogue_api_service/search_utils.py
+++ b/cads_catalogue_api_service/search_utils.py
@@ -27,6 +27,65 @@
 WEIGHT_FULLTEXT = 0.03
 
 
+def apply_filters_typeahead(
+    session: sa.orm.Session,
+    chars: str,
+    search: sa.orm.Query | None,
+    portals: list[str] | None = None,
+    limit: int | None = None,
+):
+    """Build the query of dataset-title words starting with `chars` (search typeahead).
+
+    Args:
+        session: sqlalchemy session object
+        chars: characters of the words to find
+        search: current dataset query; if None, all resources are considered
+        portals: list of datasets portals to consider
+        limit: if specified, maximum number of words to return
+    """
+    if search is None:
+        search = session.query(cads_catalogue.database.Resource)
+    search = search.filter(cads_catalogue.database.Resource.hidden == False)  # noqa E712
+    if portals:
+        search = search.filter(cads_catalogue.database.Resource.portal.in_(portals))
+    g = sa.func.unnest(
+        sa.func.string_to_array(
+            sa.func.lower(cads_catalogue.database.Resource.title), " "
+        )
+    ).label("g")
+    t = search.with_entities(g).scalar_subquery().alias("t")
+    suggestion = sa.func.unnest(sa.func.array_agg(sa.func.distinct(t.c.g))).label(
+        "suggestion"
+    )
+    tt = session.query(suggestion).select_from(t).scalar_subquery().alias("tt")
+    # keep words with length > 2 that start with chars ("%" and "_" match literally):
+    escaped = chars.replace("/", "//").replace("%", "/%").replace("_", "/_")
+    starts_with = tt.c.suggestion.ilike(escaped + "%", escape="/")
+    word_filter = sa.and_(sa.func.length(tt.c.suggestion) > 2, starts_with)
+    search = (
+        session.query(tt.c.suggestion)
+        .select_from(tt)
+        .filter(word_filter)
+        .order_by(tt.c.suggestion)
+    )
+    if limit is not None:
+        search = search.limit(limit)  # type: ignore
+
+    # final sql should be something like:
+    # SELECT suggestion FROM
+    # (
+    # SELECT unnest(array_agg(DISTINCT t.g)) AS suggestion FROM
+    # (
+    # SELECT unnest(string_to_array(lower(title), ' ')) AS g FROM resources
+    # WHERE resources.hidden = false AND resources.portal IN ('cams', 'c3s')
+    # )
+    # as t
+    # ) as tt
+    # WHERE length(tt.suggestion) > 2 and tt.suggestion ilike 'er%' escape '/' limit 10;
+
+    return search
+
+
 def split_by_category(keywords: list) -> list:
     """Given a list of keywords composed by a "category: value", split them in multiple lists.
 
From 7660de1bf9b409218bc0bf252f45456fc26a7001 Mon Sep 17 00:00:00 2001
From: Alessio Siniscalchi
Date: Thu, 24 Oct 2024 09:44:10 +0200
Subject: [PATCH 2/3] style

---
 cads_catalogue_api_service/search_utils.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/cads_catalogue_api_service/search_utils.py b/cads_catalogue_api_service/search_utils.py
index 77c7dd9..fd2ff5c 100644
--- a/cads_catalogue_api_service/search_utils.py
+++ b/cads_catalogue_api_service/search_utils.py
@@ -30,7 +30,7 @@
 def apply_filters_typeahead(
     session: sa.orm.Session,
     chars: str,
-    search: sa.orm.Query | None,
+    search: sa.orm.Query | None = None,
     portals: list[str] | None = None,
     limit: int | None = None,
 ):
@@ -38,7 +38,7 @@ def apply_filters_typeahead(
 
     Args:
         session: sqlalchemy session object
-        chars: characters of the words to find
+        chars: initial characters of the words to find
         search: current dataset query; if None, all resources are considered
         portals: list of datasets portals to consider
         limit: if specified, maximum number of words to return
@@ -71,17 +71,19 @@ def apply_filters_typeahead(
     if limit is not None:
         search = search.limit(limit)  # type: ignore
 
-    # final sql should be something like:
+    # final sql for `apply_filters_typeahead(session, 'er', portals=['cams', 'c3s'], limit=10)`:
     # SELECT suggestion FROM
     # (
-    # SELECT unnest(array_agg(DISTINCT t.g)) AS suggestion FROM
+    # SELECT unnest(array_agg(distinct(t.g))) AS suggestion FROM
     # (
     # SELECT unnest(string_to_array(lower(title), ' ')) AS g FROM resources
     # WHERE resources.hidden = false AND resources.portal IN ('cams', 'c3s')
     # )
-    # as t
-    # ) as tt
-    # WHERE length(tt.suggestion) > 2 and tt.suggestion ilike 'er%' escape '/' limit 10;
+    # AS t
+    # ) AS tt
+    # WHERE length(tt.suggestion) > 2 AND tt.suggestion ILIKE 'er%' ESCAPE '/'
+    # ORDER BY tt.suggestion
+    # LIMIT 10;
 
     return search
 
From 2ac2e34598e57dce46a39fae5d34402fdd2db058 Mon Sep 17 00:00:00 2001
From: Luca Fabbri
Date: Thu, 14 Nov 2024 10:21:41 +0100
Subject: [PATCH 3/3] Typeahead API

---
 cads_catalogue_api_service/main.py      |  2 ++
 cads_catalogue_api_service/typeahead.py | 36 +++++++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 cads_catalogue_api_service/typeahead.py

diff --git a/cads_catalogue_api_service/main.py b/cads_catalogue_api_service/main.py
index bf69d8f..d369107 100644
--- a/cads_catalogue_api_service/main.py
+++ b/cads_catalogue_api_service/main.py
@@ -45,6 +45,7 @@
     messages,
     middlewares,
     schema_org,
+    typeahead,
     vocabularies,
 )
 
@@ -89,6 +90,7 @@ async def lifespan(application: fastapi.FastAPI):
 app.include_router(collection_ext.router)
 app.include_router(doi.router)
 app.include_router(contents.router)
+app.include_router(typeahead.router)
 
 
 def catalogue_openapi() -> dict[str, Any]:
diff --git a/cads_catalogue_api_service/typeahead.py b/cads_catalogue_api_service/typeahead.py
new file mode 100644
index 0000000..bfab5bf
--- /dev/null
+++ b/cads_catalogue_api_service/typeahead.py
@@ -0,0 +1,36 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fastapi
+
+from . import dependencies, search_utils
+
+router = fastapi.APIRouter(
+    prefix="",
+    tags=["typeahead"],
+    responses={fastapi.status.HTTP_404_NOT_FOUND: {"description": "Not found"}},
+    include_in_schema=False,
+)
+
+
+@router.get("/typeahead")
+def typeahead(
+    session=fastapi.Depends(dependencies.get_session),
+    portals: list[str] | None = fastapi.Depends(dependencies.get_portals),
+    chars: str = fastapi.Query(..., min_length=2, max_length=50),
+) -> list[str]:
+    """Typeahead for CADS webportal search: suggest words starting with `chars`."""
+    search = search_utils.apply_filters_typeahead(
+        session, chars, search=None, portals=portals
+    )
+    result = session.execute(search)
+    return result.scalars().all()