diff --git a/api_examples/hsfs/knn_search/news-search-knn.ipynb b/api_examples/hsfs/knn_search/news-search-knn.ipynb index 1526b433..89724613 100644 --- a/api_examples/hsfs/knn_search/news-search-knn.ipynb +++ b/api_examples/hsfs/knn_search/news-search-knn.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "719b9af9-7375-4679-8e0a-8ae745805320", + "metadata": {}, + "source": [ + "# Requirements: Hopsworks 3.7+\n", + "\n", + "WARNING: this notebook does not currently work with serverless Hopsworks." + ] + }, { "cell_type": "markdown", "id": "8b0ba628", @@ -37,6 +47,18 @@ "Since creating embeddings for the full news is time-consuming, here we sample some articles." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "776aaa6d-b2b3-4c6a-9814-1dcc547f0a36", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install hsfs==3.7.0rc5 -q\n", + "#!pip install hopsworks==3.7.0rc1 -q\n", + "#!pip install sentence_transformers -q" + ] + }, { "cell_type": "code", "execution_count": null, @@ -45,7 +67,19 @@ "outputs": [], "source": [ "import pandas as pd\n", - "\n", + "from sentence_transformers import SentenceTransformer\n", + "import logging\n", + "import hopsworks\n", + "from hsfs import embedding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "28f3cca3-ce77-4b31-b837-4c247d28cfbb", + "metadata": {}, + "outputs": [], + "source": [ "df_all = pd.read_csv(\"https://repo.hops.works/dev/jdowling/Articles.csv\", encoding='utf-8', encoding_errors='ignore')\n", "df = df_all.sample(n=300).reset_index().drop([\"index\"], axis=1)\n", "df[\"news_id\"] = list(range(len(df)))" @@ -67,16 +101,6 @@ "Next, you need to create embeddings for heading and body of the news. The embeddings will then be used for kNN search against the embedding of the news description you want to search. Here we use a light weighted language model (LM) which encodes the news into embeddings. You can use any other language models including LLM (llama, Mistral)." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "776aaa6d-b2b3-4c6a-9814-1dcc547f0a36", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install sentence_transformers -q" - ] - }, { "cell_type": "code", "execution_count": null, @@ -84,7 +108,6 @@ "metadata": {}, "outputs": [], "source": [ - "from sentence_transformers import SentenceTransformer\n", "model = SentenceTransformer('all-MiniLM-L6-v2')" ] }, @@ -135,7 +158,6 @@ "metadata": {}, "outputs": [], "source": [ - "import hopsworks\n", "proj = hopsworks.login()\n", "fs = proj.get_feature_store()" ] @@ -156,9 +178,16 @@ "outputs": [], "source": [ "version = 1\n", - "from hsfs import embedding\n", - "\n", - "emb = embedding.EmbeddingIndex(index_name=f\"news_fg_{version}\")\n", + "emb = embedding.EmbeddingIndex(index_name=f\"news_fg_{version}\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bbd31bc-b806-41e5-95a6-0dc510e2fe99", + "metadata": {}, + "outputs": [], + "source": [ "# specify the name and dimension of the embedding features \n", "emb.add_embedding(\"embedding_body\", len(df[\"embedding_body\"][0]))\n", "emb.add_embedding(\"embedding_heading\", len(df[\"embedding_heading\"][0]))" @@ -185,8 +214,16 @@ " primary_key=[\"news_id\"],\n", " version=version,\n", " online_enabled=True\n", - ")\n", - "\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f82da02-0e37-4907-8790-a68145253845", + "metadata": {}, + "outputs": [], + "source": [ "news_fg.insert(df, write_options={\"start_offline_materialization\": False})" ] }, @@ -214,7 +251,6 @@ "outputs": [], "source": [ "# set the logging level to WARN to avoid INFO message\n", - "import logging\n", "logging.getLogger().setLevel(logging.WARN)" ] }, @@ -290,7 +326,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -304,7 +340,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.8" + "version": "3.10.0" } }, "nbformat": 4, diff --git a/api_examples/hsfs/knn_search/requirements.txt b/api_examples/hsfs/knn_search/requirements.txt new file mode 100644 index 00000000..f59b13f2 --- /dev/null +++ b/api_examples/hsfs/knn_search/requirements.txt @@ -0,0 +1,3 @@ +hsfs==3.7.0rc5 +hopsworks==3.7.0rc1 +sentence_transformers