diff --git a/integrations/snowflake/requirements.txt b/integrations/snowflake/requirements.txt new file mode 100644 index 00000000..4ac5bf24 --- /dev/null +++ b/integrations/snowflake/requirements.txt @@ -0,0 +1,2 @@ +hopsworks +snowflake-connector-python \ No newline at end of file diff --git a/integrations/snowflake/snowflake-data-source-hopsworks-feature-training-pipelines.ipynb b/integrations/snowflake/snowflake-data-source-hopsworks-feature-training-pipelines.ipynb new file mode 100644 index 00000000..5e9cb72e --- /dev/null +++ b/integrations/snowflake/snowflake-data-source-hopsworks-feature-training-pipelines.ipynb @@ -0,0 +1,1094 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ba8e158f-ec52-4695-aedc-ec0e9ac69253", + "metadata": {}, + "source": [ + "# 👨🏻‍🏫 Snowflake as a Source for Feature Groups in Hopsworks \n", + "\n", + "Follow this [guide](https://docs.hopsworks.ai/latest/user_guides/fs/storage_connector/creation/snowflake/) to set up a Snowflake connector in Hopsworks.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e059b3e6-6612-4045-a938-e7338943843a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jdowling/anaconda3/envs/book/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/17565\n", + "Connected. 
def get_connection(database="CHICAGO_DIVVY_BIKE_STATION_STATUS", schema="PUBLIC"):
    """Open a new Snowflake connection using the Hopsworks storage connector.

    The user, password, account and warehouse are read from the module-level
    ``connector`` object retrieved from the feature store. Only the target
    database and schema can be overridden by the caller.

    Args:
        database: Snowflake database to connect to. Defaults to the database
            used for the Chicago Divvy Bike Station Status dataset.
        schema: Schema inside ``database``. Defaults to ``"PUBLIC"``.

    Returns:
        The connection object returned by ``snowflake.connector.connect``.
        The caller owns the connection and must close it.
    """
    return snowflake.connector.connect(
        user=connector.user,
        password=connector.password,
        account=connector.account,
        warehouse=connector.warehouse,
        database=database,
        schema=schema,
    )
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d10b4046-2f36-4afd-9aea-286982c310de", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SHORT_NAMESTATION_TYPENAMELONELECTRIC_BIKE_SURCHARGE_WAIVEREXTERNAL_IDLEGACY_IDCAPACITYHAS_KIOSKSTATION_IDREGION_IDEIGHTD_STATION_SERVICESLAT
0\"TA1309000064\"\"classic\"\"Wolcott Ave & Polk St\"-87.673688false\"a3ab86b6-a135-11e9-9cda-0a87ae2ba916\"\"342\"23true\"a3ab86b6-a135-11e9-9cda-0a87ae2ba916\"\"0\"[]41.871262
1\"15575\"\"classic\"\"Broadway & Thorndale Ave\"-87.6601406209false\"a3af2c5f-a135-11e9-9cda-0a87ae2ba916\"\"458\"19true\"a3af2c5f-a135-11e9-9cda-0a87ae2ba916\"\"0\"[]41.98974251144
2\"KA1503000065\"\"classic\"\"Woodlawn Ave & Lake Park Ave\"-87.5970051479false\"a3ad4d1b-a135-11e9-9cda-0a87ae2ba916\"\"413\"15true\"a3ad4d1b-a135-11e9-9cda-0a87ae2ba916\"\"0\"[]41.81409271048
3\"15491\"\"classic\"\"63rd St Beach\"-87.57632374763489false\"a3a547b8-a135-11e9-9cda-0a87ae2ba916\"\"101\"15true\"a3a547b8-a135-11e9-9cda-0a87ae2ba916\"\"0\"[]41.78091096424803
4\"13292\"\"classic\"\"Kedzie Ave & Palmer Ct\"-87.707322false\"a3a9f76a-a135-11e9-9cda-0a87ae2ba916\"\"290\"15true\"a3a9f76a-a135-11e9-9cda-0a87ae2ba916\"\"0\"[]41.921525
..........................................
1365None\"lightweight\"\"Michigan Ave & 102nd St\"-87.61984false\"motivate_CHI_1674190492950080350\"\"1674190492950080350\"10false\"1674190492950080350\"None[]41.7083
1366None\"lightweight\"\"Pullman - Planet Fitness\"-87.59779false\"motivate_CHI_1677249879663712418\"\"1677249879663712418\"10false\"1677249879663712418\"None[]41.69782
1367None\"lightweight\"\"Lamon Ave & Belmont Ave\"-87.7492834false\"motivate_CHI_1563698701206292480\"\"1563698701206292480\"9false\"1563698701206292480\"None[]41.9390108
1368None\"lightweight\"\"Racine Ave & 76th\"-87.654054false\"motivate_CHI_1674190591734328324\"\"1674190591734328324\"10false\"1674190591734328324\"None[]41.755786
1369None\"lightweight\"\"Torrence Ave & 98th St\"-87.559986false\"motivate_CHI_1674190634684001360\"\"1674190634684001360\"10false\"1674190634684001360\"None[]41.717059
\n", + "

1370 rows × 13 columns

\n", + "
" + ], + "text/plain": [ + " SHORT_NAME STATION_TYPE NAME \\\n", + "0 \"TA1309000064\" \"classic\" \"Wolcott Ave & Polk St\" \n", + "1 \"15575\" \"classic\" \"Broadway & Thorndale Ave\" \n", + "2 \"KA1503000065\" \"classic\" \"Woodlawn Ave & Lake Park Ave\" \n", + "3 \"15491\" \"classic\" \"63rd St Beach\" \n", + "4 \"13292\" \"classic\" \"Kedzie Ave & Palmer Ct\" \n", + "... ... ... ... \n", + "1365 None \"lightweight\" \"Michigan Ave & 102nd St\" \n", + "1366 None \"lightweight\" \"Pullman - Planet Fitness\" \n", + "1367 None \"lightweight\" \"Lamon Ave & Belmont Ave\" \n", + "1368 None \"lightweight\" \"Racine Ave & 76th\" \n", + "1369 None \"lightweight\" \"Torrence Ave & 98th St\" \n", + "\n", + " LON ELECTRIC_BIKE_SURCHARGE_WAIVER \\\n", + "0 -87.673688 false \n", + "1 -87.6601406209 false \n", + "2 -87.5970051479 false \n", + "3 -87.57632374763489 false \n", + "4 -87.707322 false \n", + "... ... ... \n", + "1365 -87.61984 false \n", + "1366 -87.59779 false \n", + "1367 -87.7492834 false \n", + "1368 -87.654054 false \n", + "1369 -87.559986 false \n", + "\n", + " EXTERNAL_ID LEGACY_ID CAPACITY \\\n", + "0 \"a3ab86b6-a135-11e9-9cda-0a87ae2ba916\" \"342\" 23 \n", + "1 \"a3af2c5f-a135-11e9-9cda-0a87ae2ba916\" \"458\" 19 \n", + "2 \"a3ad4d1b-a135-11e9-9cda-0a87ae2ba916\" \"413\" 15 \n", + "3 \"a3a547b8-a135-11e9-9cda-0a87ae2ba916\" \"101\" 15 \n", + "4 \"a3a9f76a-a135-11e9-9cda-0a87ae2ba916\" \"290\" 15 \n", + "... ... ... ... 
\n", + "1365 \"motivate_CHI_1674190492950080350\" \"1674190492950080350\" 10 \n", + "1366 \"motivate_CHI_1677249879663712418\" \"1677249879663712418\" 10 \n", + "1367 \"motivate_CHI_1563698701206292480\" \"1563698701206292480\" 9 \n", + "1368 \"motivate_CHI_1674190591734328324\" \"1674190591734328324\" 10 \n", + "1369 \"motivate_CHI_1674190634684001360\" \"1674190634684001360\" 10 \n", + "\n", + " HAS_KIOSK STATION_ID REGION_ID \\\n", + "0 true \"a3ab86b6-a135-11e9-9cda-0a87ae2ba916\" \"0\" \n", + "1 true \"a3af2c5f-a135-11e9-9cda-0a87ae2ba916\" \"0\" \n", + "2 true \"a3ad4d1b-a135-11e9-9cda-0a87ae2ba916\" \"0\" \n", + "3 true \"a3a547b8-a135-11e9-9cda-0a87ae2ba916\" \"0\" \n", + "4 true \"a3a9f76a-a135-11e9-9cda-0a87ae2ba916\" \"0\" \n", + "... ... ... ... \n", + "1365 false \"1674190492950080350\" None \n", + "1366 false \"1677249879663712418\" None \n", + "1367 false \"1563698701206292480\" None \n", + "1368 false \"1674190591734328324\" None \n", + "1369 false \"1674190634684001360\" None \n", + "\n", + " EIGHTD_STATION_SERVICES LAT \n", + "0 [] 41.871262 \n", + "1 [] 41.98974251144 \n", + "2 [] 41.81409271048 \n", + "3 [] 41.78091096424803 \n", + "4 [] 41.921525 \n", + "... ... ... 
import pandas as pd

# SQL query to fetch the static station information
query = "SELECT * FROM STATION_INFO_FLATTEN"

conn = get_connection()
try:
    cur = conn.cursor()
    try:
        # Execute the query and pull every row into memory
        cur.execute(query)
        rows = cur.fetchall()
        # Each entry of cur.description describes one result column;
        # its first field is used as the column name.
        columns = [col[0] for col in cur.description]
    finally:
        # Release the cursor even if the query fails
        cur.close()
finally:
    # Always close the connection so a failed query does not leak it
    conn.close()

# Convert to DataFrame
df = pd.DataFrame(rows, columns=columns)
df
# Settings for the feature group that stores the station information rows,
# keyed by station_id.
stations_fg_args = dict(
    name="chicago_bike_stations",
    version=1,
    description="Chicago bike station details",
    primary_key=["station_id"],
)

# Fetch the feature group if it already exists, otherwise create it.
bike_stations = fs.get_or_create_feature_group(**stations_fg_args)

# Write the DataFrame read from Snowflake into the feature group.
bike_stations.insert(df)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDSTATION_STATUSNUM_BIKES_AVAILABLENUM_EBIKES_AVAILABLELAST_UPDATED
0\"418\"\"active\"202021-10-20 19:52:20
1\"565\"\"active\"222021-10-20 19:52:20
2\"588\"\"active\"752021-10-20 19:52:20
3\"545\"\"active\"102021-10-20 19:52:20
4\"153\"\"active\"812021-10-20 19:52:20
..................
49995\"682\"\"active\"002021-10-20 20:55:35
49996\"1594046362333434512\"\"active\"552021-10-20 20:55:35
49997\"57\"\"active\"532021-10-20 20:55:35
49998\"1448642183732401786\"\"active\"552021-10-20 20:55:35
49999\"280\"\"active\"002021-10-20 20:55:35
\n", + "

50000 rows × 5 columns

import pandas as pd

# Fetch the 50k most recent station status readings.
# DESC is required: without it the rows are sorted oldest-first and
# LIMIT would keep the earliest readings instead of the latest ones.
query = """
    SELECT STATION_ID as id
            , STATION_STATUS as station_status
            , NUM_BIKES_AVAILABLE as num_bikes_available
            , NUM_EBIKES_AVAILABLE as num_ebikes_available
            , LAST_UPDATED as last_updated
    FROM STATION_STATUS_FLATTEN_FULL ORDER BY last_updated DESC LIMIT 50000
"""

conn = get_connection()
try:
    cur = conn.cursor()
    try:
        # Execute the query and pull every row into memory
        cur.execute(query)
        rows = cur.fetchall()
        # Each entry of cur.description describes one result column;
        # its first field is used as the column name.
        columns = [col[0] for col in cur.description]
    finally:
        # Release the cursor even if the query fails
        cur.close()
finally:
    # Always close the connection so a failed query does not leak it
    conn.close()

# Convert to DataFrame
df2 = pd.DataFrame(rows, columns=columns)
df2
from great_expectations.core import ExpectationConfiguration, ExpectationSuite

# Expectation: the "id" column must not contain null values.
id_not_null = ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null",
    kwargs={"column": "id"},
)

# Create an Expectation Suite and register the expectation with it
expectation_suite = ExpectationSuite(expectation_suite_name="transaction_suite")
expectation_suite.add_expectation(id_not_null)
Check `isinstance(dtype, pd.SparseDtype)` instead.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://c.app.hopsworks.ai:443/p/17565/fs/17485/fg/729472\n", + "Validation succeeded.\n", + "Validation Report saved successfully, explore a summary at https://c.app.hopsworks.ai:443/p/17565/fs/17485/fg/729472\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading Dataframe: 100.00% |███████████████████████████████| Rows 50000/50000 | Elapsed Time: 00:09 | Remaining Time: 00:00\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Launching job: chicago_bike_station_status_1_offline_fg_materialization\n", + "Job started successfully, you can follow the progress at \n", + "https://c.app.hopsworks.ai/p/17565/jobs/named/chicago_bike_station_status_1_offline_fg_materialization/executions\n" + ] + }, + { + "data": { + "text/plain": [ + "(,\n", + " {\n", + " \"success\": true,\n", + " \"evaluation_parameters\": {},\n", + " \"statistics\": {\n", + " \"evaluated_expectations\": 1,\n", + " \"successful_expectations\": 1,\n", + " \"unsuccessful_expectations\": 0,\n", + " \"success_percent\": 100.0\n", + " },\n", + " \"meta\": {\n", + " \"great_expectations_version\": \"0.15.12\",\n", + " \"expectation_suite_name\": \"transaction_suite\",\n", + " \"run_id\": {\n", + " \"run_time\": \"2024-04-18T06:00:48.814759+00:00\",\n", + " \"run_name\": null\n", + " },\n", + " \"batch_kwargs\": {\n", + " \"ge_batch_id\": \"011c36b6-fd49-11ee-98f3-00155d1167e0\"\n", + " },\n", + " \"batch_markers\": {},\n", + " \"batch_parameters\": {},\n", + " \"validation_time\": \"20240418T060048.814651Z\",\n", + " \"expectation_suite_meta\": {\n", + " \"great_expectations_version\": \"0.15.12\"\n", + " }\n", + " },\n", + " \"results\": [\n", + " {\n", + " \"success\": true,\n", + " \"result\": {\n", + " \"element_count\": 50000,\n", + " \"unexpected_count\": 
# Feature group for the time-stamped station status readings.
# The description says "status" so it is not confused with the
# chicago_bike_stations group, which holds the station details.
bike_station_status = fs.get_or_create_feature_group(
    name="chicago_bike_station_status",
    version=1,
    description="Chicago bike station status (bike availability over time)",
    primary_key=["id"],
    event_time="last_updated",  # timestamp column of each status reading
    online_enabled=True,
    expectation_suite=expectation_suite,
)

# Write the status rows; the attached expectation suite is evaluated on
# insert and a validation report is returned alongside the ingestion job.
bike_station_status.insert(df2)
# select the features for your model:
# status features from the event-time feature group ...
status_features = bike_station_status.select(["station_status", "num_bikes_available"])
# ... and station attributes from the feature group without event time
station_features = bike_stations.select(["station_type", "capacity", "has_kiosk"])

# Join the two selections on the station identifier
selected_features = status_features.join(
    station_features,
    left_on="id",
    right_on="station_id",
)

# Feature view over the joined query; num_bikes_available is the label
fv = fs.get_or_create_feature_view(
    name="chicago_bike_availability",
    version=1,
    description="Predict bike availability",
    query=selected_features,
    labels=["num_bikes_available"],
)

# Split into train and test sets, holding out 10% of the rows for testing
X_train, X_test, y_train, y_test = fv.train_test_split(test_size=0.1)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
station_statusstation_typecapacityhas_kiosk
0\"active\"\"lightweight\"10false
2\"active\"\"lightweight\"9false
3\"active\"\"lightweight\"9false
4\"active\"\"lightweight\"8false
5\"active\"\"lightweight\"4false
...............
7193\"planned\"\"lightweight\"6false
7194\"active\"\"lightweight\"9false
7195\"active\"\"lightweight\"9false
7196\"active\"\"lightweight\"6false
7197\"active\"\"lightweight\"6false
\n", + "

6478 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " station_status station_type capacity has_kiosk\n", + "0 \"active\" \"lightweight\" 10 false\n", + "2 \"active\" \"lightweight\" 9 false\n", + "3 \"active\" \"lightweight\" 9 false\n", + "4 \"active\" \"lightweight\" 8 false\n", + "5 \"active\" \"lightweight\" 4 false\n", + "... ... ... ... ...\n", + "7193 \"planned\" \"lightweight\" 6 false\n", + "7194 \"active\" \"lightweight\" 9 false\n", + "7195 \"active\" \"lightweight\" 9 false\n", + "7196 \"active\" \"lightweight\" 6 false\n", + "7197 \"active\" \"lightweight\" 6 false\n", + "\n", + "[6478 rows x 4 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1c18c2e-523c-4241-a2b0-f3b0dfac8f6a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/integrations/snowflake/snowflake_feature_pipeline_external.ipynb b/integrations/snowflake/snowflake_feature_pipeline_external.ipynb index 2c59d298..3a52061e 100644 --- a/integrations/snowflake/snowflake_feature_pipeline_external.ipynb +++ b/integrations/snowflake/snowflake_feature_pipeline_external.ipynb @@ -7,9 +7,9 @@ "source": [ "# 👨🏻‍🏫 Snowflake External Feature Group Creation\n", "\n", - "Follow this [guide](https://docs.hopsworks.ai/3.1/user_guides/fs/storage_connector/creation/snowflake/) to set up a connection to Snowflake.\n", + "Follow this [guide](https://docs.hopsworks.ai/latest/user_guides/fs/storage_connector/creation/snowflake/) to set up a 
Snowflake connector in Hopsworks.\n", "\n", - "In addition, you can read about [External Feature Groups](https://docs.hopsworks.ai/3.0/user_guides/fs/feature_group/create_external/)." + "In addition, you can read about [External Feature Groups](https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/create_external/)." ] }, { @@ -159,20 +159,23 @@ ], "metadata": { "kernelspec": { - "display_name": "PySpark", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "pysparkkernel" + "name": "python3" }, "language_info": { "codemirror_mode": { - "name": "python", + "name": "ipython", "version": 3 }, + "file_extension": ".py", "mimetype": "text/x-python", - "name": "pyspark", - "pygments_lexer": "python3" + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +}