diff --git a/integrations/azuresql/AzureSQL_Example.ipynb b/integrations/azuresql/AzureSQL_Example.ipynb new file mode 100644 index 00000000..7b2cd25d --- /dev/null +++ b/integrations/azuresql/AzureSQL_Example.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c44ef758", + "metadata": {}, + "source": [ + "## Example: Connecting Hopsworks with AzureSQL\n", + "\n", + "### Instructions\n", + "\n", + "#### Retrieve the connection details from Azure Portal\n", + "\n", + "You can connect to AzureSQL from Hopsworks using the JDBC connection. You can see your JDBC connection details from the Azure Portal:\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "#### Open the Firewall (Optional)\n", + "Depending on your AzureSQL firewall configuration, you might need to whitelist the Hopsworks IPs with the firewall.\n", + "\n", + "#### Create the storage connector in Hopsworks\n", + "\n", + "All the connection attributes in the screenshot above should be set as `arguments` in a storage connector of type `JDBC` in Hopsworks.\n", + "Additionally you should set an extra `argument` named `driver` with value `com.microsoft.sqlserver.jdbc.SQLServerDriver`\n", + "\n", + "Relevant Documentation: https://docs.hopsworks.ai/latest/user_guides/fs/storage_connector/creation/jdbc/\n", + "\n", + "
\n", + "\n", + "
\n", + "\n", + "#### Download the JDBC Driver JAR and upload it in your project in Hopsworks\n", + "\n", + "You can download the AzureSQL JAR from here: https://learn.microsoft.com/en-us/sql/connect/jdbc/download-microsoft-jdbc-driver-for-sql-server.\n", + "The zip file contains the JAR for both Java 8 and Java 11. You should upload and use the Java 8 JAR \n", + "\n", + "### Use the connector:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "eb09b335", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connection closed.\n", + "Connected. Call `.close()` to terminate connection gracefully.\n", + "\n", + "Logged in to project, explore it here https://snurran.hops.works/p/15480\n", + "Connected. Call `.close()` to terminate connection gracefully." + ] + } + ], + "source": [ + "import hopsworks\n", + "project = hopsworks.login()\n", + "fs = project.get_feature_store()" + ] + }, + { + "cell_type": "markdown", + "id": "f6266ba5", + "metadata": {}, + "source": [ + "#### Retrieve the Storage Connector using the APIs" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "01cd8d8e", + "metadata": {}, + "outputs": [], + "source": [ + "sc = fs.get_storage_connector(\"azure_sql\")" + ] + }, + { + "cell_type": "markdown", + "id": "e14ebb7c", + "metadata": {}, + "source": [ + "#### Example 1: External Feature Groups \n", + "\n", + "Use the storage connector to create an external feature group in Hopsworks.\n", + " \n", + "
\n", + "These APIs are only supported in a (Py)Spark Execution Engine\n", + "
\n", + "\n", + "Specify a Query (e.g. `SELECT * FROM test`) to execute every time the feature data is needed to create a new training dataset.\n", + "With external feature groups, the offline data stays in the external system and the query is executed every time.\n", + "\n", + "Relevant Documentation: https://docs.hopsworks.ai/latest/user_guides/fs/feature_group/create_external/" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "36b5688c", + "metadata": {}, + "outputs": [], + "source": [ + "external_feature_group = fs.create_external_feature_group(\n", + " name=\"profiles_upstream\",\n", + " version=1,\n", + " storage_connector = sc,\n", + " query=\"SELECT * FROM test\",\n", + " statistics_config={'histograms': True, 'correlations': True}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "6f26abf4", + "metadata": {}, + "outputs": [], + "source": [ + "external_feature_group.save()" + ] + }, + { + "cell_type": "markdown", + "id": "e80dc773", + "metadata": {}, + "source": [ + "#### Example 2: Derived Feature Groups \n", + "\n", + "Use the previously created external feature group as data source to create additional derived features:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "0e0bcd82", + "metadata": {}, + "outputs": [], + "source": [ + "external_feature_group = fs.get_external_feature_group(name=\"profiles_upstream\", version=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c1df358d", + "metadata": {}, + "outputs": [], + "source": [ + "profiles_df = external_feature_group.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e56f442d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------+---+--------------------+----------+------------+-------+----------------+\n", + "| name|sex| mail| birthdate| city|country| cc_num|\n", + "+-------------+---+--------------------+----------+------------+-------+----------------+\n", + "|Tonya Gregory| F|sandratorres@hotm...|1976-01-16|Far Rockaway| US|4796807885357879|\n", + "| Lisa Gilbert| F| michael53@yahoo.com|1986-09-30| Encinitas| US|4529266636192966|\n", + "|Carolyn Meyer| F| anthony47@yahoo.com|2001-07-13| Canton| US|4922690008243953|\n", + "| Sara Morris| F| amylloyd@yahoo.com|1938-06-23| Greenpoint| US|4897369589533543|\n", + "| Paul Ashley| M|matthew97@hotmail...|1974-12-06| Rutland| US|4848518335893425|\n", + "+-------------+---+--------------------+----------+------------+-------+----------------+\n", + "only showing top 5 rows" + ] + } + ], + "source": [ + "profiles_df.show(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "02fd1936", + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "\n", + "derived_df = profiles_df.withColumn(\"age\", F.floor(F.datediff(F.current_date(), F.to_date(F.col('birthdate'), 'yyyy-mm-dd'))/365.25))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "097c0a51", + "metadata": {}, + "outputs": [], + "source": [ + "derived_fg = fs.get_or_create_feature_group(\n", + " name=\"profiles_derived\",\n", + " version=1,\n", + " primary_key=['mail'],\n", + " online_enabled=True,\n", + " parents=[external_feature_group],\n", + " statistics_config={'histograms': True, 'correlations': True}\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "512e9007", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature Group created successfully, explore it at \n", + "https://snurran.hops.works/p/15480/fs/15428/fg/16402\n", + "(None, None)" + ] + } + ], + "source": [ + "derived_fg.insert(derived_df)" + ] + }, + { + "cell_type": "markdown", + "id": "4ac72cae", + "metadata": {}, + "source": [ + "#### Example 3: Create a Training Dataset\n", + "\n", + "
\n", + "These APIs are also supported from a Python Engine:\n", + " \n", + " - https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_view_api/#create_train_test_split\n", + " \n", + " - https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_view_api/#create_training_data\n", + " \n", + " - https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_view_api/#create_train_validation_test_split\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42bb554f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature view created successfully, explore it at \n", + "https://snurran.hops.works/p/15480/fs/15428/fv/azure_sql_demo/version/1" + ] + } + ], + "source": [ + "fv = fs.create_feature_view(\n", + " name=\"azure_sql_demo\",\n", + " version=1,\n", + " query=external_feature_group.select_all(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ff1437f5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, None)\n", + "VersionWarning: Incremented version to `1`." + ] + } + ], + "source": [ + "fv.create_training_data()" + ] + }, + { + "cell_type": "markdown", + "id": "12bc3908", + "metadata": {}, + "source": [ + "#### Example 4: Use the Storage Connector to read the data in a Spark DataFrame\n", + "\n", + "You can use this option if you want to retrieve raw data to create features without having to create an external feature group" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "214f89f4", + "metadata": {}, + "outputs": [], + "source": [ + "df = sc.read(query=\"\"\"\n", + " SELECT *\n", + " FROM test\n", + " WHERE city = 'Canton'\n", + "\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c082a195", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+----------------+---+--------------------+----------+------+-------+----------------+\n", + "| name|sex| mail| birthdate| City|Country| cc_num|\n", + "+----------------+---+--------------------+----------+------+-------+----------------+\n", + "| Carolyn Meyer| F| anthony47@yahoo.com|2001-07-13|Canton| US|4922690008243953|\n", + "|Brandon Mitchell| M| ajackson@yahoo.com|1967-03-25|Canton| US|4928442302922211|\n", + "| John Sutton| M| qcalderon@gmail.com|1998-10-18|Canton| US|4459273780148699|\n", + "| Taylor Pitts| F|cohenrussell@gmai...|1989-10-08|Canton| US|4038150065544828|\n", + "| Larry Andrews| M| annette68@yahoo.com|1938-02-06|Canton| US|4421833463642311|\n", + "| Hector Cook| M| utucker@hotmail.com|1963-10-04|Canton| US|4069293169784098|\n", + "| Abigail Murray| F|rodriguezjulie@ho...|1971-02-01|Canton| US|4839891313999949|\n", + "|Jessica Gonzales| F|rphillips@hotmail...|1936-01-09|Canton| US|4179358160776166|\n", + "| Anthony Fowler| M| jason33@yahoo.com|1978-10-08|Canton| US|4752254878774038|\n", + "+----------------+---+--------------------+----------+------+-------+----------------+" + ] + } + ], + "source": [ + "df.show(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1553d8f9", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PySpark", + "language": "python", + "name": "pysparkkernel" + }, + "language_info": { + "codemirror_mode": { + "name": "python", + "version": 3 + }, + "mimetype": "text/x-python", + "name": "pyspark", + "pygments_lexer": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/integrations/azuresql/images/AzureSQL_ConnDetails.png b/integrations/azuresql/images/AzureSQL_ConnDetails.png new file mode 100644 index 00000000..25ee33b0 Binary files /dev/null and b/integrations/azuresql/images/AzureSQL_ConnDetails.png differ diff --git a/integrations/azuresql/images/Hopsworks_JDBC_SC.png b/integrations/azuresql/images/Hopsworks_JDBC_SC.png new file mode 100644 index 00000000..fa0292b5 Binary files /dev/null and b/integrations/azuresql/images/Hopsworks_JDBC_SC.png differ