diff --git a/docs/js/inject-api-links.js b/docs/js/inject-api-links.js index 6c8a4a3b3..89082c67d 100644 --- a/docs/js/inject-api-links.js +++ b/docs/js/inject-api-links.js @@ -1,18 +1,17 @@ window.addEventListener("DOMContentLoaded", function () { var windowPathNameSplits = window.location.pathname.split("/"); - var majorVersionRegex = new RegExp("(\\d+[.]\\d+)") + var majorVersionRegex = new RegExp("(\\d+[.]\\d+)"); var latestRegex = new RegExp("latest"); - if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/3.0 - URL contains major version + if (majorVersionRegex.test(windowPathNameSplits[1])) { // On landing page docs.hopsworks.api/4.0 - URL contains major version // Version API dropdown document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/generated/api/login/"; - document.getElementById("hsfs_api_link").href = "https://docs.hopsworks.ai/feature-store-api/" + windowPathNameSplits[1] + "/generated/api/connection_api/"; - document.getElementById("hsml_api_link").href = "https://docs.hopsworks.ai/machine-learning-api/" + windowPathNameSplits[1] + "/generated/connection_api/"; - } else { // on docs.hopsworks.api/feature-store-api/3.0 / docs.hopsworks.api/hopsworks-api/3.0 / docs.hopsworks.api/machine-learning-api/3.0 + document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + windowPathNameSplits[1] + "/javadoc"; + } else { // on / docs.hopsworks.api/hopsworks-api/4.0 if (latestRegex.test(windowPathNameSplits[2]) || latestRegex.test(windowPathNameSplits[1])) { - var majorVersion = "latest"; + var majorVersion = "latest"; } else { - var apiVersion = windowPathNameSplits[2]; - var majorVersion = apiVersion.match(majorVersionRegex)[0]; + var apiVersion = windowPathNameSplits[2]; + var majorVersion = apiVersion.match(majorVersionRegex)[0]; } // Version main navigation document.getElementsByClassName("md-tabs__link")[0].href = "https://docs.hopsworks.ai/" + majorVersion; @@ -24,8 +23,6 @@ window.addEventListener("DOMContentLoaded", function () { document.getElementsByClassName("md-tabs__link")[6].href = "https://docs.hopsworks.ai/" + majorVersion + "/admin/"; // Version API dropdown document.getElementById("hopsworks_api_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/generated/api/login/"; - document.getElementById("hsfs_api_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/generated/api/connection_api/"; - document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/feature-store-api/" + majorVersion + "/javadoc"; - document.getElementById("hsml_api_link").href = "https://docs.hopsworks.ai/machine-learning-api/" + majorVersion + "/generated/connection_api/"; + document.getElementById("hsfs_javadoc_link").href = "https://docs.hopsworks.ai/hopsworks-api/" + majorVersion + "/javadoc"; } }); diff --git a/java/beam/pom.xml b/java/beam/pom.xml index b240612d9..c90394fa5 100644 --- a/java/beam/pom.xml +++ b/java/beam/pom.xml @@ -5,7 +5,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java index c059520f7..db01f295a 100644 --- a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/FeatureStore.java @@ -160,7 +160,7 @@ public 
StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver @Override public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, StatisticsConfig statisticsConfig, String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { throw new UnsupportedOperationException("Not supported for Beam"); } diff --git a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java index 9d3c41ee6..e74b51ade 100644 --- a/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java +++ b/java/beam/src/main/java/com/logicalclocks/hsfs/beam/StreamFeatureGroup.java @@ -17,6 +17,14 @@ package com.logicalclocks.hsfs.beam; +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.beam.sdk.values.PCollection; + import com.logicalclocks.hsfs.Feature; import com.logicalclocks.hsfs.FeatureGroupBase; import com.logicalclocks.hsfs.FeatureStoreException; @@ -26,19 +34,14 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.StorageConnector; -import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.beam.engine.BeamProducer; +import com.logicalclocks.hsfs.beam.engine.FeatureGroupEngine; import com.logicalclocks.hsfs.constructor.QueryBase; import com.logicalclocks.hsfs.metadata.Statistics; + import lombok.Builder; import lombok.NonNull; -import org.apache.beam.sdk.values.PCollection; - -import java.io.IOException; -import java.text.ParseException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; public class StreamFeatureGroup extends FeatureGroupBase> { @@ -48,8 +51,9 @@ public class StreamFeatureGroup extends FeatureGroupBase> { @Builder public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, String onlineTopicName, - String eventTime, OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String eventTime, + OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { this(); this.featureStore = featureStore; this.name = name; @@ -61,6 +65,7 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; this.onlineEnabled = onlineEnabled; + this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); this.onlineTopicName = onlineTopicName; diff --git a/java/flink/pom.xml b/java/flink/pom.xml index 7e39ece2a..11564004f 100644 --- a/java/flink/pom.xml +++ b/java/flink/pom.xml @@ -5,7 +5,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java index b6314bad4..60dcbaeb6 100644 --- a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/FeatureStore.java @@ -165,8 +165,9 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, - OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, + String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { throw new UnsupportedOperationException("Not supported for Flink"); } diff --git a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java index c3cd6cbd0..0fa821fb3 100644 --- a/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java +++ b/java/flink/src/main/java/com/logicalclocks/hsfs/flink/StreamFeatureGroup.java @@ -17,6 +17,15 @@ package com.logicalclocks.hsfs.flink; +import java.io.IOException; +import java.text.ParseException; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.DataStreamSink; + import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.logicalclocks.hsfs.Feature; import com.logicalclocks.hsfs.FeatureGroupBase; @@ -27,22 +36,14 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.StorageConnector; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.constructor.QueryBase; - +import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; import com.logicalclocks.hsfs.metadata.Statistics; -import com.logicalclocks.hsfs.flink.engine.FeatureGroupEngine; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.NonNull; -import org.apache.flink.streaming.api.datastream.DataStream; -import org.apache.flink.streaming.api.datastream.DataStreamSink; - -import java.io.IOException; -import java.text.ParseException; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; @AllArgsConstructor @JsonIgnoreProperties(ignoreUnknown = true) @@ -53,9 +54,9 @@ public class StreamFeatureGroup extends FeatureGroupBase> { @Builder public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, - String onlineTopicName, String topicName, String notificationTopicName, String eventTime, - OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, 
List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String topicName, String notificationTopicName, + String eventTime, OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { this(); this.featureStore = featureStore; this.name = name; @@ -67,6 +68,7 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; this.onlineEnabled = onlineEnabled; + this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig(); this.onlineTopicName = onlineTopicName; diff --git a/java/hsfs/pom.xml b/java/hsfs/pom.xml index c56061427..b7bd606c2 100644 --- a/java/hsfs/pom.xml +++ b/java/hsfs/pom.xml @@ -5,7 +5,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java index ad391ef90..057838cad 100644 --- a/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/FeatureStoreBase.java @@ -122,8 +122,8 @@ public abstract Object getOrCreateStreamFeatureGroup(String name, Integer versio public abstract Object getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, - OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, StatisticsConfig statisticsConfig, + String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException; public abstract Object createExternalFeatureGroup(); diff --git a/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java b/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java index 4e0fb0419..d6c3d0b2e 100644 --- a/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java +++ b/java/hsfs/src/main/java/com/logicalclocks/hsfs/TimeTravelFormat.java @@ -19,5 +19,6 @@ public enum TimeTravelFormat { NONE, - HUDI + HUDI, + DELTA } diff --git a/java/pom.xml b/java/pom.xml index cc3dd776c..0a5cc707f 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -7,7 +7,7 @@ com.logicalclocks hsfs-parent pom - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT hsfs spark diff --git a/java/spark/pom.xml b/java/spark/pom.xml index 185da5d20..4c2d188fb 100644 --- a/java/spark/pom.xml +++ b/java/spark/pom.xml @@ -22,7 +22,7 @@ hsfs-parent com.logicalclocks - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 4.0.0 diff --git a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java index 33e3b6058..65dbc66d7 100644 --- a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java +++ b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/FeatureStore.java @@ -404,7 +404,7 @@ public StreamFeatureGroup.StreamFeatureGroupBuilder createStreamFeatureGroup() { public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version) throws IOException, FeatureStoreException { return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, null, - null, null, null, false, null, null, 
null); + null, null, null, false, TimeTravelFormat.HUDI, null, null, null); } /** @@ -438,7 +438,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver boolean onlineEnabled, String eventTime) throws IOException, FeatureStoreException { return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, null, - primaryKeys, null, null, onlineEnabled, null, eventTime, null); + primaryKeys, null, null, onlineEnabled, TimeTravelFormat.HUDI, null, eventTime, null); } /** @@ -477,7 +477,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, null, - primaryKeys, partitionKeys, null, onlineEnabled, null, eventTime, null); + primaryKeys, partitionKeys, null, onlineEnabled, TimeTravelFormat.HUDI, null, eventTime, null); } /** @@ -506,6 +506,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver * the first primary key of the feature group will be used as hudi precombine key. * @param onlineEnabled Define whether the feature group should be made available also in the online feature store * for low latency access. + * @param timeTravelFormat Format used for time travel, defaults to `"HUDI"`. * @param statisticsConfig A configuration object, to generally enable descriptive statistics computation for * this feature group, `"correlations`" to turn on feature correlation computation, * `"histograms"` to compute feature value frequencies and `"exact_uniqueness"` to compute @@ -523,13 +524,14 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer ver public StreamFeatureGroup getOrCreateStreamFeatureGroup(String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, boolean onlineEnabled, - StatisticsConfig statisticsConfig, String eventTime, - OnlineConfig onlineConfig) + TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, + String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { return featureGroupEngine.getOrCreateStreamFeatureGroup(this, name, version, description, - primaryKeys, partitionKeys, hudiPrecombineKey, onlineEnabled, statisticsConfig, eventTime, - onlineConfig); + primaryKeys, partitionKeys, hudiPrecombineKey, onlineEnabled, timeTravelFormat, + statisticsConfig, eventTime, onlineConfig); } /** diff --git a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java index 0c8b9bae3..4f423e8f3 100644 --- a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java +++ b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/StreamFeatureGroup.java @@ -17,13 +17,23 @@ package com.logicalclocks.hsfs.spark; -import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.io.IOException; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; -import com.logicalclocks.hsfs.spark.constructor.Query; -import com.logicalclocks.hsfs.spark.engine.FeatureGroupEngine; -import com.logicalclocks.hsfs.spark.engine.StatisticsEngine; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.streaming.StreamingQuery; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; 
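The Java changes above thread a `TimeTravelFormat` argument (with the new `DELTA` value) through the `StreamFeatureGroup` builders and the `getOrCreateStreamFeatureGroup` overloads, falling back to `HUDI` when it is null. A minimal sketch of choosing the format from the Python client, assuming it exposes the matching `time_travel_format` option; the feature group name and primary key below are placeholders:

```python
import hopsworks

# Hedged sketch: assumes the Python client mirrors the Java change via the
# `time_travel_format` string option ("HUDI" default, "DELTA" newly allowed).
project = hopsworks.login()          # credentials resolved from the environment
fs = project.get_feature_store()

fg = fs.get_or_create_feature_group(
    name="transactions_fg",          # hypothetical feature group
    version=1,
    primary_key=["tx_id"],           # hypothetical primary key
    online_enabled=True,
    stream=True,
    time_travel_format="DELTA",      # pick Delta instead of the Hudi default
)
```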
import com.logicalclocks.hsfs.EntityEndpointType; import com.logicalclocks.hsfs.Feature; +import com.logicalclocks.hsfs.FeatureGroupBase; import com.logicalclocks.hsfs.FeatureStoreException; import com.logicalclocks.hsfs.HudiOperationType; import com.logicalclocks.hsfs.JobConfiguration; @@ -31,26 +41,16 @@ import com.logicalclocks.hsfs.StatisticsConfig; import com.logicalclocks.hsfs.Storage; import com.logicalclocks.hsfs.StorageConnector; -import com.logicalclocks.hsfs.FeatureGroupBase; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.metadata.Statistics; +import com.logicalclocks.hsfs.spark.constructor.Query; +import com.logicalclocks.hsfs.spark.engine.FeatureGroupEngine; +import com.logicalclocks.hsfs.spark.engine.StatisticsEngine; import lombok.AllArgsConstructor; import lombok.Builder; import lombok.NonNull; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.streaming.StreamingQuery; - -import java.io.IOException; -import java.text.ParseException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - @AllArgsConstructor @JsonIgnoreProperties(ignoreUnknown = true) public class StreamFeatureGroup extends FeatureGroupBase> { @@ -61,9 +61,10 @@ public class StreamFeatureGroup extends FeatureGroupBase> { @Builder public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, List features, StatisticsConfig statisticsConfig, - String onlineTopicName, String topicName, String notificationTopicName, String eventTime, - OnlineConfig onlineConfig, StorageConnector storageConnector, String path) { + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List features, + StatisticsConfig statisticsConfig, String onlineTopicName, String topicName, + String notificationTopicName, String eventTime, OnlineConfig onlineConfig, + StorageConnector storageConnector, String path) { this(); this.featureStore = featureStore; this.name = name; @@ -75,6 +76,7 @@ public StreamFeatureGroup(FeatureStore featureStore, @NonNull String name, Integ ? partitionKeys.stream().map(String::toLowerCase).collect(Collectors.toList()) : null; this.hudiPrecombineKey = hudiPrecombineKey != null ? hudiPrecombineKey.toLowerCase() : null; this.onlineEnabled = onlineEnabled; + this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI; this.features = features; this.statisticsConfig = statisticsConfig != null ? 
statisticsConfig : new StatisticsConfig(); this.onlineTopicName = onlineTopicName; diff --git a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java index 96ddfd5f2..f791d8bcd 100644 --- a/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java +++ b/java/spark/src/main/java/com/logicalclocks/hsfs/spark/engine/FeatureGroupEngine.java @@ -364,7 +364,8 @@ public List getFeatureGroups(FeatureStore featureStore, String fgN public StreamFeatureGroup getOrCreateStreamFeatureGroup(FeatureStore featureStore, String name, Integer version, String description, List primaryKeys, List partitionKeys, String hudiPrecombineKey, - boolean onlineEnabled, StatisticsConfig statisticsConfig, + boolean onlineEnabled, TimeTravelFormat timeTravelFormat, + StatisticsConfig statisticsConfig, String eventTime, OnlineConfig onlineConfig) throws IOException, FeatureStoreException { StreamFeatureGroup featureGroup; @@ -381,6 +382,7 @@ public StreamFeatureGroup getOrCreateStreamFeatureGroup(FeatureStore featureStor .partitionKeys(partitionKeys) .hudiPrecombineKey(hudiPrecombineKey) .onlineEnabled(onlineEnabled) + .timeTravelFormat(timeTravelFormat) .statisticsConfig(statisticsConfig) .eventTime(eventTime) .onlineConfig(onlineConfig) diff --git a/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java b/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java index bedd9716e..86a85bbdc 100644 --- a/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java +++ b/java/spark/src/test/java/com/logicalclocks/hsfs/spark/TestFeatureGroup.java @@ -20,6 +20,7 @@ import com.logicalclocks.hsfs.Feature; import com.logicalclocks.hsfs.FeatureStoreException; import com.logicalclocks.hsfs.Project; +import com.logicalclocks.hsfs.TimeTravelFormat; import com.logicalclocks.hsfs.metadata.FeatureGroupApi; import com.logicalclocks.hsfs.FeatureGroupBase; import com.logicalclocks.hsfs.metadata.HopsworksClient; @@ -67,7 +68,7 @@ public void testFeatureGroupPrimaryKey() { StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("primaryKey"), Collections.singletonList("partitionKey"), "hudiPrecombineKey", - true, features, null, "onlineTopicName", null, null, null, null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, null, null, null, null); Exception pkException = assertThrows(FeatureStoreException.class, () -> { featureGroupEngine.saveFeatureGroupMetaData(featureGroup, @@ -93,7 +94,7 @@ public void testFeatureGroupEventTimeFeature() { StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("featureA"), null, null, - true, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); Exception eventTimeException = assertThrows(FeatureStoreException.class, () -> { streamFeatureGroupEngine.saveFeatureGroupMetaData(featureGroup, @@ -119,7 +120,7 @@ public void testFeatureGroupPartitionPrecombineKeys() { StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("featureA"), Collections.singletonList("partitionKey"), "hudiPrecombineKey", - true, features, null, "onlineTopicName", 
null, null, null, null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, null, null, null, null); Exception partitionException = assertThrows(FeatureStoreException.class, () -> { streamFeatureGroupEngine.saveFeatureGroupMetaData(featureGroup, @@ -164,7 +165,7 @@ public void testFeatureGroupAppendFeaturesResetSubject() throws FeatureStoreExce StreamFeatureGroup featureGroup = new StreamFeatureGroup(featureStore, "fgName", 1, "description", Collections.singletonList("featureA"), null, null, - true, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); + true, TimeTravelFormat.HUDI, features, null, "onlineTopicName", null, null, "eventTime", null, null, null); featureGroup.featureGroupEngine = featureGroupEngine; // Act diff --git a/locust_benchmark/Dockerfile b/locust_benchmark/Dockerfile index e437ab9b2..47ef44106 100644 --- a/locust_benchmark/Dockerfile +++ b/locust_benchmark/Dockerfile @@ -1,4 +1,4 @@ -FROM locustio/locust:2.17.0 +FROM locustio/locust:2.23.1 USER root diff --git a/locust_benchmark/Jenkinsfile b/locust_benchmark/Jenkinsfile new file mode 100644 index 000000000..9d4465e97 --- /dev/null +++ b/locust_benchmark/Jenkinsfile @@ -0,0 +1,20 @@ +@Library("jenkins-library@main") + +import com.logicalclocks.jenkins.k8s.ImageBuilder + + +node("local") { + stage('Clone repository') { + checkout scm + } + + stage('Build and push image(s)') { + version = readFile "${env.WORKSPACE}/locust_benchmark/KUBE_IMAGE_VERSION" + withEnv(["VERSION=${version.trim()}"]) { + + def builder = new ImageBuilder(this) + m = readFile "${env.WORKSPACE}/locust_benchmark/build-manifest.json" + builder.run(m) + } + } +} \ No newline at end of file diff --git a/locust_benchmark/KUBE_IMAGE_VERSION b/locust_benchmark/KUBE_IMAGE_VERSION new file mode 100644 index 000000000..8b25206ff --- /dev/null +++ b/locust_benchmark/KUBE_IMAGE_VERSION @@ -0,0 +1 @@ +master \ No newline at end of file diff --git a/locust_benchmark/README.md b/locust_benchmark/README.md index c390b39db..eda8b440e 100644 --- a/locust_benchmark/README.md +++ b/locust_benchmark/README.md @@ -87,6 +87,7 @@ echo "[YOUR KEY]" > .api_key - `schema_repetitions`: This controls the number of features for the lookup. One schema repetition will result in 10 features plus primary key. Five repetitions will result in 50 features plus primary key. - `recreate_feature_group`: This controls if the previous feature group should be dropped and recreated. Set this to true when rerunning the benchmark with different size of rows or schema repetitions. - `batch_size`: This is relevant for the actual benchmark and controls how many feature vectors are looked up in the batch benchmark. +- `tablespace`: (Optional) If set creates a feature group using on-disk data. 3. 
Create the feature group diff --git a/locust_benchmark/build-manifest.json b/locust_benchmark/build-manifest.json new file mode 100644 index 000000000..48599c6dc --- /dev/null +++ b/locust_benchmark/build-manifest.json @@ -0,0 +1,8 @@ +[ + { + "name": "hopsworks/locust-hsfs", + "version": "env:VERSION", + "dockerFile": "locust_benchmark/Dockerfile", + "canUseCache": "true" + } +] diff --git a/locust_benchmark/common/hopsworks_client.py b/locust_benchmark/common/hopsworks_client.py index b9fbcae04..d82409892 100644 --- a/locust_benchmark/common/hopsworks_client.py +++ b/locust_benchmark/common/hopsworks_client.py @@ -7,10 +7,8 @@ import pandas as pd from locust.runners import MasterRunner, LocalRunner -import hsfs -from hsfs import client -from hsfs.client.exceptions import RestAPIError +import hopsworks class HopsworksClient: @@ -21,14 +19,14 @@ def __init__(self, environment=None): environment.runner, (MasterRunner, LocalRunner) ): print(self.hopsworks_config) - self.connection = hsfs.connection( + self.project = hopsworks.login( project=self.hopsworks_config.get("project", "test"), host=self.hopsworks_config.get("host", "localhost"), port=self.hopsworks_config.get("port", 443), api_key_file=".api_key", engine="python", ) - self.fs = self.connection.get_feature_store() + self.fs = self.project.get_feature_store() # test settings self.external = self.hopsworks_config.get("external", False) @@ -38,6 +36,7 @@ def __init__(self, environment=None): "recreate_feature_group", False ) self.batch_size = self.hopsworks_config.get("batch_size", 100) + self.tablespace = self.hopsworks_config.get("tablespace", None) def get_or_create_fg(self): locust_fg = self.fs.get_or_create_feature_group( @@ -46,6 +45,7 @@ def get_or_create_fg(self): primary_key=["ip"], online_enabled=True, stream=True, + online_config={'table_space': self.tablespace} if self.tablespace else None ) return locust_fg @@ -59,18 +59,15 @@ def insert_data(self, locust_fg): return locust_fg def get_or_create_fv(self, fg=None): - try: - return self.fs.get_feature_view("locust_fv", version=1) - except RestAPIError: - return self.fs.create_feature_view( - name="locust_fv", - query=fg.select_all(), - version=1, - ) + if fg is None: + fg = self.get_or_create_fg() + return self.fs.get_or_create_feature_view( + name="locust_fv", version=1, query=fg.select_all() + ) def close(self): - if client._client is not None: - self.connection.close() + if self.project is not None: + hopsworks.logout() def generate_insert_df(self, rows, schema_repetitions): data = {"ip": range(0, rows)} diff --git a/locust_benchmark/create_feature_group.py b/locust_benchmark/create_feature_group.py index 2ac6cf568..dbc237e27 100644 --- a/locust_benchmark/create_feature_group.py +++ b/locust_benchmark/create_feature_group.py @@ -4,4 +4,5 @@ hopsworks_client = HopsworksClient() fg = hopsworks_client.get_or_create_fg() hopsworks_client.insert_data(fg) + hopsworks_client.get_or_create_fv() hopsworks_client.close() diff --git a/locust_benchmark/hopsworks_config.json b/locust_benchmark/hopsworks_config.json index 6a8e60862..6e92b6739 100644 --- a/locust_benchmark/hopsworks_config.json +++ b/locust_benchmark/hopsworks_config.json @@ -1,10 +1,11 @@ { - "host": "localhost", + "host": "mercury.hops.works", "port": 443, - "project": "test", + "project": "fabio_demo", "external": true, - "rows": 100000, + "rows": 1000, "schema_repetitions": 1, "recreate_feature_group": true, - "batch_size": 100 + "batch_size": 100, + "tablespace": "ts1" } diff --git a/locust_benchmark/locustfile.py 
b/locust_benchmark/locustfile.py index d2d3ff933..105d80abd 100644 --- a/locust_benchmark/locustfile.py +++ b/locust_benchmark/locustfile.py @@ -3,7 +3,7 @@ from common.hopsworks_client import HopsworksClient from common.stop_watch import stopwatch from locust import HttpUser, User, task, constant, events -from locust.runners import MasterRunner, LocalRunner +from locust.runners import MasterRunner from urllib3 import PoolManager import nest_asyncio @@ -11,12 +11,8 @@ @events.init.add_listener def on_locust_init(environment, **kwargs): print("Locust process init") - - if isinstance(environment.runner, (MasterRunner, LocalRunner)): - # create feature view - environment.hopsworks_client = HopsworksClient(environment) - fg = environment.hopsworks_client.get_or_create_fg() - environment.hopsworks_client.get_or_create_fv(fg) + environment.hopsworks_client = HopsworksClient(environment) + environment.hopsworks_client.get_or_create_fg() @events.quitting.add_listener @@ -61,27 +57,21 @@ def get_feature_vector(self): class MySQLFeatureVectorLookup(User): - wait_time = constant(0) - weight = 5 - # fixed_count = 1 + wait_time = constant(0.001) + weight = 2 def __init__(self, environment): super().__init__(environment) - self.env = environment - self.client = HopsworksClient(environment) - self.fv = self.client.get_or_create_fv() + self.client = environment.hopsworks_client def on_start(self): - print("Init user") + self.fv = self.client.get_or_create_fv() self.fv.init_serving(external=self.client.external) nest_asyncio.apply() - def on_stop(self): - print("Closing user") - @task def get_feature_vector(self): - self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)}) + return self._get_feature_vector({"ip": random.randint(0, self.client.rows - 1)}) @stopwatch def _get_feature_vector(self, pk): @@ -89,14 +79,12 @@ def _get_feature_vector(self, pk): class MySQLFeatureVectorBatchLookup(User): - wait_time = constant(0) + wait_time = constant(0.001) weight = 1 - # fixed_count = 1 def __init__(self, environment): super().__init__(environment) - self.env = environment - self.client = HopsworksClient(environment) + self.client = environment.hopsworks_client self.fv = self.client.get_or_create_fv() def on_start(self): @@ -104,16 +92,13 @@ def on_start(self): self.fv.init_serving(external=self.client.external) nest_asyncio.apply() - def on_stop(self): - print("Closing user") - @task def get_feature_vector_batch(self): pks = [ {"ip": random.randint(0, self.client.rows - 1)} for i in range(self.client.batch_size) ] - self._get_feature_vectors(pks) + return self._get_feature_vectors(pks) @stopwatch def _get_feature_vectors(self, pk): diff --git a/locust_benchmark/requirements.txt b/locust_benchmark/requirements.txt index 2eef53a7f..d992f8066 100644 --- a/locust_benchmark/requirements.txt +++ b/locust_benchmark/requirements.txt @@ -1,3 +1,4 @@ markupsafe==2.0.1 -locust==2.17.0 +locust==2.23.1 +nest_asyncio==1.6.0 git+https://github.com/logicalclocks/hopsworks-api@main#egg=hopsworks[python]&subdirectory=python \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 823e3c8f2..2341c5ae1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -17,69 +17,69 @@ nav: - Setup and Installation: https://docs.hopsworks.ai/ - Administration: https://docs.hopsworks.ai/ - API: + - Login: generated/api/login.md - Platform API: - - Login: generated/api/login.md - Connection: generated/api/connection.md - - Projects: generated/api/projects.md - - Jobs: generated/api/jobs.md + - Datasets: 
generated/api/datasets.md + - Environment: generated/api/environment.md - Executions: generated/api/executions.md - FlinkCluster: generated/api/flink_cluster.md - - Environment: generated/api/environment.md - - GitRepo: generated/api/git_repo.md - GitProvider: generated/api/git_provider.md - GitRemote: generated/api/git_remote.md - - Datasets: generated/api/datasets.md - - KafkaTopic: generated/api/kafka_topic.md + - GitRepo: generated/api/git_repo.md + - Jobs: generated/api/jobs.md - KafkaSchema: generated/api/kafka_schema.md - - Secrets: generated/api/secrets.md + - KafkaTopic: generated/api/kafka_topic.md - OpenSearch: generated/api/opensearch.md + - Projects: generated/api/projects.md + - Secrets: generated/api/secrets.md - Feature Store API: + - Embedding: + - EmbeddingFeature: generated/api/embedding_feature_api.md + - EmbeddingIndex: generated/api/embedding_index_api.md + - SimilarityFunctionType: generated/api/similarity_function_type_api.md - ExpectationSuite: generated/api/expectation_suite_api.md - - FeatureStore: generated/api/feature_store_api.md - - FeatureGroup: generated/api/feature_group_api.md - ExternalFeatureGroup: generated/api/external_feature_group_api.md - - SpineGroup: generated/api/spine_group_api.md - - FeatureView: generated/api/feature_view_api.md - - TrainingDataset: generated/api/training_dataset_api.md - - Storage Connector: generated/api/storage_connector_api.md - Feature: generated/api/feature_api.md + - Feature Monitoring: + - Configuration: generated/api/feature_monitoring_config_api.md + - Result: generated/api/feature_monitoring_result_api.md + - Window: generated/api/feature_monitoring_window_config_api.md + - FeatureGroup: generated/api/feature_group_api.md + - FeatureStore: generated/api/feature_store_api.md + - FeatureView: generated/api/feature_view_api.md + - Provenance Links: generated/api/links.md - Query: generated/api/query_api.md + - SpineGroup: generated/api/spine_group_api.md + - Statistics: + - Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md + - Split Statistics: generated/api/split_statistics_api.md + - Statistics: generated/api/statistics_api.md + - Storage Connector: generated/api/storage_connector_api.md + - TrainingDataset: generated/api/training_dataset_api.md - Transformation Functions: - - UDF: generated/api/udf.md - HopsworksUDF: generated/api/hopsworks_udf.md - - TransformationFunction: generated/api/transformation_functions_api.md - Transformation Statistics: - - TransformationStatistics: generated/api/transformation_statistics.md - FeatureTransformationStatistics: generated/api/feature_transformation_statistics.md + - TransformationStatistics: generated/api/transformation_statistics.md + - TransformationFunction: generated/api/transformation_functions_api.md + - UDF: generated/api/udf.md - ValidationReport: generated/api/validation_report_api.md - - Provenance Links: generated/api/links.md - - Statistics: - - Statistics: generated/api/statistics_api.md - - Split Statistics: generated/api/split_statistics_api.md - - Feature descriptive statistics: generated/api/feature_descriptive_statistics_api.md - - Feature Monitoring: - - Configuration: generated/api/feature_monitoring_config_api.md - - Result: generated/api/feature_monitoring_result_api.md - - Window: generated/api/feature_monitoring_window_config_api.md - - Embedding: - - EmbeddingIndex: generated/api/embedding_index_api.md - - EmbeddingFeature: generated/api/embedding_feature_api.md - - SimilarityFunctionType: 
generated/api/similarity_function_type_api.md - Machine Learning API: - Model Registry: - - Model Registry: generated/model-registry/model_registry_api.md - Model: generated/model-registry/model_api.md + - Model Registry: generated/model-registry/model_registry_api.md - Model Schema: generated/model-registry/model_schema_api.md - Model Serving: - - Model Serving: generated/model-serving/model_serving_api.md - Deployment: generated/model-serving/deployment_api.md - Deployment state: generated/model-serving/predictor_state_api.md - Deployment state condition: generated/model-serving/predictor_state_condition_api.md - - Predictor: generated/model-serving/predictor_api.md - - Transformer: generated/model-serving/transformer_api.md - - Inference Logger: generated/model-serving/inference_logger_api.md - Inference Batcher: generated/model-serving/inference_batcher_api.md + - Inference Logger: generated/model-serving/inference_logger_api.md + - Model Serving: generated/model-serving/model_serving_api.md + - Predictor: generated/model-serving/predictor_api.md - Resources: generated/model-serving/resources_api.md + - Transformer: generated/model-serving/transformer_api.md # Added to allow navigation using the side drawer - Feature Store JavaDoc: https://docs.hopsworks.ai/feature-store-javadoc/latest/ - Contributing: CONTRIBUTING.md diff --git a/python/hopsworks/__init__.py b/python/hopsworks/__init__.py index 79d500769..220dcadb8 100644 --- a/python/hopsworks/__init__.py +++ b/python/hopsworks/__init__.py @@ -22,6 +22,7 @@ import tempfile import warnings from pathlib import Path +from typing import Literal, Union from hopsworks import client, constants, project, version from hopsworks.client.exceptions import ( @@ -83,6 +84,7 @@ def login( api_key_file: str = None, hostname_verification: bool = False, trust_store_path: str = None, + engine: Union[None, Literal["spark"], Literal["python"], Literal["training"]] = None, ) -> project.Project: """Connect to [Serverless Hopsworks](https://app.hopsworks.ai) by calling the `hopsworks.login()` function with no arguments. @@ -122,6 +124,13 @@ def login( api_key_file: Path to file wih Api Key hostname_verification: Whether to verify Hopsworks' certificate trust_store_path: Path on the file system containing the Hopsworks certificates + engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, + which initializes the engine to Spark if the environment provides Spark, for + example on Hopsworks and Databricks, or falls back to Python if Spark is not + available, e.g. on local Python environments or AWS SageMaker. This option + allows you to override this behaviour. `"training"` engine is useful when only + feature store metadata is needed, for example training dataset location and label + information when Hopsworks training experiment is conducted. 
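A minimal sketch of the new `engine` override described above; the host and API key file are placeholders:

```python
import hopsworks

# Force the Python engine even when Spark is available in the environment;
# the host and API-key file below are placeholders.
project = hopsworks.login(
    host="my-instance.cloud.hopsworks.ai",
    port=443,
    api_key_file=".api_key",
    engine="python",
)
fs = project.get_feature_store()
```

Leaving `engine=None` keeps the previous behaviour: Spark is used when the environment provides it, otherwise the client falls back to Python.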
# Returns `Project`: The Project object to perform operations on # Raises @@ -138,7 +147,7 @@ def login( # If inside hopsworks, just return the current project for now if "REST_ENDPOINT" in os.environ: - _hw_connection = _hw_connection(hostname_verification=hostname_verification) + _hw_connection = _hw_connection(hostname_verification=hostname_verification, engine=engine) _connected_project = _hw_connection.get_project() _initialize_module_apis() print("\nLogged in to project, explore it here " + _connected_project.get_url()) @@ -207,6 +216,7 @@ def login( _hw_connection = _hw_connection( host=host, port=port, + engine=engine, api_key_file=api_key_path, hostname_verification=hostname_verification, trust_store_path=trust_store_path, @@ -246,6 +256,7 @@ def login( _hw_connection = _hw_connection( host=host, port=port, + engine=engine, api_key_value=api_key, hostname_verification=hostname_verification, trust_store_path=trust_store_path, diff --git a/python/hopsworks_common/client/online_store_rest_client.py b/python/hopsworks_common/client/online_store_rest_client.py index 9ad05e9a3..b66897b09 100644 --- a/python/hopsworks_common/client/online_store_rest_client.py +++ b/python/hopsworks_common/client/online_store_rest_client.py @@ -305,7 +305,7 @@ def _check_hopsworks_connection(self) -> None: assert ( client.get_instance() is not None and client.get_instance()._connected ), """Hopsworks Client is not connected. Please connect to Hopsworks cluster - via hopsworks.login or hsfs.connection before initialising the Online Store REST Client. + via hopsworks.login before initialising the Online Store REST Client. """ _logger.debug("Hopsworks connection is active.") diff --git a/python/hopsworks_common/connection.py b/python/hopsworks_common/connection.py index 6972b6a85..43a64bc76 100644 --- a/python/hopsworks_common/connection.py +++ b/python/hopsworks_common/connection.py @@ -100,7 +100,7 @@ class Connection: Defaults to `None`. engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, which initializes the engine to Spark if the environment provides Spark, for - example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not + example on Hopsworks and Databricks, or falls back to Python if Spark is not available, e.g. on local Python environments or AWS SageMaker. This option allows you to override this behaviour. `"training"` engine is useful when only feature store metadata is needed, for example training dataset location and label @@ -151,7 +151,6 @@ def __init__( def get_feature_store( self, name: Optional[str] = None, - engine: Optional[str] = None, ): # -> feature_store.FeatureStore # the typing is commented out due to circular dependency, it breaks auto_doc.py """Get a reference to a feature store to perform operations on. @@ -161,25 +160,10 @@ def get_feature_store( # Arguments name: The name of the feature store, defaults to `None`. - engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, - which initializes the engine to Spark if the environment provides Spark, for - example on Hopsworks and Databricks, or falls back on Hive in Python if Spark is not - available, e.g. on local Python environments or AWS SageMaker. This option - allows you to override this behaviour. `"training"` engine is useful when only - feature store metadata is needed, for example training dataset location and label - information when Hopsworks training experiment is conducted. # Returns `FeatureStore`. 
A feature store handle object to perform operations on. """ - # Ensure the engine is initialized and of right type - from hsfs import engine as hsfs_engine - - if engine: - global _hsfs_engine_type - _hsfs_engine_type = engine - hsfs_engine.get_instance() - if not name: name = client.get_instance()._project_name return self._feature_store_api.get(util.append_feature_store_suffix(name)) @@ -484,7 +468,74 @@ def connection( api_key_file: Optional[str] = None, api_key_value: Optional[str] = None, ) -> Connection: - """Connection factory method, accessible through `hopsworks.connection()`.""" + """Connection factory method, accessible through `hopsworks.connection()`. + + This class provides convenience classmethods accessible from the `hopsworks`-module: + + !!! example "Connection factory" + For convenience, `hopsworks` provides a factory method, accessible from the top level + module, so you don't have to import the `Connection` class manually: + + ```python + import hopsworks + conn = hopsworks.connection() + ``` + + !!! hint "Save API Key as File" + To get started quickly, you can simply create a file with the previously + created Hopsworks API Key and place it on the environment from which you + wish to connect to Hopsworks. + + You can then connect by simply passing the path to the key file when + instantiating a connection: + + ```python hl_lines="6" + import hopsworks + conn = hopsworks.connection( + 'my_instance', # DNS of your Hopsworks instance + 443, # Port to reach your Hopsworks instance, defaults to 443 + api_key_file='hopsworks.key', # The file containing the API key generated above + hostname_verification=True # Disable for self-signed certificates + ) + project = conn.get_project("my_project") + ``` + + Clients in external clusters need to connect to Hopsworks using an + API key. The API key is generated inside the Hopsworks platform, and requires at + least the "project" scope to be able to access a project. + For more information, see the [integration guides](../setup.md). + + # Arguments + host: The hostname of the Hopsworks instance in the form of `[UUID].cloud.hopsworks.ai`, + defaults to `None`. Do **not** use the URL including `https://` when connecting + programmatically. + port: The port on which the Hopsworks instance can be reached, + defaults to `443`. + project: The name of the project to connect to. When running on Hopsworks, this + defaults to the project from where the client is run from. + Defaults to `None`. + engine: Which engine to use, `"spark"`, `"python"` or `"training"`. Defaults to `None`, + which initializes the engine to Spark if the environment provides Spark, for + example on Hopsworks and Databricks, or falls back to Python if Spark is not + available, e.g. on local Python environments or AWS SageMaker. This option + allows you to override this behaviour. `"training"` engine is useful when only + feature store metadata is needed, for example training dataset location and label + information when Hopsworks training experiment is conducted. + hostname_verification: Whether or not to verify Hopsworks' certificate, defaults + to `True`. + trust_store_path: Path on the file system containing the Hopsworks certificates, + defaults to `None`. + cert_folder: The directory to store retrieved HopsFS certificates, defaults to + `"/tmp"`. Only required when running without a Spark environment. + api_key_file: Path to a file containing the API Key, defaults to `None`. 
+ api_key_value: API Key as string, if provided, `api_key_file` will be ignored, + however, this should be used with care, especially if the used notebook or + job script is accessible by multiple parties. Defaults to `None`. + + # Returns + `Connection`. Connection handle to perform operations on a + Hopsworks project. + """ return cls( host, port, diff --git a/python/hopsworks_common/constants.py b/python/hopsworks_common/constants.py index 72672dae8..b98ed8497 100644 --- a/python/hopsworks_common/constants.py +++ b/python/hopsworks_common/constants.py @@ -158,14 +158,17 @@ class MODEL: FRAMEWORK_TORCH = "TORCH" FRAMEWORK_PYTHON = "PYTHON" FRAMEWORK_SKLEARN = "SKLEARN" + FRAMEWORK_LLM = "LLM" class MODEL_REGISTRY: HOPSFS_MOUNT_PREFIX = "/hopsfs/" + MODEL_FILES_DIR_NAME = "Files" class MODEL_SERVING: MODELS_DATASET = "Models" + ARTIFACTS_DIR_NAME = "Artifacts" class ARTIFACT_VERSION: @@ -210,6 +213,7 @@ class PREDICTOR: # model server MODEL_SERVER_PYTHON = "PYTHON" MODEL_SERVER_TF_SERVING = "TENSORFLOW_SERVING" + MODEL_SERVER_VLLM = "VLLM" # serving tool SERVING_TOOL_DEFAULT = "DEFAULT" SERVING_TOOL_KSERVE = "KSERVE" diff --git a/python/hopsworks_common/core/dataset_api.py b/python/hopsworks_common/core/dataset_api.py index dc85dd263..f7ce40743 100644 --- a/python/hopsworks_common/core/dataset_api.py +++ b/python/hopsworks_common/core/dataset_api.py @@ -46,6 +46,7 @@ def __init__(self): DEFAULT_UPLOAD_FLOW_CHUNK_SIZE = 10 * 1024 * 1024 DEFAULT_UPLOAD_SIMULTANEOUS_UPLOADS = 3 + DEFAULT_UPLOAD_SIMULTANEOUS_CHUNKS = 3 DEFAULT_UPLOAD_MAX_CHUNK_RETRIES = 1 DEFAULT_DOWNLOAD_FLOW_CHUNK_SIZE = 1024 * 1024 @@ -159,10 +160,11 @@ def upload( overwrite: bool = False, chunk_size: int = DEFAULT_UPLOAD_FLOW_CHUNK_SIZE, simultaneous_uploads: int = DEFAULT_UPLOAD_SIMULTANEOUS_UPLOADS, + simultaneous_chunks: int = DEFAULT_UPLOAD_SIMULTANEOUS_CHUNKS, max_chunk_retries: int = DEFAULT_UPLOAD_MAX_CHUNK_RETRIES, chunk_retry_interval: int = 1, ): - """Upload a file to the Hopsworks filesystem. + """Upload a file or directory to the Hopsworks filesystem. ```python @@ -172,44 +174,93 @@ def upload( dataset_api = project.get_dataset_api() + # upload a file to Resources dataset uploaded_file_path = dataset_api.upload("my_local_file.txt", "Resources") + # upload a directory to Resources dataset + uploaded_file_path = dataset_api.upload("my_dir", "Resources") + ``` # Arguments - local_path: local path to file to upload + local_path: local path to file or directory to upload, can be relative or absolute upload_path: path to directory where to upload the file in Hopsworks Filesystem - overwrite: overwrite file if exists + overwrite: overwrite file or directory if exists chunk_size: upload chunk size in bytes. Default 10 MB - simultaneous_uploads: number of simultaneous chunks to upload. Default 3 + simultaneous_chunks: number of simultaneous chunks to upload for each file upload. Default 3 + simultaneous_uploads: number of simultaneous files to be uploaded for directories. Default 3 max_chunk_retries: maximum retry for a chunk. Default is 1 chunk_retry_interval: chunk retry interval in seconds. 
Default is 1sec # Returns - `str`: Path to uploaded file + `str`: Path to uploaded file or directory # Raises - `RestAPIError`: If unable to upload the file + `RestAPIError`: If unable to upload the file or directory """ + # local path could be absolute or relative, if not os.path.isabs(local_path) and os.path.exists( os.path.join(os.getcwd(), local_path) ): local_path = os.path.join(os.getcwd(), local_path) - file_size = os.path.getsize(local_path) - _, file_name = os.path.split(local_path) destination_path = upload_path + "/" + file_name if self.exists(destination_path): if overwrite: - self.remove(destination_path) + if 'datasetType' in self._get(destination_path): + raise DatasetException("overwrite=True not supported on a top-level dataset") + else: + self.remove(destination_path) else: raise DatasetException( "{} already exists, set overwrite=True to overwrite it".format( - local_path + destination_path ) ) + if os.path.isdir(local_path): + self.mkdir(destination_path) + + if os.path.isdir(local_path): + with ThreadPoolExecutor(simultaneous_uploads) as executor: + # if path is a dir, upload files and folders iteratively + for root, dirs, files in os.walk(local_path): + # os.walk(local_model_path), where local_model_path is expected to be an absolute path + # - root is the absolute path of the directory being walked + # - dirs is the list of directory names present in the root dir + # - files is the list of file names present in the root dir + # we need to replace the local path prefix with the hdfs path prefix (i.e., /srv/hops/....../root with /Projects/.../) + remote_base_path = root.replace( + local_path, destination_path + ).replace(os.sep, "/") + for d_name in dirs: + self.mkdir(remote_base_path + "/" + d_name) + + # uploading files in the same folder is done concurrently + futures = [ + executor.submit( + self._upload_file, f_name, root + os.sep + f_name, remote_base_path, chunk_size, simultaneous_chunks, max_chunk_retries, chunk_retry_interval + ) + for f_name in files + ] + + # wait for all upload tasks to complete + _, _ = wait(futures) + try: + _ = [future.result() for future in futures] + except Exception as e: + raise e + else: + self._upload_file(file_name, local_path, upload_path, chunk_size, simultaneous_chunks, max_chunk_retries, chunk_retry_interval) + + return upload_path + "/" + os.path.basename(local_path) + + + def _upload_file(self, file_name, local_path, upload_path, chunk_size, simultaneous_chunks, max_chunk_retries, chunk_retry_interval): + + file_size = os.path.getsize(local_path) + num_chunks = math.ceil(file_size / chunk_size) base_params = self._get_flow_base_params( @@ -223,15 +274,15 @@ def upload( pbar = tqdm( total=file_size, bar_format="{desc}: {percentage:.3f}%|{bar}| {n_fmt}/{total_fmt} elapsed<{elapsed} remaining<{remaining}", - desc="Uploading", + desc="Uploading {}".format(local_path), ) except Exception: self._log.exception("Failed to initialize progress bar.") self._log.info("Starting upload") - with ThreadPoolExecutor(simultaneous_uploads) as executor: + with ThreadPoolExecutor(simultaneous_chunks) as executor: while True: chunks = [] - for _ in range(simultaneous_uploads): + for _ in range(simultaneous_chunks): chunk = f.read(chunk_size) if not chunk: break @@ -269,8 +320,6 @@ def upload( else: self._log.info("Upload finished") - return upload_path + "/" + os.path.basename(local_path) - def _upload_chunk( self, base_params, @@ -459,7 +508,10 @@ def copy(self, source_path: str, destination_path: str, overwrite: bool = False) """ if 
self.exists(destination_path): if overwrite: - self.remove(destination_path) + if 'datasetType' in self._get(destination_path): + raise DatasetException("overwrite=True not supported on a top-level dataset") + else: + self.remove(destination_path) else: raise DatasetException( "{} already exists, set overwrite=True to overwrite it".format( @@ -497,10 +549,12 @@ def move(self, source_path: str, destination_path: str, overwrite: bool = False) # Raises `RestAPIError`: If unable to perform the move """ - if self.exists(destination_path): if overwrite: - self.remove(destination_path) + if 'datasetType' in self._get(destination_path): + raise DatasetException("overwrite=True not supported on a top-level dataset") + else: + self.remove(destination_path) else: raise DatasetException( "{} already exists, set overwrite=True to overwrite it".format( diff --git a/python/hopsworks_common/project.py b/python/hopsworks_common/project.py index df82b3f79..b35cac288 100644 --- a/python/hopsworks_common/project.py +++ b/python/hopsworks_common/project.py @@ -109,7 +109,7 @@ def project_namespace(self): return self._project_namespace def get_feature_store( - self, name: Optional[str] = None, engine: Optional[str] = None + self, name: Optional[str] = None ): # -> hsfs.feature_store.FeatureStore """Connect to Project's Feature Store. @@ -127,15 +127,12 @@ def get_feature_store( # Arguments name: Project name of the feature store. - engine: Which engine to use, `"spark"`, `"python"` or `"training"`. - Defaults to `"python"` when connected to [Serverless Hopsworks](https://app.hopsworks.ai). - See hsfs.Connection.connection documentation for more information. # Returns `hsfs.feature_store.FeatureStore`: The Feature Store API # Raises `RestAPIError`: If unable to connect """ - return client.get_connection().get_feature_store(name, engine) + return client.get_connection().get_feature_store(name) def get_model_registry(self): """Connect to Project's Model Registry API. 
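A short sketch of the directory upload added to `DatasetApi.upload` above; the local directory name is a placeholder:

```python
import hopsworks

project = hopsworks.login()                # credentials resolved from the environment
dataset_api = project.get_dataset_api()

# Upload a whole directory; sub-directories are recreated remotely and the files
# inside each folder are uploaded concurrently.
remote_path = dataset_api.upload(
    "my_dir",                              # hypothetical local directory
    "Resources",
    overwrite=True,                        # replaces Resources/my_dir if it already exists
    simultaneous_uploads=3,                # files uploaded in parallel per directory
    simultaneous_chunks=3,                 # chunks uploaded in parallel per file
)
print(remote_path)                         # "Resources/my_dir"
```

As in the `copy` and `move` changes above, `overwrite=True` is rejected with a `DatasetException` when the destination resolves to a top-level dataset rather than a file or sub-directory inside one.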
diff --git a/python/hopsworks_common/util.py b/python/hopsworks_common/util.py index 5d14c70af..27a3ff8eb 100644 --- a/python/hopsworks_common/util.py +++ b/python/hopsworks_common/util.py @@ -465,6 +465,7 @@ def is_interactive(): def set_model_class(model): + from hsml.llm.model import Model as LLMModel from hsml.model import Model as BaseModel from hsml.python.model import Model as PyModel from hsml.sklearn.model import Model as SkLearnModel @@ -490,6 +491,8 @@ def set_model_class(model): return SkLearnModel(**model) elif framework == MODEL.FRAMEWORK_PYTHON: return PyModel(**model) + elif framework == MODEL.FRAMEWORK_LLM: + return LLMModel(**model) else: raise ValueError( "framework {} is not a supported framework".format(str(framework)) @@ -606,6 +609,8 @@ def validate_metrics(metrics): def get_predictor_for_model(model, **kwargs): + from hsml.llm.model import Model as LLMModel + from hsml.llm.predictor import Predictor as vLLMPredictor from hsml.model import Model as BaseModel from hsml.predictor import Predictor as BasePredictor from hsml.python.model import Model as PyModel @@ -632,6 +637,8 @@ def get_predictor_for_model(model, **kwargs): return SkLearnPredictor(**kwargs) if type(model) is PyModel: return PyPredictor(**kwargs) + if type(model) is LLMModel: + return vLLMPredictor(**kwargs) if type(model) is BaseModel: return BasePredictor( # python as default framework and model server model_framework=MODEL.FRAMEWORK_PYTHON, diff --git a/python/hopsworks_common/version.py b/python/hopsworks_common/version.py index 52cd363fc..82beef4ab 100644 --- a/python/hopsworks_common/version.py +++ b/python/hopsworks_common/version.py @@ -14,4 +14,4 @@ # limitations under the License. # -__version__ = "4.1.0.dev1" +__version__ = "4.2.0.dev1" diff --git a/python/hsfs/builtin_transformations.py b/python/hsfs/builtin_transformations.py index 1fc2ce670..4426268cc 100644 --- a/python/hsfs/builtin_transformations.py +++ b/python/hsfs/builtin_transformations.py @@ -43,7 +43,7 @@ def robust_scaler(feature: pd.Series, statistics=feature_statistics) -> pd.Serie ) -@udf(int, drop=["feature"]) +@udf(int, drop=["feature"], mode="pandas") def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = sorted([value for value in statistics.feature.unique_values]) value_to_index = {value: index for index, value in enumerate(unique_data)} @@ -56,7 +56,7 @@ def label_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Serie ) -@udf(bool, drop=["feature"]) +@udf(bool, drop=["feature"], mode="pandas") def one_hot_encoder(feature: pd.Series, statistics=feature_statistics) -> pd.Series: unique_data = [value for value in statistics.feature.unique_values] diff --git a/python/hsfs/core/feature_group_api.py b/python/hsfs/core/feature_group_api.py index ab05fb9b5..037228c73 100644 --- a/python/hsfs/core/feature_group_api.py +++ b/python/hsfs/core/feature_group_api.py @@ -21,7 +21,12 @@ from hopsworks_common import client from hsfs import feature_group as fg_mod from hsfs import feature_group_commit, util -from hsfs.core import explicit_provenance, ingestion_job, ingestion_job_conf +from hsfs.core import ( + explicit_provenance, + ingestion_job, + ingestion_job_conf, + job, +) class FeatureGroupApi: @@ -416,6 +421,36 @@ def ingestion( ), ) + def update_table_schema( + self, + feature_group_instance: fg_mod.FeatureGroup, + ) -> job.Job: + """ + Setup a Hopsworks job to update table schema + Args: + feature_group_instance: FeatureGroup, required + metadata object of feature group. 
+ job_conf: the configuration for the job application + """ + + _client = client.get_instance() + path_params = [ + "project", + _client._project_id, + "featurestores", + feature_group_instance.feature_store_id, + "featuregroups", + feature_group_instance.id, + "updatetableschema", + ] + + headers = {"content-type": "application/json"} + return job.Job.from_response_json( + _client._send_request( + "POST", path_params, headers=headers + ), + ) + def get_parent_feature_groups( self, feature_group_instance: Union[ diff --git a/python/hsfs/core/feature_group_engine.py b/python/hsfs/core/feature_group_engine.py index f00a044e1..0eb5c441a 100644 --- a/python/hsfs/core/feature_group_engine.py +++ b/python/hsfs/core/feature_group_engine.py @@ -15,7 +15,7 @@ from __future__ import annotations import warnings -from typing import List +from typing import List, Union from hsfs import engine, feature, util from hsfs import feature_group as fg @@ -67,7 +67,7 @@ def _update_feature_group_schema_on_demand_transformations( def save( self, - feature_group, + feature_group: Union[fg.FeatureGroup, fg.ExternalFeatureGroup], feature_dataframe, write_options, validation_options: dict = None, @@ -80,6 +80,21 @@ def save( feature_group=feature_group, features=dataframe_features ) ) + + # Currently on-demand transformation functions not supported in external feature groups. + if feature_group.transformation_functions: + if not isinstance(feature_group, fg.ExternalFeatureGroup): + feature_dataframe = ( + engine.get_instance()._apply_transformation_function( + feature_group.transformation_functions, feature_dataframe + ) + ) + else: + warnings.warn( + "On-Demand features were not created because On-Demand Transformations are not supported for External Feature Groups.", + stacklevel=1, + ) + util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features ) @@ -119,7 +134,7 @@ def save( def insert( self, - feature_group, + feature_group: Union[fg.FeatureGroup, fg.ExternalFeatureGroup], feature_dataframe, overwrite, operation, @@ -132,6 +147,16 @@ def insert( feature_group.time_travel_format, features=feature_group.features, ) + + # Currently on-demand transformation functions not supported in external feature groups. + if ( + not isinstance(feature_group, fg.ExternalFeatureGroup) + and feature_group.transformation_functions + ): + feature_dataframe = engine.get_instance()._apply_transformation_function( + feature_group.transformation_functions, feature_dataframe + ) + dataframe_features = ( self._update_feature_group_schema_on_demand_transformations( feature_group=feature_group, features=dataframe_features @@ -249,6 +274,8 @@ def commit_delete(feature_group, delete_df, write_options): @staticmethod def delta_vacuum(feature_group, retention_hours): if feature_group.time_travel_format == "DELTA": + # TODO: This should change, DeltaEngine and HudiEngine always assumes spark client! + # Cannot properly manage what should happen when using python. 
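For orientation, a minimal sketch of how an engine is expected to drive the new `updatetableschema` endpoint added above: instead of writing an empty dataframe from the client, it requests a backend job and waits for it. This mirrors the Python engine change later in this diff; the blocking wait shown here is illustrative.

from hsfs.core import feature_group_api

def update_table_schema(feature_group):
    # Ask the backend to launch the table-schema-update job for this feature group
    schema_job = feature_group_api.FeatureGroupApi().update_table_schema(feature_group)
    # Block until the job terminates so subsequent reads see the new schema
    schema_job._wait_for_job(await_termination=True)
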
delta_engine_instance = delta_engine.DeltaEngine( feature_group.feature_store_id, feature_group.feature_store_name, @@ -296,10 +323,7 @@ def append_features(self, feature_group, new_features): ) # write empty dataframe to update parquet schema - if feature_group.time_travel_format == "DELTA": - engine.get_instance().add_cols_to_delta_table(feature_group, new_features) - else: - engine.get_instance().save_empty_dataframe(feature_group, new_features=new_features) + engine.get_instance().update_table_schema(feature_group) def update_description(self, feature_group, description): """Updates the description of a feature group.""" @@ -326,7 +350,7 @@ def update_deprecated(self, feature_group, deprecate): def insert_stream( self, - feature_group, + feature_group: Union[fg.FeatureGroup, fg.ExternalFeatureGroup], dataframe, query_name, output_mode, @@ -349,6 +373,12 @@ def insert_stream( feature_group=feature_group, features=dataframe_features ) ) + + if feature_group.transformation_functions: + dataframe = engine.get_instance()._apply_transformation_function( + feature_group.transformation_functions, dataframe + ) + util.validate_embedding_feature_type( feature_group.embedding_index, dataframe_features ) diff --git a/python/hsfs/core/hudi_engine.py b/python/hsfs/core/hudi_engine.py index 4492f0a19..e96b8ea56 100644 --- a/python/hsfs/core/hudi_engine.py +++ b/python/hsfs/core/hudi_engine.py @@ -234,25 +234,6 @@ def _setup_hudi_read_opts(self, hudi_fg_alias, read_options): return hudi_options - def reconcile_hudi_schema( - self, save_empty_dataframe_callback, hudi_fg_alias, read_options - ): - if sorted(self._spark_session.table(hudi_fg_alias.alias).columns) != sorted( - [feature.name for feature in hudi_fg_alias.feature_group._features] + self.HUDI_SPEC_FEATURE_NAMES - ): - full_fg = self._feature_group_api.get( - feature_store_id=hudi_fg_alias.feature_group._feature_store_id, - name=hudi_fg_alias.feature_group.name, - version=hudi_fg_alias.feature_group.version, - ) - - save_empty_dataframe_callback(full_fg) - - self.register_temporary_table( - hudi_fg_alias, - read_options, - ) - @staticmethod def _get_last_commit_metadata(spark_context, base_path): hopsfs_conf = spark_context._jvm.org.apache.hadoop.fs.FileSystem.get( diff --git a/python/hsfs/core/kafka_engine.py b/python/hsfs/core/kafka_engine.py index d21b6ec22..ee9e892be 100644 --- a/python/hsfs/core/kafka_engine.py +++ b/python/hsfs/core/kafka_engine.py @@ -141,7 +141,7 @@ def kafka_get_offsets( offsets += f",{partition_metadata.id}:{consumer.get_watermark_offsets(partition)[tuple_value]}" consumer.close() - return f" -initialCheckPointString {topic_name + offsets}" + return f"{topic_name + offsets}" return "" diff --git a/python/hsfs/core/vector_server.py b/python/hsfs/core/vector_server.py index 0e785dde5..d354a5400 100755 --- a/python/hsfs/core/vector_server.py +++ b/python/hsfs/core/vector_server.py @@ -149,6 +149,7 @@ def __init__( self._feature_to_handle_if_sql: Optional[Set[str]] = None self._valid_serving_keys: Set[str] = set() self._serving_initialized: bool = False + self.__all_features_on_demand: Optional[bool] = None def init_serving( self, @@ -415,14 +416,23 @@ def get_feature_vectors( request_parameters is None or len(request_parameters) == 0 or isinstance(request_parameters, dict) + or not entries or len(request_parameters) == len(entries) - ), "Request Parameters should be a Dictionary, None, empty or have the same length as the entries" + ), "Request Parameters should be a Dictionary, None, empty or have the same length as 
the entries if they are not None or empty." online_client_choice = self.which_client_and_ensure_initialised( force_rest_client=force_rest_client, force_sql_client=force_sql_client ) rondb_entries = [] skipped_empty_entries = [] + + if not entries: + entries = ( + [[] * len(request_parameters)] + if isinstance(request_parameters, list) + else [[]] + ) + for (idx, entry), passed, vector_features in itertools.zip_longest( enumerate(entries), passed_features, @@ -547,7 +557,11 @@ def assemble_feature_vector( # for backward compatibility, before 3.4, if result is empty, # instead of throwing error, it skips the result # Maybe we drop this behaviour for 4.0 - if len(result_dict) == 0 and not allow_missing: + if ( + len(result_dict) == 0 + and not allow_missing + and not self._all_features_on_demand + ): return None if not allow_missing and len(missing_features) > 0: @@ -1255,6 +1269,17 @@ def validate_entry( Keys relevant to vector_db are filtered out. """ + _logger.debug( + "Checking if entry is None and all features in the feature view are on-demand." + ) + if not entry: + if self._all_features_on_demand: + return {} + else: + raise exceptions.FeatureStoreException( + "The required argument `entries` is missing. If the feature view includes only on-demand features, entries may be left empty or set to None." + ) + _logger.debug("Checking keys in entry are valid serving keys.") for key in entry.keys(): if key not in self.valid_serving_keys: @@ -1323,6 +1348,15 @@ def identify_missing_features_pre_fetch( passed_feature_names = passed_feature_names.union( vector_db_features.keys() ) + if self._on_demand_feature_names and len(self._on_demand_feature_names) > 0: + # Remove on-demand features from validation check as they would be computed. + _logger.debug( + "Appending on_demand_feature_names : %s, to passed_feature_names for pre-fetch missing", + self._on_demand_feature_names, + ) + passed_feature_names = passed_feature_names.union( + self._on_demand_feature_names + ) neither_fetched_nor_passed = fetched_features.difference( passed_feature_names ) @@ -1575,3 +1609,12 @@ def transformed_feature_vector_col_name(self): ] self._transformed_feature_vector_col_name.extend(output_column_names) return self._transformed_feature_vector_col_name + + @property + def _all_features_on_demand(self) -> bool: + """True if all features in the feature view are on-demand.""" + if self.__all_features_on_demand is None: + self.__all_features_on_demand = all( + feature.on_demand_transformation_function for feature in self._features + ) + return self.__all_features_on_demand diff --git a/python/hsfs/engine/python.py b/python/hsfs/engine/python.py index b2fb1968d..eeacf8e27 100644 --- a/python/hsfs/engine/python.py +++ b/python/hsfs/engine/python.py @@ -808,15 +808,6 @@ def save_dataframe( online_write_options: Dict[str, Any], validation_id: Optional[int] = None, ) -> Optional[job.Job]: - # Currently on-demand transformation functions not supported in external feature groups. - if ( - not isinstance(feature_group, ExternalFeatureGroup) - and feature_group.transformation_functions - ): - dataframe = self._apply_transformation_function( - feature_group.transformation_functions, dataframe - ) - if ( hasattr(feature_group, "EXTERNAL_FEATURE_GROUP") and feature_group.online_enabled @@ -1212,11 +1203,11 @@ def save_stream_dataframe( "Stream ingestion is not available on Python environments, because it requires Spark as engine." 
) - def save_empty_dataframe( - self, feature_group: Union[FeatureGroup, ExternalFeatureGroup], new_features=None - ) -> None: - """Wrapper around save_dataframe in order to provide no-op.""" - pass + def update_table_schema(self, feature_group: Union[FeatureGroup, ExternalFeatureGroup]) -> None: + _job = self._feature_group_api.update_table_schema(feature_group) + _job._wait_for_job( + await_termination=True + ) def _get_app_options( self, user_write_options: Optional[Dict[str, Any]] = None @@ -1296,9 +1287,19 @@ def _apply_transformation_function( dataset.columns ) if missing_features: - raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." - ) + if ( + tf.transformation_type + == transformation_function.TransformationType.ON_DEMAND + ): + # On-demand transformation are applied using the python/spark engine during insertion, the transformation while retrieving feature vectors are performed in the vector_server. + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the on-demand transformation function '{hopsworks_udf.function_name}' are not present in the dataframe being inserted into the feature group. " + + "Please verify that the correct feature names are used in the transformation function and that these features exist in the dataframe being inserted." + ) + else: + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the model-dependent transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please verify that the correct features are specified in the transformation function." + ) if tf.hopsworks_udf.dropped_features: dropped_features.update(tf.hopsworks_udf.dropped_features) @@ -1406,7 +1407,9 @@ def _apply_pandas_udf( for feature in hopsworks_udf.transformation_features ] ) - ) + ).set_index( + dataframe.index + ) # Index is set to the input dataframe index so that pandas would merge the new columns without reordering them. else: dataframe[hopsworks_udf.output_column_names[0]] = hopsworks_udf.get_udf( online=False @@ -1417,9 +1420,11 @@ def _apply_pandas_udf( for feature in hopsworks_udf.transformation_features ] ) - ) + ).set_axis( + dataframe.index + ) # Index is set to the input dataframe index so that pandas would merge the new column without reordering it. 
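The `set_index`/`set_axis` calls above exist because the transformed columns come out of a fresh concat and carry a default RangeIndex; a self-contained toy illustration (data made up here) of why reattaching the input dataframe's index matters when assigning the result back:

import pandas as pd

df = pd.DataFrame({"feature": [10, 20, 30]}, index=[7, 3, 5])  # non-default index
computed = pd.Series([1.0, 2.0, 3.0])  # fresh result with a default RangeIndex

df["misaligned"] = computed                  # label alignment -> all NaN here
df["aligned"] = computed.set_axis(df.index)  # keeps positional order intact
print(df)
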
if hopsworks_udf.output_column_names[0] in dataframe.columns: - # Overwriting features so reordering dataframe to move overwritten column to the end of the dataframe + # Overwriting features also reordering dataframe to move overwritten column to the end of the dataframe cols = dataframe.columns.tolist() cols.append(cols.pop(cols.index(hopsworks_udf.output_column_names[0]))) dataframe = dataframe[cols] @@ -1505,12 +1510,12 @@ def _write_dataframe_kafka( topic_name=feature_group._online_topic_name, feature_store_id=feature_group.feature_store_id, offline_write_options=offline_write_options, - high=True, + high=False, ) now = datetime.now(timezone.utc) feature_group.materialization_job.run( args=feature_group.materialization_job.config.get("defaultArgs", "") - + initial_check_point, + + (f" -initialCheckPointString {initial_check_point}" if initial_check_point else ""), await_termination=offline_write_options.get("wait_for_job", False), ) offline_backfill_every_hr = offline_write_options.pop( @@ -1540,7 +1545,7 @@ def _write_dataframe_kafka( # provide the initial_check_point as it will reduce the read amplification of materialization job feature_group.materialization_job.run( args=feature_group.materialization_job.config.get("defaultArgs", "") - + initial_check_point, + + (f" -initialCheckPointString {initial_check_point}" if initial_check_point else ""), await_termination=offline_write_options.get("wait_for_job", False), ) return feature_group.materialization_job @@ -1582,9 +1587,10 @@ def _start_offline_materialization(offline_write_options: Dict[str, Any]) -> boo def _convert_feature_log_to_df(feature_log, cols) -> pd.DataFrame: if feature_log is None and cols: return pd.DataFrame(columns=cols) - if not (isinstance(feature_log, (list, pd.DataFrame, pl.DataFrame)) or ( - HAS_NUMPY and isinstance(feature_log, np.ndarray) - )): + if not ( + isinstance(feature_log, (list, pd.DataFrame, pl.DataFrame)) + or (HAS_NUMPY and isinstance(feature_log, np.ndarray)) + ): raise ValueError(f"Type '{type(feature_log)}' not accepted") if isinstance(feature_log, list) or ( HAS_NUMPY and isinstance(feature_log, np.ndarray) diff --git a/python/hsfs/engine/spark.py b/python/hsfs/engine/spark.py index 2ff6bc39d..67e15468b 100644 --- a/python/hsfs/engine/spark.py +++ b/python/hsfs/engine/spark.py @@ -35,6 +35,7 @@ import tzlocal from hopsworks_common.core.constants import HAS_NUMPY, HAS_PANDAS from hsfs.constructor import query +from hsfs.core import feature_group_api # in case importing in %%local from hsfs.core.vector_db_client import VectorDbClient @@ -197,7 +198,7 @@ def register_external_temporary_table(self, external_fg, alias): external_fg.query, external_fg.data_format, external_fg.options, - external_fg.prepare_spark_location(), + external_fg.storage_connector._get_path(external_fg.path), # cant rely on location since this method can be used before FG is saved ) else: external_dataset = external_fg.dataframe @@ -221,8 +222,8 @@ def register_hudi_temporary_table( read_options, ) - hudi_engine_instance.reconcile_hudi_schema( - self.save_empty_dataframe, hudi_fg_alias, read_options + self.reconcile_schema( + hudi_fg_alias, read_options, hudi_engine_instance ) def register_delta_temporary_table( @@ -241,6 +242,30 @@ def register_delta_temporary_table( read_options, ) + self.reconcile_schema( + delta_fg_alias, read_options, delta_engine_instance + ) + + def reconcile_schema( + self, fg_alias, read_options, engine_instance + ): + if sorted(self._spark_session.table(fg_alias.alias).columns) != sorted( + 
[feature.name for feature in fg_alias.feature_group._features] + + hudi_engine.HudiEngine.HUDI_SPEC_FEATURE_NAMES if fg_alias.feature_group.time_travel_format == "HUDI" else [] + ): + full_fg = feature_group_api.FeatureGroupApi().get( + feature_store_id=fg_alias.feature_group._feature_store_id, + name=fg_alias.feature_group.name, + version=fg_alias.feature_group.version, + ) + + self.update_table_schema(full_fg) + + engine_instance.register_temporary_table( + fg_alias, + read_options, + ) + def _return_dataframe_type(self, dataframe, dataframe_type): if dataframe_type.lower() in ["default", "spark"]: return dataframe @@ -415,14 +440,6 @@ def save_dataframe( validation_id=None, ): try: - # Currently on-demand transformation functions not supported in external feature groups. - if ( - not isinstance(feature_group, fg_mod.ExternalFeatureGroup) - and feature_group.transformation_functions - ): - dataframe = self._apply_transformation_function( - feature_group.transformation_functions, dataframe - ) if ( isinstance(feature_group, fg_mod.ExternalFeatureGroup) and feature_group.online_enabled @@ -467,17 +484,10 @@ def save_stream_dataframe( checkpoint_dir: Optional[str], write_options: Optional[Dict[str, Any]], ): - if feature_group.transformation_functions: - dataframe = self._apply_transformation_function( - feature_group.transformation_functions, dataframe - ) - write_options = kafka_engine.get_kafka_config( feature_group.feature_store_id, write_options, engine="spark" ) - serialized_df = self._online_fg_to_avro( - feature_group, self._encode_complex_features(feature_group, dataframe) - ) + serialized_df = self._serialize_to_avro(feature_group, dataframe) project_id = str(feature_group.feature_store.project_id) feature_group_id = str(feature_group._id) @@ -570,9 +580,7 @@ def _save_online_dataframe(self, feature_group, dataframe, write_options): feature_group.feature_store_id, write_options, engine="spark" ) - serialized_df = self._online_fg_to_avro( - feature_group, self._encode_complex_features(feature_group, dataframe) - ) + serialized_df = self._serialize_to_avro(feature_group, dataframe) project_id = str(feature_group.feature_store.project_id).encode("utf8") feature_group_id = str(feature_group._id).encode("utf8") @@ -592,13 +600,13 @@ def _save_online_dataframe(self, feature_group, dataframe, write_options): "topic", feature_group._online_topic_name ).save() - def _encode_complex_features( + def _serialize_to_avro( self, feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], dataframe: Union[RDD, DataFrame], ): """Encodes all complex type features to binary using their avro type as schema.""" - return dataframe.select( + encoded_dataframe = dataframe.select( [ field["name"] if field["name"] not in feature_group.get_complex_features() @@ -609,15 +617,10 @@ def _encode_complex_features( ] ) - def _online_fg_to_avro( - self, - feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], - dataframe: Union[DataFrame, RDD], - ): """Packs all features into named struct to be serialized to single avro/binary column. And packs primary key into arry to be serialized for partitioning. 
""" - return dataframe.select( + return encoded_dataframe.select( [ # be aware: primary_key array should always be sorted to_avro( @@ -640,6 +643,30 @@ def _online_fg_to_avro( ] ) + def _deserialize_from_avro( + self, + feature_group: Union[fg_mod.FeatureGroup, fg_mod.ExternalFeatureGroup], + dataframe: Union[RDD, DataFrame], + ): + """ + Deserializes 'value' column from binary using avro schema and unpacks it into columns. + """ + decoded_dataframe = dataframe.select( + from_avro("value", feature_group._get_encoded_avro_schema()).alias("value") + ).select(col("value.*")) + + """Decodes all complex type features from binary using their avro type as schema.""" + return decoded_dataframe.select( + [ + field["name"] + if field["name"] not in feature_group.get_complex_features() + else from_avro( + field["name"], feature_group._get_feature_avro_schema(field["name"]) + ).alias(field["name"]) + for field in json.loads(feature_group.avro_schema)["fields"] + ] + ) + def get_training_data( self, training_dataset: training_dataset.TrainingDataset, @@ -1250,8 +1277,12 @@ def setup_storage_connector(self, storage_connector, path=None): return path def _setup_s3_hadoop_conf(self, storage_connector, path): - # For legacy behaviour set the S3 values at global level - self._set_s3_hadoop_conf(storage_connector, "fs.s3a") + FS_S3_GLOBAL_CONF = "fs.s3a.global-conf" + + # The argument arrive here as strings + if storage_connector.arguments.get(FS_S3_GLOBAL_CONF, "True").lower() == "true": + # For legacy behaviour set the S3 values at global level + self._set_s3_hadoop_conf(storage_connector, "fs.s3a") # Set credentials at bucket level as well to allow users to use multiple # storage connector in the same application. @@ -1309,18 +1340,20 @@ def is_spark_dataframe(self, dataframe): return True return False - def save_empty_dataframe(self, feature_group, new_features=None): + def update_table_schema(self, feature_group): + if feature_group.time_travel_format == "DELTA": + self._add_cols_to_delta_table(feature_group) + else: + self._save_empty_dataframe(feature_group) + + def _save_empty_dataframe(self, feature_group): location = feature_group.prepare_spark_location() dataframe = self._spark_session.read.format("hudi").load(location) - if (new_features is not None): - if isinstance(new_features, list): - for new_feature in new_features: - dataframe = dataframe.withColumn(new_feature.name, lit(None).cast(new_feature.type)) - else: - dataframe = dataframe.withColumn(new_features.name, lit(None).cast(new_features.type)) - + for _feature in feature_group.features: + if _feature.name not in dataframe.columns: + dataframe = dataframe.withColumn(_feature.name, lit(None).cast(_feature.type)) self.save_dataframe( feature_group, @@ -1332,23 +1365,20 @@ def save_empty_dataframe(self, feature_group, new_features=None): {}, ) - def add_cols_to_delta_table(self, feature_group, new_features): + def _add_cols_to_delta_table(self, feature_group): location = feature_group.prepare_spark_location() dataframe = self._spark_session.read.format("delta").load(location) - if (new_features is not None): - if isinstance(new_features, list): - for new_feature in new_features: - dataframe = dataframe.withColumn(new_feature.name, lit("").cast(new_feature.type)) - else: - dataframe = dataframe.withColumn(new_features.name, lit("").cast(new_features.type)) + for _feature in feature_group.features: + if _feature.name not in dataframe.columns: + dataframe = dataframe.withColumn(_feature.name, lit(None).cast(_feature.type)) - 
dataframe.limit(0).write.format("delta").mode( - "append" - ).option("mergeSchema", "true").option( - "spark.databricks.delta.schema.autoMerge.enabled", "true" - ).save(location) + dataframe.limit(0).write.format("delta").mode("append").option( + "mergeSchema", "true" + ).option("spark.databricks.delta.schema.autoMerge.enabled", "true").save( + location + ) def _apply_transformation_function( self, @@ -1378,9 +1408,19 @@ def _apply_transformation_function( ) if missing_features: - raise FeatureStoreException( - f"Features {missing_features} specified in the transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please specify the feature required correctly." - ) + if ( + tf.transformation_type + == transformation_function.TransformationType.ON_DEMAND + ): + # On-demand transformation are applied using the python/spark engine during insertion, the transformation while retrieving feature vectors are performed in the vector_server. + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the on-demand transformation function '{hopsworks_udf.function_name}' are not present in the dataframe being inserted into the feature group. " + + "Please verify that the correct feature names are used in the transformation function and that these features exist in the dataframe being inserted." + ) + else: + raise FeatureStoreException( + f"The following feature(s): `{'`, '.join(missing_features)}`, specified in the model-dependent transformation function '{hopsworks_udf.function_name}' are not present in the feature view. Please verify that the correct features are specified in the transformation function." + ) if tf.hopsworks_udf.dropped_features: dropped_features.update(hopsworks_udf.dropped_features) diff --git a/python/hsfs/feature_group.py b/python/hsfs/feature_group.py index a3385afda..e2c42f1a3 100644 --- a/python/hsfs/feature_group.py +++ b/python/hsfs/feature_group.py @@ -2327,27 +2327,14 @@ def __init__( # for python engine we always use stream feature group if engine.get_type() == "python": self._stream = True - # for stream feature group time travel format is always HUDI - if self._stream: - expected_format = "HUDI" - if self._time_travel_format != expected_format: - warnings.warn( - ( - "The provided time travel format `{}` has been overwritten " - "because Stream enabled feature groups only support `{}`" - ).format(self._time_travel_format, expected_format), - util.FeatureGroupWarning, - stacklevel=1, - ) - self._time_travel_format = expected_format self.primary_key = primary_key self.partition_key = partition_key self._hudi_precombine_key = ( util.autofix_feature_name(hudi_precombine_key) if hudi_precombine_key is not None - and self._time_travel_format is not None - and self._time_travel_format == "HUDI" + and (self._time_travel_format is None + or self._time_travel_format == "HUDI") else None ) self.statistics_config = statistics_config @@ -3289,7 +3276,7 @@ def delta_vacuum( fg = fs.get_or_create_feature_group(...) commit_details = fg.delta_vacuum(retention_hours = 168) - + ``` # Arguments retention_hours: User provided retention period. The default retention threshold for the files is 7 days. diff --git a/python/hsfs/feature_store.py b/python/hsfs/feature_store.py index c1ef352f9..4b45c9c77 100644 --- a/python/hsfs/feature_store.py +++ b/python/hsfs/feature_store.py @@ -458,7 +458,7 @@ def sql( For spark engine: Dictionary of read options for Spark. 
For python engine: If running queries on the online feature store, users can provide an entry `{'external': True}`, - this instructs the library to use the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) to establish the connection to the online feature store. + this instructs the library to use the `host` parameter in the [`hopsworks.login()`](login.md#login) to establish the connection to the online feature store. If not set, or set to False, the online feature store storage connector is used which relies on the private ip. Defaults to `{}`. @@ -556,7 +556,7 @@ def plus_two(value): online_enabled=True, event_time='date', transformation_functions=transformation_functions, - online_config={'online_comments': ['NDB_TABLE=READ_BACKUP=1']} + online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1']} ) ``` @@ -721,7 +721,7 @@ def get_or_create_feature_group( online_enabled=True, event_time="timestamp", transformation_functions=transformation_functions, - online_config={'online_comments': ['NDB_TABLE=READ_BACKUP=1']} + online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1']} ) ``` @@ -1023,7 +1023,7 @@ def create_external_feature_group( primary_key=['ss_store_sk'], event_time='sale_date', online_enabled=True, - online_config={'online_comments': ['NDB_TABLE=READ_BACKUP=1']} + online_config={'table_space': 'ts_1', 'online_comments': ['NDB_TABLE=READ_BACKUP=1']} ) external_fg.save() diff --git a/python/hsfs/feature_view.py b/python/hsfs/feature_view.py index 6dbe7a585..b61b3e09a 100644 --- a/python/hsfs/feature_view.py +++ b/python/hsfs/feature_view.py @@ -337,7 +337,7 @@ def init_serving( Transformation statistics are fetched from training dataset and applied to the feature vector. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -520,7 +520,7 @@ def get_batch_query( def get_feature_vector( self, - entry: Dict[str, Any], + entry: Optional[Dict[str, Any]] = None, passed_features: Optional[Dict[str, Any]] = None, external: Optional[bool] = None, return_type: Literal["list", "polars", "numpy", "pandas"] = "list", @@ -592,7 +592,7 @@ def get_feature_vector( providing feature values which are not available in the feature store. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. 
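A hedged usage sketch of the relaxed `entry` argument, assuming a feature view handle `fv` whose features are all computed by on-demand transformation functions (the handle and request-parameter names are illustrative):

# Single vector: entry may now be omitted; request_parameters carries the
# inputs of the on-demand transformations instead.
vector = fv.get_feature_vector(
    request_parameters={"longitude": 12.57, "latitude": 55.68},
    return_type="list",
)

# Batch variant: one request-parameter dict per requested vector.
vectors = fv.get_feature_vectors(
    request_parameters=[
        {"longitude": 12.57, "latitude": 55.68},
        {"longitude": 2.35, "latitude": 48.86},
    ],
    return_type="list",
)
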
@@ -635,7 +635,7 @@ def get_feature_vector( def get_feature_vectors( self, - entry: List[Dict[str, Any]], + entry: Optional[List[Dict[str, Any]]] = None, passed_features: Optional[List[Dict[str, Any]]] = None, external: Optional[bool] = None, return_type: Literal["list", "polars", "numpy", "pandas"] = "list", @@ -705,7 +705,7 @@ def get_feature_vectors( providing feature values which are not available in the feature store. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -777,7 +777,7 @@ def get_inference_helper( Set of required primary keys is [`feature_view.primary_keys`](#primary_keys) external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -835,7 +835,7 @@ def get_inference_helpers( Set of required primary keys is [`feature_view.primary_keys`](#primary_keys) external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -912,7 +912,7 @@ def find_neighbors( filter: A filter expression to restrict the search space (optional). external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -3567,7 +3567,7 @@ def transform( feature_vector: `Union[List[Any], List[List[Any]], pd.DataFrame, pl.DataFrame]`. The feature vector to be transformed. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. 
Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. diff --git a/python/hsfs/storage_connector.py b/python/hsfs/storage_connector.py index 7ed887cd9..15ccdc8d6 100644 --- a/python/hsfs/storage_connector.py +++ b/python/hsfs/storage_connector.py @@ -369,6 +369,7 @@ def prepare_spark(self, path: Optional[str] = None) -> Optional[str]: # Arguments path: Path to prepare for reading from cloud storage. Defaults to `None`. """ + self.refetch() return engine.get_instance().setup_storage_connector(self, path) def connector_options(self) -> Dict[str, Any]: diff --git a/python/hsfs/training_dataset.py b/python/hsfs/training_dataset.py index 94688b692..7d9e89ec8 100644 --- a/python/hsfs/training_dataset.py +++ b/python/hsfs/training_dataset.py @@ -1007,7 +1007,7 @@ def init_prepared_statement( initialised for retrieving serving vectors as a batch. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -1024,7 +1024,7 @@ def get_serving_vector( serving application. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. @@ -1046,7 +1046,7 @@ def get_serving_vectors( serving application. external: boolean, optional. If set to True, the connection to the online feature store is established using the same host as - for the `host` parameter in the [`hsfs.connection()`](connection_api.md#connection) method. + for the `host` parameter in the [`hopsworks.login()`](login.md#login) method. If set to False, the online feature store storage connector is used which relies on the private IP. Defaults to True if connection to Hopsworks is established from external environment (e.g AWS Sagemaker or Google Colab), otherwise to False. diff --git a/python/hsml/core/hdfs_api.py b/python/hsml/core/hdfs_api.py new file mode 100644 index 000000000..d786bce37 --- /dev/null +++ b/python/hsml/core/hdfs_api.py @@ -0,0 +1,93 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from __future__ import annotations + +import os + + +class HdfsApi: + def __init__(self): + + import fsspec.implementations.arrow as pfs + + host, port = os.environ["LIBHDFS_DEFAULT_FS"].split(":") + + self._hopsfs = pfs.HadoopFileSystem( + host=host, + port=int(port), + user=os.environ["LIBHDFS_DEFAULT_USER"], + ) + + DEFAULT_BUFFER_SIZE = 0 + + def upload( + self, + local_path: str, + upload_path: str, + overwrite: bool = False, + buffer_size: int = DEFAULT_BUFFER_SIZE, + ): + """Upload file/directory to the Hopsworks filesystem. + :param local_path: local path to file to upload + :type local_path: str + :param upload_path: path to directory where to upload the file in Hopsworks filesystem + :type upload_path: str + :param overwrite: overwrite file if exists + :type overwrite: bool + :param buffer_size: size of the temporary read and write buffer. Defaults to 0. + :type buffer_size: int + """ + # local path could be absolute or relative, + if not os.path.isabs(local_path) and os.path.exists( + os.path.join(os.getcwd(), local_path) + ): + local_path = os.path.join(os.getcwd(), local_path) + + _, file_name = os.path.split(local_path) + + destination_path = upload_path + "/" + file_name + + if self._hopsfs.exists(destination_path): + if overwrite: + self._hopsfs.rm(destination_path, recursive=True) + else: + raise Exception( + "{} already exists, set overwrite=True to overwrite it".format( + local_path + ) + ) + + self._hopsfs.upload( + lpath=local_path, + rpath=destination_path, + recursive=True, + buffer_size=buffer_size, + ) + + return upload_path + "/" + os.path.basename(local_path) + + def download(self, path, local_path, buffer_size=DEFAULT_BUFFER_SIZE): + """Download file/directory on a path in datasets. + :param path: path to download + :type path: str + :param local_path: path to download in datasets + :type local_path: str + :param buffer_size: size of the temporary read and write buffer. Defaults to 0. + :type buffer_size: int + """ + + self._hopsfs.download(path, local_path, recursive=True, buffer_size=buffer_size) diff --git a/python/hsml/core/serving_api.py b/python/hsml/core/serving_api.py index 92d947728..9a124465d 100644 --- a/python/hsml/core/serving_api.py +++ b/python/hsml/core/serving_api.py @@ -419,4 +419,7 @@ def _get_hopsworks_inference_path(self, project_id: int, deployment_instance): ] def _get_istio_inference_path(self, deployment_instance): + if deployment_instance.model_server == "VLLM": + return ["openai", "v1", "completions"] + return ["v1", "models", deployment_instance.name + ":predict"] diff --git a/python/hsml/deployment.py b/python/hsml/deployment.py index 6999acc41..9c98b4e94 100644 --- a/python/hsml/deployment.py +++ b/python/hsml/deployment.py @@ -66,7 +66,7 @@ def __init__( self._model_registry_id = None @usage.method_logger - def save(self, await_update: Optional[int] = 60): + def save(self, await_update: Optional[int] = 120): """Persist this deployment including the predictor and metadata to Model Serving. 
# Arguments @@ -78,7 +78,7 @@ def save(self, await_update: Optional[int] = 60): self._serving_engine.save(self, await_update) @usage.method_logger - def start(self, await_running: Optional[int] = 60): + def start(self, await_running: Optional[int] = 120): """Start the deployment # Arguments @@ -90,7 +90,7 @@ def start(self, await_running: Optional[int] = 60): self._serving_engine.start(self, await_status=await_running) @usage.method_logger - def stop(self, await_stopped: Optional[int] = 60): + def stop(self, await_stopped: Optional[int] = 120): """Stop the deployment # Arguments @@ -218,10 +218,14 @@ def get_model(self): ) @usage.method_logger - def download_artifact(self): - """Download the model artifact served by the deployment""" + def download_artifact_files(self, local_path=None): + """Download the artifact files served by the deployment - return self._serving_engine.download_artifact(self) + # Arguments + local_path: path where to download the artifact files in the local filesystem + """ + + return self._serving_engine.download_artifact_files(self, local_path=local_path) def get_logs(self, component="predictor", tail=10): """Prints the deployment logs of the predictor or transformer. @@ -372,9 +376,15 @@ def artifact_version(self): def artifact_version(self, artifact_version: Union[int, str]): self._predictor.artifact_version = artifact_version + @property + def artifact_files_path(self): + """Path of the artifact files deployed by the predictor.""" + return self._predictor.artifact_files_path + @property def artifact_path(self): """Path of the model artifact deployed by the predictor.""" + # TODO: deprecated return self._predictor.artifact_path @property diff --git a/python/hsml/engine/local_engine.py b/python/hsml/engine/local_engine.py index 7b669a249..d703002da 100644 --- a/python/hsml/engine/local_engine.py +++ b/python/hsml/engine/local_engine.py @@ -17,7 +17,7 @@ import os from hsml import client -from hsml.core import dataset_api, model_api +from hsml.core import dataset_api, hdfs_api, model_api class LocalEngine: @@ -25,6 +25,11 @@ def __init__(self): self._dataset_api = dataset_api.DatasetApi() self._model_api = model_api.ModelApi() + try: + self._hdfs_api = hdfs_api.HdfsApi() + except Exception: + self._hdfs_api = None + def mkdir(self, remote_path: str): remote_path = self._prepend_project_path(remote_path) self._dataset_api.mkdir(remote_path) @@ -38,26 +43,55 @@ def upload(self, local_path: str, remote_path: str, upload_configuration=None): # Initialize the upload configuration to empty dictionary if is None upload_configuration = upload_configuration if upload_configuration else {} - self._dataset_api.upload( - local_path, - remote_path, - chunk_size=upload_configuration.get( - "chunk_size", self._dataset_api.DEFAULT_UPLOAD_FLOW_CHUNK_SIZE - ), - simultaneous_uploads=upload_configuration.get( - "simultaneous_uploads", - self._dataset_api.DEFAULT_UPLOAD_SIMULTANEOUS_UPLOADS, - ), - max_chunk_retries=upload_configuration.get( - "max_chunk_retries", - self._dataset_api.DEFAULT_UPLOAD_MAX_CHUNK_RETRIES, - ), - ) - def download(self, remote_path: str, local_path: str): + if self._hdfs_api is not None: + # use the hdfs client if available + self._hdfs_api.upload( + local_path=local_path, + upload_path=remote_path, + buffer_size=upload_configuration.get( + "buffer_size", self._hdfs_api.DEFAULT_BUFFER_SIZE + ), + ) + else: + # otherwise, use the REST API + self._dataset_api.upload( + local_path, + remote_path, + chunk_size=upload_configuration.get( + "chunk_size", 
self._dataset_api.DEFAULT_UPLOAD_FLOW_CHUNK_SIZE + ), + simultaneous_uploads=upload_configuration.get( + "simultaneous_uploads", + self._dataset_api.DEFAULT_UPLOAD_SIMULTANEOUS_UPLOADS, + ), + max_chunk_retries=upload_configuration.get( + "max_chunk_retries", + self._dataset_api.DEFAULT_UPLOAD_MAX_CHUNK_RETRIES, + ), + ) + + def download(self, remote_path: str, local_path: str, download_configuration=None): local_path = self._get_abs_path(local_path) remote_path = self._prepend_project_path(remote_path) - self._dataset_api.download(remote_path, local_path) + + # Initialize the download configuration to empty dictionary if is None + download_configuration = ( + download_configuration if download_configuration else {} + ) + + if self._hdfs_api is not None: + # use the hdfs client if available + self._hdfs_api.download( + path=remote_path, + local_path=local_path, + buffer_size=download_configuration.get( + "buffer_size", self._hdfs_api.DEFAULT_BUFFER_SIZE + ), + ) + else: + # otherwise, use the REST API + self._dataset_api.download(remote_path, local_path) def copy(self, source_path, destination_path): source_path = self._prepend_project_path(source_path) diff --git a/python/hsml/engine/model_engine.py b/python/hsml/engine/model_engine.py index 29a1a0234..bb6312f66 100644 --- a/python/hsml/engine/model_engine.py +++ b/python/hsml/engine/model_engine.py @@ -81,11 +81,11 @@ def _upload_additional_resources(self, model_instance): return model_instance def _copy_or_move_hopsfs_model_item( - self, item_attr, to_model_version_path, keep_original_files + self, item_attr, to_model_files_path, keep_original_files ): """Copy or move model item from a hdfs path to the model version folder in the Models dataset. It works with files and folders.""" path = item_attr["path"] - to_hdfs_path = os.path.join(to_model_version_path, os.path.basename(path)) + to_hdfs_path = os.path.join(to_model_files_path, os.path.basename(path)) if keep_original_files: self._engine.copy(path, to_hdfs_path) else: @@ -94,7 +94,7 @@ def _copy_or_move_hopsfs_model_item( def _copy_or_move_hopsfs_model( self, from_hdfs_model_path, - to_model_version_path, + to_model_files_path, keep_original_files, update_upload_progress, ): @@ -123,7 +123,7 @@ def _copy_or_move_hopsfs_model( )["items"]: path_attr = entry["attributes"] self._copy_or_move_hopsfs_model_item( - path_attr, to_model_version_path, keep_original_files + path_attr, to_model_files_path, keep_original_files ) if path_attr.get("dir", False): n_dirs += 1 @@ -133,7 +133,7 @@ def _copy_or_move_hopsfs_model( else: # if path is a file, copy/move it self._copy_or_move_hopsfs_model_item( - model_path_attr, to_model_version_path, keep_original_files + model_path_attr, to_model_files_path, keep_original_files ) n_files += 1 update_upload_progress(n_dirs=n_dirs, n_files=n_files) @@ -157,7 +157,9 @@ def _download_model_from_hopsfs_recursive( if path_attr.get("dir", False): # otherwise, make a recursive call for the folder - if basename == "Artifacts": + if ( + basename == constants.MODEL_SERVING.ARTIFACTS_DIR_NAME + ): # TODO: Not needed anymore continue # skip Artifacts subfolder local_folder_path = os.path.join(to_local_path, basename) os.mkdir(local_folder_path) @@ -196,11 +198,11 @@ def _download_model_from_hopsfs( def _upload_local_model( self, from_local_model_path, - to_model_version_path, + to_model_files_path, update_upload_progress, upload_configuration=None, ): - """Copy or upload model files from a local path to the model version folder in the Models dataset.""" + """Copy or 
upload model files from a local path to the model files folder in the Models dataset.""" n_dirs, n_files = 0, 0 if os.path.isdir(from_local_model_path): # if path is a dir, upload files and folders iteratively @@ -211,8 +213,8 @@ def _upload_local_model( # - files is the list of file names present in the root dir # we need to replace the local path prefix with the hdfs path prefix (i.e., /srv/hops/....../root with /Projects/.../) remote_base_path = root.replace( - from_local_model_path, to_model_version_path - ) + from_local_model_path, to_model_files_path + ).replace(os.sep, "/") for d_name in dirs: self._engine.mkdir(remote_base_path + "/" + d_name) n_dirs += 1 @@ -229,7 +231,7 @@ def _upload_local_model( # if path is a file, upload file self._engine.upload( from_local_model_path, - to_model_version_path, + to_model_files_path, upload_configuration=upload_configuration, ) n_files += 1 @@ -250,14 +252,14 @@ def _save_model_from_local_or_hopsfs_mount( from_hdfs_model_path=model_path.replace( constants.MODEL_REGISTRY.HOPSFS_MOUNT_PREFIX, "" ), - to_model_version_path=model_instance.version_path, + to_model_files_path=model_instance.model_files_path, keep_original_files=keep_original_files, update_upload_progress=update_upload_progress, ) else: self._upload_local_model( from_local_model_path=model_path, - to_model_version_path=model_instance.version_path, + to_model_files_path=model_instance.model_files_path, update_upload_progress=update_upload_progress, upload_configuration=upload_configuration, ) @@ -366,6 +368,7 @@ def save( if step["id"] == 0: # Create folders self._engine.mkdir(model_instance.version_path) + self._engine.mkdir(model_instance.model_files_path) if step["id"] == 1: def update_upload_progress(n_dirs=0, n_files=0, step=step): @@ -375,7 +378,7 @@ def update_upload_progress(n_dirs=0, n_files=0, step=step): update_upload_progress(n_dirs=0, n_files=0) - # Upload Model files from local path to /Models/{model_instance._name}/{model_instance._version} + # Upload Model files from local path to /Models/{model_instance._name}/{model_instance._version}/Files # check local absolute if os.path.isabs(model_path) and os.path.exists(model_path): self._save_model_from_local_or_hopsfs_mount( @@ -402,7 +405,7 @@ def update_upload_progress(n_dirs=0, n_files=0, step=step): ): # check hdfs relative and absolute self._copy_or_move_hopsfs_model( from_hdfs_model_path=model_path, - to_model_version_path=model_instance.version_path, + to_model_files_path=model_instance.model_files_path, keep_original_files=keep_original_files, update_upload_progress=update_upload_progress, ) @@ -432,12 +435,13 @@ def update_upload_progress(n_dirs=0, n_files=0, step=step): return model_instance - def download(self, model_instance): - model_name_path = os.path.join( - tempfile.gettempdir(), str(uuid.uuid4()), model_instance._name - ) - model_version_path = model_name_path + "/" + str(model_instance._version) - os.makedirs(model_version_path) + def download(self, model_instance, local_path=None): + if local_path is None: + local_path = os.path.join( + tempfile.gettempdir(), str(uuid.uuid4()), model_instance._name + ) + local_path = local_path + "/" + str(model_instance._version) + os.makedirs(local_path, exist_ok=True) def update_download_progress(n_dirs, n_files, done=False): print( @@ -447,20 +451,20 @@ def update_download_progress(n_dirs, n_files, done=False): ) try: - from_hdfs_model_path = model_instance.version_path + from_hdfs_model_path = model_instance.model_files_path if 
from_hdfs_model_path.startswith("hdfs:/"): projects_index = from_hdfs_model_path.find("/Projects", 0) from_hdfs_model_path = from_hdfs_model_path[projects_index:] self._download_model_from_hopsfs( from_hdfs_model_path=from_hdfs_model_path, - to_local_path=model_version_path, + to_local_path=local_path, update_download_progress=update_download_progress, ) except BaseException as be: raise be - return model_version_path + return local_path def read_file(self, model_instance, resource): hdfs_resource_path = self._build_resource_path( diff --git a/python/hsml/engine/serving_engine.py b/python/hsml/engine/serving_engine.py index 1151fb79b..4bc377ae5 100644 --- a/python/hsml/engine/serving_engine.py +++ b/python/hsml/engine/serving_engine.py @@ -15,17 +15,23 @@ # import os +import tempfile import time import uuid from typing import Dict, List, Union -from hopsworks_common import util from hopsworks_common.client.exceptions import ModelServingException, RestAPIError from hopsworks_common.client.istio.utils.infer_type import InferInput -from hopsworks_common.constants import DEPLOYMENT, PREDICTOR, PREDICTOR_STATE +from hopsworks_common.constants import ( + DEPLOYMENT, + MODEL_SERVING, + PREDICTOR, + PREDICTOR_STATE, +) from hopsworks_common.constants import INFERENCE_ENDPOINTS as IE from hopsworks_common.core import dataset_api from hsml.core import serving_api +from hsml.engine import local_engine from tqdm.auto import tqdm @@ -46,6 +52,8 @@ def __init__(self): self._serving_api = serving_api.ServingApi() self._dataset_api = dataset_api.DatasetApi() + self._engine = local_engine.LocalEngine() + def _poll_deployment_status( self, deployment_instance, status: str, await_status: int, update_progress=None ): @@ -299,7 +307,64 @@ def _get_stopped_instances(self, available_instances, requested_instances): num_instances = requested_instances - available_instances return num_instances if num_instances >= 0 else 0 - def download_artifact(self, deployment_instance): + def _download_files_from_hopsfs_recursive( + self, + from_hdfs_path: str, + to_local_path: str, + update_download_progress, + n_dirs, + n_files, + ): + """Download model files from a model path in hdfs, recursively""" + + for entry in self._dataset_api.list(from_hdfs_path, sort_by="NAME:desc")[ + "items" + ]: + path_attr = entry["attributes"] + path = path_attr["path"] + basename = os.path.basename(path) + + if path_attr.get("dir", False): + # otherwise, make a recursive call for the folder + if ( + basename == MODEL_SERVING.ARTIFACTS_DIR_NAME + ): # TODO: Not needed anymore + continue # skip Artifacts subfolder + local_folder_path = os.path.join(to_local_path, basename) + os.mkdir(local_folder_path) + n_dirs, n_files = self._download_files_from_hopsfs_recursive( + from_hdfs_path=path, + to_local_path=local_folder_path, + update_download_progress=update_download_progress, + n_dirs=n_dirs, + n_files=n_files, + ) + n_dirs += 1 + update_download_progress(n_dirs=n_dirs, n_files=n_files) + else: + # if it's a file, download it + local_file_path = os.path.join(to_local_path, basename) + self._engine.download(path, local_file_path) + n_files += 1 + update_download_progress(n_dirs=n_dirs, n_files=n_files) + + return n_dirs, n_files + + def _download_files_from_hopsfs( + self, from_hdfs_path: str, to_local_path: str, update_download_progress + ): + """Download files from a model path in hdfs.""" + + n_dirs, n_files = self._download_files_from_hopsfs_recursive( + from_hdfs_path=from_hdfs_path, + to_local_path=to_local_path, + 
update_download_progress=update_download_progress, + n_dirs=0, + n_files=0, + ) + update_download_progress(n_dirs=n_dirs, n_files=n_files, done=True) + + def download_artifact_files(self, deployment_instance, local_path=None): if deployment_instance.id is None: raise ModelServingException( "Deployment is not created yet. To create the deployment use `.save()`" @@ -311,30 +376,39 @@ def download_artifact(self, deployment_instance): Download the model files by using `model.download()`" ) - from_artifact_zip_path = deployment_instance.artifact_path - to_artifacts_path = os.path.join( - os.getcwd(), - str(uuid.uuid4()), - deployment_instance.model_name, - str(deployment_instance.model_version), - "Artifacts", - ) - to_artifact_version_path = ( - to_artifacts_path + "/" + str(deployment_instance.artifact_version) - ) - to_artifact_zip_path = to_artifact_version_path + ".zip" + if local_path is None: + local_path = os.path.join( + tempfile.gettempdir(), + str(uuid.uuid4()), + deployment_instance.model_name, + str(deployment_instance.model_version), + MODEL_SERVING.ARTIFACTS_DIR_NAME, + str(deployment_instance.artifact_version), + ) + os.makedirs(local_path, exist_ok=True) - os.makedirs(to_artifacts_path) + def update_download_progress(n_dirs, n_files, done=False): + print( + "Downloading artifact files (%s dirs, %s files)... %s" + % (n_dirs, n_files, "DONE" if done else ""), + end="\r", + ) try: - self._dataset_api.download(from_artifact_zip_path, to_artifact_zip_path) - util.decompress(to_artifact_zip_path, extract_dir=to_artifacts_path) - os.remove(to_artifact_zip_path) - finally: - if os.path.exists(to_artifact_zip_path): - os.remove(to_artifact_zip_path) + from_hdfs_path = deployment_instance.artifact_files_path + if from_hdfs_path.startswith("hdfs:/"): + projects_index = from_hdfs_path.find("/Projects", 0) + from_hdfs_path = from_hdfs_path[projects_index:] + + self._download_files_from_hopsfs( + from_hdfs_path=from_hdfs_path, + to_local_path=local_path, + update_download_progress=update_download_progress, + ) + except BaseException as be: + raise be - return to_artifact_version_path + return local_path def create(self, deployment_instance): try: @@ -488,7 +562,10 @@ def predict( inputs: Union[Dict, List[Dict]], ): # validate user-provided payload - self._validate_inference_payload(deployment_instance.api_protocol, data, inputs) + if deployment_instance.model_server != "VLLM": + self._validate_inference_payload( + deployment_instance.api_protocol, data, inputs + ) # build inference payload based on API protocol payload = self._build_inference_payload( diff --git a/python/hsml/llm/__init__.py b/python/hsml/llm/__init__.py new file mode 100644 index 000000000..ff8055b9b --- /dev/null +++ b/python/hsml/llm/__init__.py @@ -0,0 +1,15 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# diff --git a/python/hsml/llm/model.py b/python/hsml/llm/model.py new file mode 100644 index 000000000..b52cf6398 --- /dev/null +++ b/python/hsml/llm/model.py @@ -0,0 +1,75 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import humps +from hsml.constants import MODEL +from hsml.model import Model + + +class Model(Model): + """Metadata object representing a LLM model in the Model Registry.""" + + def __init__( + self, + id, + name, + version=None, + created=None, + creator=None, + environment=None, + description=None, + project_name=None, + metrics=None, + program=None, + user_full_name=None, + model_schema=None, + training_dataset=None, + input_example=None, + model_registry_id=None, + tags=None, + href=None, + feature_view=None, + training_dataset_version=None, + **kwargs, + ): + super().__init__( + id, + name, + version=version, + created=created, + creator=creator, + environment=environment, + description=description, + project_name=project_name, + metrics=metrics, + program=program, + user_full_name=user_full_name, + model_schema=model_schema, + training_dataset=training_dataset, + input_example=input_example, + framework=MODEL.FRAMEWORK_LLM, + model_registry_id=model_registry_id, + feature_view=feature_view, + training_dataset_version=training_dataset_version, + ) + + def update_from_response_json(self, json_dict): + json_decamelized = humps.decamelize(json_dict) + json_decamelized.pop("framework") + if "type" in json_decamelized: # backwards compatibility + _ = json_decamelized.pop("type") + self.__init__(**json_decamelized) + return self diff --git a/python/hsml/llm/predictor.py b/python/hsml/llm/predictor.py new file mode 100644 index 000000000..814edc522 --- /dev/null +++ b/python/hsml/llm/predictor.py @@ -0,0 +1,28 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from hsml.constants import MODEL, PREDICTOR +from hsml.predictor import Predictor + + +class Predictor(Predictor): + """Configuration for a predictor running with the vLLM backend""" + + def __init__(self, **kwargs): + kwargs["model_framework"] = MODEL.FRAMEWORK_LLM + kwargs["model_server"] = PREDICTOR.MODEL_SERVER_VLLM + + super().__init__(**kwargs) diff --git a/python/hsml/llm/signature.py b/python/hsml/llm/signature.py new file mode 100644 index 000000000..05ff003eb --- /dev/null +++ b/python/hsml/llm/signature.py @@ -0,0 +1,79 @@ +# +# Copyright 2024 Hopsworks AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional, Union + +import numpy +import pandas +from hopsworks_common import usage +from hsml.llm.model import Model +from hsml.model_schema import ModelSchema + + +_mr = None + + +@usage.method_logger +def create_model( + name: str, + version: Optional[int] = None, + metrics: Optional[dict] = None, + description: Optional[str] = None, + input_example: Optional[ + Union[pandas.DataFrame, pandas.Series, numpy.ndarray, list] + ] = None, + model_schema: Optional[ModelSchema] = None, + feature_view=None, + training_dataset_version: Optional[int] = None, +): + """Create an LLM model metadata object. + + !!! note "Lazy" + This method is lazy and does not persist any metadata or uploads model artifacts in the + model registry on its own. To save the model object and the model artifacts, call the `save()` method with a + local file path to the directory containing the model artifacts. + + # Arguments + name: Name of the model to create. + version: Optionally version of the model to create, defaults to `None` and + will create the model with incremented version from the last + version in the model registry. + metrics: Optionally a dictionary with model evaluation metrics (e.g., accuracy, MAE) + description: Optionally a string describing the model, defaults to empty string + `""`. + input_example: Optionally an input example that represents a single input for the model, defaults to `None`. + model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. + + # Returns + `Model`. The model metadata object. 
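A minimal usage sketch of the new `mr.llm.create_model()` entry point documented above, assuming an existing model registry handle `mr` and a local directory containing the model weights (names and metric values are illustrative only):

    mr = project.get_model_registry()      # existing Hopsworks project handle assumed
    llm_model = mr.llm.create_model(
        name="llama_summarizer",           # illustrative model name
        metrics={"rougeL": 0.41},          # optional evaluation metrics
    )
    llm_model.save("./llm_weights")        # persists the metadata and uploads the artifacts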
+ """ + model = Model( + id=None, + name=name, + version=version, + description=description, + metrics=metrics, + input_example=input_example, + model_schema=model_schema, + feature_view=feature_view, + training_dataset_version=training_dataset_version, + ) + model._shared_registry_project_name = _mr.shared_registry_project_name + model._model_registry_id = _mr.model_registry_id + + return model diff --git a/python/hsml/model.py b/python/hsml/model.py index 3e39a7b26..2c897b50d 100644 --- a/python/hsml/model.py +++ b/python/hsml/model.py @@ -17,19 +17,22 @@ import json import logging import os +import re import warnings from typing import Any, Dict, Optional, Union import humps from hopsworks_common import client, usage, util -from hopsworks_common.constants import ARTIFACT_VERSION +from hopsworks_common.constants import ARTIFACT_VERSION, MODEL_REGISTRY from hopsworks_common.constants import INFERENCE_ENDPOINTS as IE from hsml.core import explicit_provenance from hsml.engine import model_engine from hsml.inference_batcher import InferenceBatcher from hsml.inference_logger import InferenceLogger +from hsml.model_schema import ModelSchema from hsml.predictor import Predictor from hsml.resources import PredictorResources +from hsml.schema import Schema from hsml.transformer import Transformer @@ -53,7 +56,6 @@ def __init__( program=None, user_full_name=None, model_schema=None, - training_dataset=None, input_example=None, framework=None, model_registry_id=None, @@ -83,7 +85,6 @@ def __init__( self._input_example = input_example self._framework = framework self._model_schema = model_schema - self._training_dataset = training_dataset # This is needed for update_from_response_json function to not overwrite name of the shared registry this model originates from if not hasattr(self, "_shared_registry_project_name"): @@ -94,17 +95,6 @@ def __init__( self._model_engine = model_engine.ModelEngine() self._feature_view = feature_view self._training_dataset_version = training_dataset_version - if training_dataset_version is None and feature_view is not None: - if feature_view.get_last_accessed_training_dataset() is not None: - self._training_dataset_version = ( - feature_view.get_last_accessed_training_dataset() - ) - else: - warnings.warn( - "Provenance cached data - feature view provided, but training dataset version is missing", - util.ProvenanceWarning, - stacklevel=1, - ) @usage.method_logger def save( @@ -130,6 +120,39 @@ def save( # Returns `Model`: The model metadata object. 
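A hedged sketch of the save-time defaults introduced in the hunk below: when a feature view is attached and a training dataset has been read in the same session, `save()` fills in the training dataset version and infers the model schema from the training dataset features. The handles `mr` and `fv` and the names used here are assumptions for illustration:

    # `fv` is a feature view whose training data was already read in this session (assumed)
    model = mr.python.create_model(name="fraud_clf", feature_view=fv)
    model.save("./model_dir")   # training_dataset_version and model_schema are inferred here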
""" + if self._training_dataset_version is None and self._feature_view is not None: + if self._feature_view.get_last_accessed_training_dataset() is not None: + self._training_dataset_version = ( + self._feature_view.get_last_accessed_training_dataset() + ) + else: + warnings.warn( + "Provenance cached data - feature view provided, but training dataset version is missing", + util.ProvenanceWarning, + stacklevel=1, + ) + if self._model_schema is None: + if ( + self._feature_view is not None + and self._training_dataset_version is not None + ): + all_features = self._feature_view.get_training_dataset_schema( + self._training_dataset_version + ) + features, labels = [], [] + for feature in all_features: + (labels if feature.label else features).append(feature.to_dict()) + self._model_schema = ModelSchema( + input_schema=Schema(features) if features else None, + output_schema=Schema(labels) if labels else None, + ) + else: + warnings.warn( + "Model schema cannot not be inferred without both the feature view and the training dataset version.", + util.ProvenanceWarning, + stacklevel=1, + ) + return self._model_engine.save( model_instance=self, model_path=model_path, @@ -139,13 +162,15 @@ def save( ) @usage.method_logger - def download(self): + def download(self, local_path=None): """Download the model files. + # Arguments + local_path: path where to download the model files in the local filesystem # Returns `str`: Absolute path to local folder containing the model files. """ - return self._model_engine.download(model_instance=self) + return self._model_engine.download(model_instance=self, local_path=local_path) @usage.method_logger def delete(self): @@ -211,7 +236,7 @@ def deploy( """ if name is None: - name = self._name + name = self._get_default_serving_name() predictor = Predictor.for_model( self, @@ -341,6 +366,9 @@ def get_training_dataset_provenance(self): """ return self._model_engine.get_training_dataset_provenance(model_instance=self) + def _get_default_serving_name(self): + return re.sub(r"[^a-zA-Z0-9]", "", self._name) + @classmethod def from_response_json(cls, json_dict): json_decamelized = humps.decamelize(json_dict) @@ -372,7 +400,6 @@ def to_dict(self): "inputExample": self._input_example, "framework": self._framework, "metrics": self._training_metrics, - "trainingDataset": self._training_dataset, "environment": self._environment, "program": self._program, "featureView": util.feature_view_to_json(self._feature_view), @@ -507,15 +534,6 @@ def model_schema(self): def model_schema(self, model_schema): self._model_schema = model_schema - @property - def training_dataset(self): - """training_dataset of the model.""" - return self._training_dataset - - @training_dataset.setter - def training_dataset(self, training_dataset): - self._training_dataset = training_dataset - @property def project_name(self): """project_name of the model.""" @@ -544,6 +562,14 @@ def version_path(self): """path of the model including version folder. Resolves to /Projects/{project_name}/Models/{name}/{version}""" return "{}/{}".format(self.model_path, str(self.version)) + @property + def model_files_path(self): + """path of the model files including version and files folder. 
Resolves to /Projects/{project_name}/Models/{name}/{version}/Files""" + return "{}/{}".format( + self.version_path, + MODEL_REGISTRY.MODEL_FILES_DIR_NAME, + ) + @property def shared_registry_project_name(self): """shared_registry_project_name of the model.""" diff --git a/python/hsml/model_registry.py b/python/hsml/model_registry.py index cfd9136aa..9309eb7c4 100644 --- a/python/hsml/model_registry.py +++ b/python/hsml/model_registry.py @@ -19,6 +19,7 @@ import humps from hopsworks_common import usage, util from hsml.core import model_api +from hsml.llm import signature as llm_signature # noqa: F401 from hsml.python import signature as python_signature # noqa: F401 from hsml.sklearn import signature as sklearn_signature # noqa: F401 from hsml.tensorflow import signature as tensorflow_signature # noqa: F401 @@ -48,11 +49,13 @@ def __init__( self._python = python_signature self._sklearn = sklearn_signature self._torch = torch_signature + self._llm = llm_signature tensorflow_signature._mr = self python_signature._mr = self sklearn_signature._mr = self torch_signature._mr = self + llm_signature._mr = self @classmethod def from_response_json(cls, json_dict): @@ -190,6 +193,12 @@ def python(self): return python_signature + @property + def llm(self): + """Module for exporting a Large Language Model.""" + + return llm_signature + def __repr__(self): project_name = ( self._shared_registry_project_name diff --git a/python/hsml/model_serving.py b/python/hsml/model_serving.py index b58942ba7..2d24d2b20 100644 --- a/python/hsml/model_serving.py +++ b/python/hsml/model_serving.py @@ -125,7 +125,7 @@ def get_deployments(self, model: Model = None, status: str = None): `RestAPIError`: If unable to retrieve deployments from model serving. """ - model_name = model.name if model is not None else None + model_name = model._get_default_serving_name() if model is not None else None if status is not None: self._validate_deployment_status(status) @@ -207,7 +207,7 @@ def create_predictor( """ if name is None: - name = model.name + name = model._get_default_serving_name() return Predictor.for_model( model, diff --git a/python/hsml/predictor.py b/python/hsml/predictor.py index b7f02b66b..31c6aa138 100644 --- a/python/hsml/predictor.py +++ b/python/hsml/predictor.py @@ -22,6 +22,7 @@ ARTIFACT_VERSION, INFERENCE_ENDPOINTS, MODEL, + MODEL_SERVING, PREDICTOR, RESOURCES, Default, @@ -168,18 +169,22 @@ def _validate_serving_tool(cls, serving_tool): @classmethod def _validate_script_file(cls, model_framework, script_file): - if model_framework == MODEL.FRAMEWORK_PYTHON and script_file is None: + if script_file is None and ( + model_framework == MODEL.FRAMEWORK_PYTHON + or model_framework == MODEL.FRAMEWORK_LLM + ): raise ValueError( - "Predictor scripts are required in deployments for custom Python models" + "Predictor scripts are required in deployments for custom Python models and LLMs." 
) @classmethod def _infer_model_server(cls, model_framework): - return ( - PREDICTOR.MODEL_SERVER_TF_SERVING - if model_framework == MODEL.FRAMEWORK_TENSORFLOW - else PREDICTOR.MODEL_SERVER_PYTHON - ) + if model_framework == MODEL.FRAMEWORK_TENSORFLOW: + return PREDICTOR.MODEL_SERVER_TF_SERVING + elif model_framework == MODEL.FRAMEWORK_LLM: + return PREDICTOR.MODEL_SERVER_VLLM + else: + return PREDICTOR.MODEL_SERVER_PYTHON @classmethod def _get_default_serving_tool(cls): @@ -392,9 +397,19 @@ def artifact_version(self): def artifact_version(self, artifact_version: Union[int, str]): self._artifact_version = artifact_version + @property + def artifact_files_path(self): + return "{}/{}/{}/{}".format( + self._model_path, + str(self._model_version), + MODEL_SERVING.ARTIFACTS_DIR_NAME, + str(self._artifact_version), + ) + @property def artifact_path(self): """Path of the model artifact deployed by the predictor. Resolves to /Projects/{project_name}/Models/{name}/{version}/Artifacts/{artifact_version}/{name}_{version}_{artifact_version}.zip""" + # TODO: Deprecated artifact_name = "{}_{}_{}.zip".format( self._model_name, str(self._model_version), str(self._artifact_version) ) diff --git a/python/hsml/python/signature.py b/python/hsml/python/signature.py index 1bb5fa8f7..fa704aaab 100644 --- a/python/hsml/python/signature.py +++ b/python/hsml/python/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/sklearn/signature.py b/python/hsml/sklearn/signature.py index f8816febb..4c145a96a 100644 --- a/python/hsml/sklearn/signature.py +++ b/python/hsml/sklearn/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/tensorflow/signature.py b/python/hsml/tensorflow/signature.py index 1f83c5496..e24d20e65 100644 --- a/python/hsml/tensorflow/signature.py +++ b/python/hsml/tensorflow/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. 
model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/torch/signature.py b/python/hsml/torch/signature.py index 5234d110a..bab488974 100644 --- a/python/hsml/torch/signature.py +++ b/python/hsml/torch/signature.py @@ -56,6 +56,8 @@ def create_model( `""`. input_example: Optionally an input example that represents a single input for the model, defaults to `None`. model_schema: Optionally a model schema for the model inputs and/or outputs. + feature_view: Optionally a feature view object returned by querying the feature store. If the feature view is not provided, the model will not have access to provenance. + training_dataset_version: Optionally a training dataset version. If training dataset version is not provided, but the feature view is provided, the training dataset version used will be the last accessed training dataset of the feature view, within the code/notebook that reads the feature view and training dataset and then creates the model. # Returns `Model`. The model metadata object. diff --git a/python/hsml/utils/schema/columnar_schema.py b/python/hsml/utils/schema/columnar_schema.py index 3aa5fde0e..a7468401f 100644 --- a/python/hsml/utils/schema/columnar_schema.py +++ b/python/hsml/utils/schema/columnar_schema.py @@ -20,11 +20,6 @@ from hsml.utils.schema.column import Column -try: - import hsfs -except ImportError: - pass - try: import pyspark except ImportError: @@ -35,6 +30,10 @@ class ColumnarSchema: """Metadata object representing a columnar schema for a model.""" def __init__(self, columnar_obj=None): + from hsfs.training_dataset import ( + TrainingDataset, # import performed here to prevent circular dependencies when importing ModelSchema + ) + if isinstance(columnar_obj, list): self.columns = self._convert_list_to_schema(columnar_obj) elif isinstance(columnar_obj, pandas.DataFrame): @@ -45,9 +44,7 @@ def __init__(self, columnar_obj=None): columnar_obj, pyspark.sql.dataframe.DataFrame ): self.columns = self._convert_spark_to_schema(columnar_obj) - elif importlib.util.find_spec("hsfs") is not None and isinstance( - columnar_obj, hsfs.training_dataset.TrainingDataset - ): + elif isinstance(columnar_obj, TrainingDataset): self.columns = self._convert_td_to_schema(columnar_obj) else: raise TypeError( diff --git a/python/pyproject.toml b/python/pyproject.toml index a66d15115..6ff1e6a0c 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -54,7 +54,7 @@ dependencies = [ "opensearch-py>=1.1.0,<=2.4.2", "tqdm", "grpcio>=1.49.1,<2.0.0", # ^1.49.1 - "protobuf>=3.19.0,<4.0.0", # ^3.19.0 + "protobuf>=4.25.4,<5.0.0", # ^4.25.4 ] [project.optional-dependencies] diff --git a/python/tests/core/test_feature_group_engine.py b/python/tests/core/test_feature_group_engine.py index 91f1086ed..e57f2c0c3 100644 --- a/python/tests/core/test_feature_group_engine.py +++ b/python/tests/core/test_feature_group_engine.py @@ -56,6 +56,49 @@ def test_save(self, mocker): # Assert assert 
mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + def test_save_dataframe_transformation_functions(self, mocker): + # Arrange + feature_store_id = 99 + + mocker.patch("hsfs.engine.get_type") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine.save_feature_group_metadata" + ) + mocker.patch("hsfs.core.great_expectation_engine.GreatExpectationEngine") + + fg_engine = feature_group_engine.FeatureGroupEngine( + feature_store_id=feature_store_id + ) + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=feature_store_id, + primary_key=[], + partition_key=[], + transformation_functions=[test], + id=10, + ) + + # Act + fg_engine.save( + feature_group=fg, + feature_dataframe=None, + write_options=None, + ) + + # Assert + assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + assert ( + mock_engine_get_instance.return_value._apply_transformation_function.call_count + == 1 + ) + def test_save_ge_report(self, mocker): # Arrange feature_store_id = 99 @@ -143,6 +186,56 @@ def test_insert(self, mocker): assert mock_fg_api.return_value.delete_content.call_count == 0 assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + def test_insert_transformation_functions(self, mocker): + # Arrange + feature_store_id = 99 + + mocker.patch("hsfs.engine.get_type") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine.save_feature_group_metadata" + ) + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine._verify_schema_compatibility" + ) + mocker.patch("hsfs.core.great_expectation_engine.GreatExpectationEngine") + mock_fg_api = mocker.patch("hsfs.core.feature_group_api.FeatureGroupApi") + + fg_engine = feature_group_engine.FeatureGroupEngine( + feature_store_id=feature_store_id + ) + + @udf(int) + def test(feature): + return feature + 1 + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=feature_store_id, + transformation_functions=[test], + primary_key=[], + partition_key=[], + ) + + # Act + fg_engine.insert( + feature_group=fg, + feature_dataframe=None, + overwrite=None, + operation=None, + storage=None, + write_options=None, + ) + + # Assert + assert mock_fg_api.return_value.delete_content.call_count == 0 + assert mock_engine_get_instance.return_value.save_dataframe.call_count == 1 + assert ( + mock_engine_get_instance.return_value._apply_transformation_function.call_count + == 1 + ) + def test_insert_id(self, mocker): # Arrange feature_store_id = 99 @@ -709,7 +802,7 @@ def test_append_features(self, mocker): # Assert assert ( - mock_engine_get_instance.return_value.save_empty_dataframe.call_count == 1 + mock_engine_get_instance.return_value.update_table_schema.call_count == 1 ) assert len(mock_fg_engine_update_features_metadata.call_args[0][1]) == 4 @@ -909,6 +1002,59 @@ def test_insert_stream_stream(self, mocker): mock_engine_get_instance.return_value.save_stream_dataframe.call_count == 1 ) + def test_insert_stream_stream_transformation_functions(self, mocker): + # Arrange + feature_store_id = 99 + + mocker.patch("hsfs.engine.get_type") + mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") + mocker.patch( + "hsfs.core.feature_group_engine.FeatureGroupEngine.save_feature_group_metadata" + ) + mocker.patch( + 
"hsfs.core.feature_group_engine.FeatureGroupEngine._verify_schema_compatibility" + ) + + @udf(int) + def test(feature): + return feature + 1 + + fg_engine = feature_group_engine.FeatureGroupEngine( + feature_store_id=feature_store_id + ) + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=feature_store_id, + primary_key=[], + partition_key=[], + transformation_functions=[test], + stream=True, + ) + + # Act + fg_engine.insert_stream( + feature_group=fg, + dataframe=None, + query_name=None, + output_mode=None, + await_termination=None, + timeout=None, + checkpoint_dir=None, + write_options=None, + ) + + # Assert + assert mock_engine_get_instance.return_value.save_dataframe.call_count == 0 + assert ( + mock_engine_get_instance.return_value.save_stream_dataframe.call_count == 1 + ) + assert ( + mock_engine_get_instance.return_value._apply_transformation_function.call_count + == 1 + ) + def test_insert_stream_online_enabled_id(self, mocker): # Arrange feature_store_id = 99 diff --git a/python/tests/core/test_kafka_engine.py b/python/tests/core/test_kafka_engine.py index e6bb48297..88085689e 100644 --- a/python/tests/core/test_kafka_engine.py +++ b/python/tests/core/test_kafka_engine.py @@ -340,7 +340,7 @@ def test_kafka_get_offsets_high(self, mocker): ) # Assert - assert result == f" -initialCheckPointString {topic_name},0:11" + assert result == f"{topic_name},0:11" def test_kafka_get_offsets_low(self, mocker): # Arrange @@ -372,7 +372,7 @@ def test_kafka_get_offsets_low(self, mocker): ) # Assert - assert result == f" -initialCheckPointString {topic_name},0:0" + assert result == f"{topic_name},0:0" def test_kafka_get_offsets_no_topic(self, mocker): # Arrange diff --git a/python/tests/engine/test_python.py b/python/tests/engine/test_python.py index e921787be..ea83f618f 100644 --- a/python/tests/engine/test_python.py +++ b/python/tests/engine/test_python.py @@ -1450,52 +1450,6 @@ def test_save_dataframe(self, mocker): assert mock_python_engine_write_dataframe_kafka.call_count == 0 assert mock_python_engine_legacy_save_dataframe.call_count == 1 - def test_save_dataframe_transformation_functions(self, mocker): - # Arrange - mock_python_engine_write_dataframe_kafka = mocker.patch( - "hsfs.engine.python.Engine._write_dataframe_kafka" - ) - mock_python_engine_legacy_save_dataframe = mocker.patch( - "hsfs.engine.python.Engine.legacy_save_dataframe" - ) - mock_python_engine_apply_transformations = mocker.patch( - "hsfs.engine.python.Engine._apply_transformation_function" - ) - - python_engine = python.Engine() - - @udf(int) - def test(feature): - return feature + 1 - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - stream=False, - transformation_functions=[test], - ) - - # Act - python_engine.save_dataframe( - feature_group=fg, - dataframe=None, - operation=None, - online_enabled=None, - storage=None, - offline_write_options=None, - online_write_options=None, - validation_id=None, - ) - - # Assert - assert mock_python_engine_write_dataframe_kafka.call_count == 0 - assert mock_python_engine_legacy_save_dataframe.call_count == 1 - assert mock_python_engine_apply_transformations.call_count == 1 - def test_save_dataframe_stream(self, mocker): # Arrange mock_python_engine_write_dataframe_kafka = mocker.patch( @@ -2565,15 +2519,22 @@ def test_save_stream_dataframe(self): == "Stream ingestion is not available on Python environments, because it requires Spark as engine." 
) - def test_save_empty_dataframe(self): + def test_update_table_schema(self, mocker): # Arrange + mock_fg_api = mocker.patch("hsfs.core.feature_group_api.FeatureGroupApi") + python_engine = python.Engine() + mock_fg_api.return_value.update_table_schema.return_value.job = job.Job( + 1, "test_job", None, None, None, None + ) + # Act - result = python_engine.save_empty_dataframe(feature_group=None) + result = python_engine.update_table_schema(feature_group=None) # Assert assert result is None + assert mock_fg_api.return_value.update_table_schema.call_count == 1 def test_get_app_options(self, mocker): # Arrange @@ -3456,6 +3417,88 @@ def test_get_unique_values(self): assert 2 in result assert 3 in result + def test_apply_transformation_function_missing_feature_on_demand_transformations( + self, mocker + ): + # Arrange + mocker.patch("hopsworks_common.client.get_instance") + hopsworks_common.connection._hsfs_engine_type = "python" + python_engine = python.Engine() + + @udf(int) + def add_one(col1): + return col1 + 1 + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + transformation_functions=[add_one("missing_col1")], + id=11, + stream=False, + ) + + df = pd.DataFrame(data={"tf_name": [1, 2]}) + + # Act + with pytest.raises(exceptions.FeatureStoreException) as exception: + python_engine._apply_transformation_function( + transformation_functions=fg.transformation_functions, dataset=df + ) + print(str(exception.value)) + assert ( + str(exception.value) + == "The following feature(s): `missing_col1`, specified in the on-demand transformation function 'add_one' are not present in the dataframe being inserted into the feature group. " + "Please verify that the correct feature names are used in the transformation function and that these features exist in the dataframe being inserted." + ) + + def test_apply_transformation_function_missing_feature_model_dependent_transformations( + self, mocker + ): + # Arrange + mocker.patch("hopsworks_common.client.get_instance") + hopsworks_common.connection._hsfs_engine_type = "python" + python_engine = python.Engine() + + @udf(int) + def add_one(col1): + return col1 + 1 + + fg = feature_group.FeatureGroup( + name="test1", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + features=[feature.Feature("id"), feature.Feature("tf_name")], + id=11, + stream=False, + ) + + fv = feature_view.FeatureView( + name="fv_name", + query=fg.select_all(), + featurestore_id=99, + transformation_functions=[add_one("missing_col1")], + ) + + df = pd.DataFrame(data={"tf_name": [1, 2]}) + + # Act + with pytest.raises(exceptions.FeatureStoreException) as exception: + python_engine._apply_transformation_function( + transformation_functions=fv.transformation_functions, dataset=df + ) + print(str(exception.value)) + assert ( + str(exception.value) + == "The following feature(s): `missing_col1`, specified in the model-dependent transformation function 'add_one' are not present in the feature view. " + "Please verify that the correct features are specified in the transformation function." 
+ ) + def test_materialization_kafka(self, mocker): # Arrange mocker.patch("hsfs.core.kafka_engine.get_kafka_config", return_value={}) @@ -3526,7 +3569,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): mocker.patch("hsfs.util.get_job_url") mocker.patch( "hsfs.core.kafka_engine.kafka_get_offsets", - return_value=" tests_offsets", + return_value="tests_offsets", ) mocker.patch( "hsfs.core.job_api.JobApi.last_execution", @@ -3568,7 +3611,7 @@ def test_materialization_kafka_first_job_execution(self, mocker): # Assert assert mock_python_engine_kafka_produce.call_count == 4 job_mock.run.assert_called_once_with( - args="defaults tests_offsets", + args="defaults -initialCheckPointString tests_offsets", await_termination=False, ) @@ -3584,7 +3627,7 @@ def test_materialization_kafka_skip_offsets(self, mocker): mocker.patch("hsfs.util.get_job_url") mocker.patch( "hsfs.core.kafka_engine.kafka_get_offsets", - return_value=" tests_offsets", + return_value="tests_offsets", ) mocker.patch("hopsworks_common.client.get_instance") @@ -3625,7 +3668,7 @@ def test_materialization_kafka_skip_offsets(self, mocker): # Assert assert mock_python_engine_kafka_produce.call_count == 4 job_mock.run.assert_called_once_with( - args="defaults tests_offsets", + args="defaults -initialCheckPointString tests_offsets", await_termination=False, ) @@ -3641,7 +3684,7 @@ def test_materialization_kafka_topic_doesnt_exist(self, mocker): mocker.patch("hsfs.util.get_job_url") mocker.patch( "hsfs.core.kafka_engine.kafka_get_offsets", - side_effect=["", " tests_offsets"], + side_effect=["", "tests_offsets"], ) mocker.patch("hopsworks_common.client.get_instance") @@ -3679,7 +3722,7 @@ def test_materialization_kafka_topic_doesnt_exist(self, mocker): # Assert assert mock_python_engine_kafka_produce.call_count == 4 job_mock.run.assert_called_once_with( - args="defaults tests_offsets", + args="defaults -initialCheckPointString tests_offsets", await_termination=False, ) diff --git a/python/tests/engine/test_spark.py b/python/tests/engine/test_spark.py index fb3f6e08f..da3449270 100644 --- a/python/tests/engine/test_spark.py +++ b/python/tests/engine/test_spark.py @@ -15,6 +15,8 @@ # from __future__ import annotations +from unittest.mock import call + import hopsworks_common import numpy import pandas as pd @@ -39,6 +41,7 @@ from hsfs.training_dataset_feature import TrainingDatasetFeature from hsfs.transformation_function import TransformationType from pyspark.sql import DataFrame +from pyspark.sql.functions import lit from pyspark.sql.types import ( ArrayType, BinaryType, @@ -202,6 +205,9 @@ def test_register_hudi_temporary_table(self, mocker): # Arrange mock_hudi_engine = mocker.patch("hsfs.core.hudi_engine.HudiEngine") mocker.patch("hsfs.feature_group.FeatureGroup.from_response_json") + mock_reconcile_schema = mocker.patch( + "hsfs.engine.spark.Engine.reconcile_schema" + ) spark_engine = spark.Engine() @@ -219,6 +225,33 @@ def test_register_hudi_temporary_table(self, mocker): # Assert assert mock_hudi_engine.return_value.register_temporary_table.call_count == 1 + assert mock_reconcile_schema.call_count == 1 + + def test_register_delta_temporary_table(self, mocker): + # Arrange + mock_delta_engine = mocker.patch("hsfs.core.delta_engine.DeltaEngine") + mocker.patch("hsfs.feature_group.FeatureGroup.from_response_json") + mock_reconcile_schema = mocker.patch( + "hsfs.engine.spark.Engine.reconcile_schema" + ) + + spark_engine = spark.Engine() + + hudi_fg_alias = hudi_feature_group_alias.HudiFeatureGroupAlias( + 
feature_group=None, alias=None + ) + + # Act + spark_engine.register_delta_temporary_table( + delta_fg_alias=hudi_fg_alias, + feature_store_id=None, + feature_store_name=None, + read_options=None, + ) + + # Assert + assert mock_delta_engine.return_value.register_temporary_table.call_count == 1 + assert mock_reconcile_schema.call_count == 1 def test_return_dataframe_type_default(self, mocker): # Arrange @@ -605,51 +638,6 @@ def test_save_dataframe(self, mocker): assert mock_spark_engine_save_online_dataframe.call_count == 0 assert mock_spark_engine_save_offline_dataframe.call_count == 1 - def test_save_dataframe_transformations(self, mocker): - # Arrange - mock_spark_engine_save_online_dataframe = mocker.patch( - "hsfs.engine.spark.Engine._save_online_dataframe" - ) - mock_spark_engine_save_offline_dataframe = mocker.patch( - "hsfs.engine.spark.Engine._save_offline_dataframe" - ) - mock_spark_engine_apply_transformations = mocker.patch( - "hsfs.engine.spark.Engine._apply_transformation_function" - ) - - spark_engine = spark.Engine() - - @udf(int) - def test(feature): - return feature + 1 - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - transformation_functions=[test], - ) - - # Act - spark_engine.save_dataframe( - feature_group=fg, - dataframe=None, - operation=None, - online_enabled=None, - storage=None, - offline_write_options=None, - online_write_options=None, - validation_id=None, - ) - - # Assert - assert mock_spark_engine_save_online_dataframe.call_count == 0 - assert mock_spark_engine_save_offline_dataframe.call_count == 1 - assert mock_spark_engine_apply_transformations.call_count == 1 - def test_save_dataframe_storage_offline(self, mocker): # Arrange mock_spark_engine_save_online_dataframe = mocker.patch( @@ -873,130 +861,8 @@ def test_save_stream_dataframe(self, mocker, backend_fixtures): "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" - ) - - mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") - mock_engine_get_instance.return_value.add_file.return_value = ( - "result_from_add_file" - ) - - mock_storage_connector_api = mocker.patch( - "hsfs.core.storage_connector_api.StorageConnectorApi" - ) - json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] - sc = storage_connector.StorageConnector.from_response_json(json) - mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc - - spark_engine = spark.Engine() - - fg = feature_group.FeatureGroup( - name="test", - version=1, - featurestore_id=99, - primary_key=[], - partition_key=[], - id=10, - online_topic_name="test_online_topic_name", - ) - fg.feature_store = mocker.Mock() - project_id = 1 - fg.feature_store.project_id = project_id - - mock_common_client_get_instance.return_value._project_name = "test_project_name" - - # Act - spark_engine.save_stream_dataframe( - feature_group=fg, - dataframe=None, - query_name=None, - output_mode="test_mode", - await_termination=None, - timeout=None, - checkpoint_dir=None, - write_options={"test_name": "test_value"}, - ) - - # Assert - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] - == "headers" - ) - assert ( - 
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ - 0 - ][0] - == "test_mode" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ - 0 - ][0] - == "kafka" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ - 0 - ][0] - == "checkpointLocation" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ - 0 - ][1] - == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ - 1 - ] - == { - "kafka.bootstrap.servers": "test_bootstrap_servers", - "kafka.security.protocol": "test_security_protocol", - "kafka.ssl.endpoint.identification.algorithm": "test_ssl_endpoint_identification_algorithm", - "kafka.ssl.key.password": "test_ssl_key_password", - "kafka.ssl.keystore.location": "result_from_add_file", - "kafka.ssl.keystore.password": "test_ssl_keystore_password", - "kafka.ssl.truststore.location": "result_from_add_file", - "kafka.ssl.truststore.password": "test_ssl_truststore_password", - "kafka.test_option_name": "test_option_value", - "test_name": "test_value", - } - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ - 0 - ][0] - == "topic" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ - 0 - ][1] - == "test_online_topic_name" - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ - 0 - ][0] - == self._get_spark_query_name(project_id, fg) - ) - assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count - == 0 - ) - - def test_save_stream_dataframe_transformations(self, mocker, backend_fixtures): - # Arrange - mock_common_client_get_instance = mocker.patch( - "hopsworks_common.client.get_instance" - ) - mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1007,21 +873,12 @@ def test_save_stream_dataframe_transformations(self, mocker, backend_fixtures): mock_storage_connector_api = mocker.patch( "hsfs.core.storage_connector_api.StorageConnectorApi" ) - - mock_spark_engine_apply_transformations = mocker.patch( - "hsfs.engine.spark.Engine._apply_transformation_function" - ) - 
json = backend_fixtures["storage_connector"]["get_kafka_external"]["response"] sc = storage_connector.StorageConnector.from_response_json(json) mock_storage_connector_api.return_value.get_kafka_connector.return_value = sc spark_engine = spark.Engine() - @udf(int) - def test(feature): - return feature + 1 - fg = feature_group.FeatureGroup( name="test", version=1, @@ -1030,7 +887,6 @@ def test(feature): partition_key=[], id=10, online_topic_name="test_online_topic_name", - transformation_functions=[test], ) fg.feature_store = mocker.Mock() project_id = 1 @@ -1052,35 +908,35 @@ def test(feature): # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1097,28 +953,27 @@ def test(feature): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + 
mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == self._get_spark_query_name(project_id, fg) ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 0 ) - assert mock_spark_engine_apply_transformations.call_count == 1 def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Arrange @@ -1126,9 +981,8 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1172,35 +1026,35 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == "/Projects/test_project_name/Resources/test_query_name-checkpoint" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1217,25 +1071,25 @@ def test_save_stream_dataframe_query_name(self, mocker, backend_fixtures): } ) assert ( - 
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == "test_query_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 0 ) @@ -1251,9 +1105,8 @@ def test_save_stream_dataframe_checkpoint_dir(self, mocker, backend_fixtures): "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1299,35 +1152,35 @@ def test_save_stream_dataframe_checkpoint_dir(self, mocker, backend_fixtures): # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" 
) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == "test_checkpoint_dir" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1344,25 +1197,25 @@ def test_save_stream_dataframe_checkpoint_dir(self, mocker, backend_fixtures): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == self._get_spark_query_name(project_id, fg) ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 0 ) @@ -1372,9 +1225,8 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) "hopsworks_common.client.get_instance" ) mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1420,35 +1272,35 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) # Assert assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - 
mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.call_args[ 0 ][0] == "test_mode" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][0] == "checkpointLocation" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.call_args[ 0 ][1] == f"/Projects/test_project_name/Resources/{self._get_spark_query_name(project_id, fg)}-checkpoint" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.call_args[ 1 ] == { @@ -1465,29 +1317,29 @@ def test_save_stream_dataframe_await_termination(self, mocker, backend_fixtures) } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.call_args[ 0 ][0] == self._get_spark_query_name(project_id, fg) ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count + 
mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_count == 1 ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.writeStream.outputMode.return_value.format.return_value.option.return_value.options.return_value.option.return_value.queryName.return_value.start.return_value.awaitTermination.call_args[ 0 ][0] == 123 @@ -1630,9 +1482,8 @@ def test_save_online_dataframe(self, mocker, backend_fixtures): # Arrange mocker.patch("hopsworks_common.client.get_instance") mocker.patch("hopsworks_common.client._is_external", return_value=False) - mocker.patch("hsfs.engine.spark.Engine._encode_complex_features") - mock_spark_engine_online_fg_to_avro = mocker.patch( - "hsfs.engine.spark.Engine._online_fg_to_avro" + mock_spark_engine_serialize_to_avro = mocker.patch( + "hsfs.engine.spark.Engine._serialize_to_avro" ) mock_engine_get_instance = mocker.patch("hsfs.engine.get_instance") @@ -1668,19 +1519,19 @@ def test_save_online_dataframe(self, mocker, backend_fixtures): ) # Assert - assert mock_spark_engine_online_fg_to_avro.call_count == 1 + assert mock_spark_engine_serialize_to_avro.call_count == 1 assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.call_args[0][0] + mock_spark_engine_serialize_to_avro.return_value.withColumn.call_args[0][0] == "headers" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.call_args[ 0 ][0] == "kafka" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.call_args[ 1 ] == { @@ -1697,37 +1548,40 @@ def test_save_online_dataframe(self, mocker, backend_fixtures): } ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ 0 ][0] == "topic" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.call_args[ 0 ][1] == "test_online_topic_name" ) assert ( - mock_spark_engine_online_fg_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.return_value.save.call_count + mock_spark_engine_serialize_to_avro.return_value.withColumn.return_value.write.format.return_value.options.return_value.option.return_value.save.call_count == 1 ) - def test_encode_complex_features(self, mocker): + def test_serialize_to_avro(self, mocker): # Arrange - mocker.patch("hopsworks_common.client.get_instance") - mocker.patch( - 
"hsfs.feature_group.FeatureGroup.get_complex_features", - return_value=["col_1"], - ) - mocker.patch("hsfs.feature_group.FeatureGroup._get_feature_avro_schema") - spark_engine = spark.Engine() - d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]} - df = pd.DataFrame(data=d) + mock_to_avro = mocker.patch("hsfs.engine.spark.to_avro") + mock_to_avro.return_value = lit(b"111") - spark_df = spark_engine._spark_session.createDataFrame(df) + fg_data = [] + fg_data.append(("ekarson", ["GRAVITY RUSH 2", "KING'S QUEST"])) + fg_data.append(("ratmilkdrinker", ["NBA 2K", "CALL OF DUTY"])) + pandas_df = pd.DataFrame(fg_data, columns=["account_id", "last_played_games"]) + + df = spark_engine._spark_session.createDataFrame(pandas_df) + + features = [ + feature.Feature(name="account_id", type="str"), + feature.Feature(name="last_played_games", type="array"), + ] fg = feature_group.FeatureGroup( name="test", @@ -1736,37 +1590,44 @@ def test_encode_complex_features(self, mocker): primary_key=[], partition_key=[], id=10, + features=features, ) - fg._subject = {"schema": '{"fields": [{"name": "col_0"}]}'} - - expected = pd.DataFrame(data={"col_0": ["test_1", "test_2"]}) + fg._subject = { + "id": 1025, + "subject": "fg_1", + "version": 1, + "schema": '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]}]}', + } # Act - result = spark_engine._encode_complex_features( + serialized_df = spark_engine._serialize_to_avro( feature_group=fg, - dataframe=spark_df, + dataframe=df, ) # Assert - result_df = result.toPandas() - assert list(result_df) == list(expected) - for column in list(result_df): - assert result_df[column].equals(expected[column]) - - def test_encode_complex_features_col_in_complex_features(self, mocker): - # Arrange - mocker.patch( - "hsfs.feature_group.FeatureGroup.get_complex_features", - return_value=["col_0"], + assert ( + serialized_df.schema.json() + == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' ) - mocker.patch("hsfs.feature_group.FeatureGroup._get_feature_avro_schema") + """ Need spark to run these tests properly + def test_deserialize_from_avro(self, mocker): + # Arrange spark_engine = spark.Engine() - d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]} - df = pd.DataFrame(data=d) + data = [] + data.append((b"2121", b"21212121")) + data.append((b"1212", b"12121212")) + pandas_df = pd.DataFrame(data, columns =["key", "value"]) - spark_df = spark_engine._spark_session.createDataFrame(df) + df = spark_engine._spark_session.createDataFrame(pandas_df) + + features = [ + feature.Feature(name="account_id", type="str"), + feature.Feature(name="last_played_games", type="array"), + feature.Feature(name="event_time", type="timestamp"), + ] fg = feature_group.FeatureGroup( name="test", @@ -1775,29 +1636,42 @@ def test_encode_complex_features_col_in_complex_features(self, mocker): primary_key=[], partition_key=[], id=10, + features=features, ) - fg._subject = {"schema": '{"fields": [{"name": "col_0"}]}'} + fg._subject = { + 'id': 1025, + 'subject': 'fg_1', + 'version': 1, + 'schema': 
'{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]},{"name":"event_time","type":["null",{"type":"long","logicalType":"timestamp-micros"}]}]}' + } # Act - with pytest.raises( - TypeError - ) as e_info: # todo look into this (to_avro has to be mocked) - spark_engine._encode_complex_features( - feature_group=fg, - dataframe=spark_df, - ) + deserialized_df = spark_engine._deserialize_from_avro( + feature_group=fg, + dataframe=df, + ) # Assert - assert str(e_info.value) == "'JavaPackage' object is not callable" + assert deserialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"account_id","nullable":true,"type":"string"},{"metadata":{},"name":"last_played_games","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"event_time","nullable":true,"type":"timestamp"}],"type":"struct"}' - def test_online_fg_to_avro(self): + def test_serialize_deserialize_avro(self, mocker): # Arrange spark_engine = spark.Engine() - d = {"col_0": ["test_1", "test_2"], "col_1": ["test_1", "test_2"]} - df = pd.DataFrame(data=d) + now = datetime.datetime.now() - spark_df = spark_engine._spark_session.createDataFrame(df) + fg_data = [] + fg_data.append(("ekarson", ["GRAVITY RUSH 2", "KING'S QUEST"], pd.Timestamp(now.timestamp()))) + fg_data.append(("ratmilkdrinker", ["NBA 2K", "CALL OF DUTY"], pd.Timestamp(now.timestamp()))) + pandas_df = pd.DataFrame(fg_data, columns =["account_id", "last_played_games", "event_time"]) + + df = spark_engine._spark_session.createDataFrame(pandas_df) + + features = [ + feature.Feature(name="account_id", type="str"), + feature.Feature(name="last_played_games", type="array"), + feature.Feature(name="event_time", type="timestamp"), + ] fg = feature_group.FeatureGroup( name="test", @@ -1806,20 +1680,31 @@ def test_online_fg_to_avro(self): primary_key=[], partition_key=[], id=10, + features=features, ) - fg._avro_schema = '{"fields": [{"name": "col_0"}]}' + fg._subject = { + 'id': 1025, + 'subject': 'fg_1', + 'version': 1, + 'schema': '{"type":"record","name":"fg_1","namespace":"test_featurestore.db","fields":[{"name":"account_id","type":["null","string"]},{"name":"last_played_games","type":["null",{"type":"array","items":["null","string"]}]},{"name":"event_time","type":["null",{"type":"long","logicalType":"timestamp-micros"}]}]}' + } # Act - with pytest.raises( - TypeError - ) as e_info: # todo look into this (to_avro has to be mocked) - spark_engine._online_fg_to_avro( - feature_group=fg, - dataframe=spark_df, - ) + serialized_df = spark_engine._serialize_to_avro( + feature_group=fg, + dataframe=df, + ) + + deserialized_df = spark_engine._deserialize_from_avro( + feature_group=fg, + dataframe=serialized_df, + ) # Assert - assert str(e_info.value) == "'JavaPackage' object is not callable" + assert serialized_df.schema.json() == '{"fields":[{"metadata":{},"name":"key","nullable":false,"type":"binary"},{"metadata":{},"name":"value","nullable":false,"type":"binary"}],"type":"struct"}' + assert df.schema == deserialized_df.schema + assert df.collect() == deserialized_df.collect() + """ def test_get_training_data(self, mocker): # Arrange @@ -4389,6 +4274,109 @@ def test_setup_s3_hadoop_conf_legacy(self, mocker): "fs.s3a.endpoint", s3_connector.arguments.get("fs.s3a.endpoint") ) + def test_setup_s3_hadoop_conf_disable_legacy(self, mocker): + # Arrange + mock_pyspark_getOrCreate = 
mocker.patch( + "pyspark.sql.session.SparkSession.builder.getOrCreate" + ) + + spark_engine = spark.Engine() + + s3_connector = storage_connector.S3Connector( + id=1, + name="test_connector", + featurestore_id=99, + bucket="bucket-name", + access_key="1", + secret_key="2", + server_encryption_algorithm="3", + server_encryption_key="4", + session_token="5", + arguments=[ + {"name": "fs.s3a.endpoint", "value": "testEndpoint"}, + {"name": "fs.s3a.global-conf", "value": "False"}, + ], + ) + + # Act + result = spark_engine._setup_s3_hadoop_conf( + storage_connector=s3_connector, + path="s3://_test_path", + ) + + # Assert + assert result == "s3a://_test_path" + assert ( + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.call_count + == 7 # Options should only be set at bucket level + ) + assert ( + call("fs.s3a.access.key", s3_connector.access_key) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + assert ( + call("fs.s3a.secret.key", s3_connector.secret_key) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + assert ( + call( + "fs.s3a.server-side-encryption-algorithm", + s3_connector.server_encryption_algorithm, + ) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call( + "fs.s3a.server-side-encryption-key", s3_connector.server_encryption_key + ) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call( + "fs.s3a.aws.credentials.provider", + "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider", + ) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call("fs.s3a.session.token", s3_connector.session_token) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + assert ( + call("fs.s3a.endpoint", s3_connector.arguments.get("fs.s3a.endpoint")) + not in mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.mock_calls + ) + + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.access.key", s3_connector.access_key + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.secret.key", s3_connector.secret_key + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.server-side-encryption-algorithm", + s3_connector.server_encryption_algorithm, + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.server-side-encryption-key", + s3_connector.server_encryption_key, + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.aws.credentials.provider", + "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider", + ) + mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.session.token", s3_connector.session_token + ) + 
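        # Editor's sketch, not part of this change set: the per-bucket assertions in this
        # test assume that when the connector argument "fs.s3a.global-conf" is "False",
        # _setup_s3_hadoop_conf writes every S3A property under the Hadoop per-bucket
        # namespace instead of the global one, roughly:
        #
        #     scoped = "fs.s3a." if global_conf else f"fs.s3a.bucket.{s3_connector.bucket}."
        #     hadoop_conf.set(scoped + "access.key", s3_connector.access_key)
        #     hadoop_conf.set(scoped + "secret.key", s3_connector.secret_key)
        #
        # The variable names above are illustrative only; what the assertions actually check
        # is the resulting property keys ("fs.s3a.bucket.<bucket>.<option>", Hadoop's
        # standard per-bucket S3A configuration convention) and that no global keys are set.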
mock_pyspark_getOrCreate.return_value.sparkContext._jsc.hadoopConfiguration.return_value.set.assert_any_call( + "fs.s3a.bucket.bucket-name.endpoint", + s3_connector.arguments.get("fs.s3a.endpoint"), + ) + def test_setup_s3_hadoop_conf_bucket_scope(self, mocker): # Arrange mock_pyspark_getOrCreate = mocker.patch( @@ -4514,7 +4502,7 @@ def test_is_spark_dataframe_spark_dataframe(self): # Assert assert result is True - def test_save_empty_dataframe(self, mocker): + def test_update_table_schema_hudi(self, mocker): # Arrange mock_spark_engine_save_dataframe = mocker.patch( "hsfs.engine.spark.Engine.save_dataframe" @@ -4534,15 +4522,42 @@ def test_save_empty_dataframe(self, mocker): partition_key=[], id=10, featurestore_name="test_featurestore", + time_travel_format="HUDI", ) # Act - spark_engine.save_empty_dataframe(feature_group=fg) + spark_engine.update_table_schema(feature_group=fg) # Assert assert mock_spark_engine_save_dataframe.call_count == 1 assert mock_spark_read.format.call_count == 1 + def test_update_table_schema_delta(self, mocker): + # Arrange + mock_spark_read = mocker.patch("pyspark.sql.SparkSession.read") + mock_format = mocker.Mock() + mock_spark_read.format.return_value = mock_format + + # Arrange + spark_engine = spark.Engine() + + fg = feature_group.FeatureGroup( + name="test", + version=1, + featurestore_id=99, + primary_key=[], + partition_key=[], + id=10, + featurestore_name="test_featurestore", + time_travel_format="DELTA", + ) + + # Act + spark_engine.update_table_schema(feature_group=fg) + + # Assert + assert mock_spark_read.format.call_count == 1 + def test_apply_transformation_function_single_output_udf_default_mode(self, mocker): # Arrange mocker.patch("hopsworks_common.client.get_instance") diff --git a/python/tests/fixtures/model_fixtures.json b/python/tests/fixtures/model_fixtures.json index 40c0b8002..a937eab40 100644 --- a/python/tests/fixtures/model_fixtures.json +++ b/python/tests/fixtures/model_fixtures.json @@ -16,7 +16,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -42,7 +41,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -69,7 +67,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -96,7 +93,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -123,7 +119,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -133,6 +128,32 @@ ] } }, + "get_llm": { + "response": { + "count": 1, + "items": [ + { + "id": "5", + "name": "llmmodel", + "version": 0, + "created": "created", + "creator": "creator", + "environment": "environment.yml", + "description": "description", + "project_name": "myproject", + "metrics": { "acc": 0.7 }, + "program": "program", + "user_full_name": "Full Name", + "model_schema": "model_schema.json", + "input_example": 
"input_example.json", + "model_registry_id": 1, + "tags": [], + "framework": "LLM", + "href": "test_href" + } + ] + } + }, "get_list": { "response": { "count": 2, @@ -150,7 +171,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], @@ -170,7 +190,6 @@ "program": "program", "user_full_name": "Full Name", "model_schema": "model_schema.json", - "training_dataset": "training_dataset", "input_example": "input_example.json", "model_registry_id": 1, "tags": [], diff --git a/python/tests/fixtures/model_fixtures.py b/python/tests/fixtures/model_fixtures.py index 32fe396de..9b3796d05 100644 --- a/python/tests/fixtures/model_fixtures.py +++ b/python/tests/fixtures/model_fixtures.py @@ -17,6 +17,7 @@ import numpy as np import pandas as pd import pytest +from hsml.llm.model import Model as LLMModel from hsml.model import Model as BaseModel from hsml.python.model import Model as PythonModel from hsml.sklearn.model import Model as SklearnModel @@ -29,12 +30,14 @@ MODEL_SKLEARN_ID = 2 MODEL_TENSORFLOW_ID = 3 MODEL_TORCH_ID = 4 +MODEL_LLM_ID = 5 MODEL_BASE_NAME = "basemodel" MODEL_PYTHON_NAME = "pythonmodel" MODEL_SKLEARN_NAME = "sklearnmodel" MODEL_TENSORFLOW_NAME = "tensorflowmodel" MODEL_TORCH_NAME = "torchmodel" +MODEL_LLM_NAME = "llmmodel" # models @@ -63,6 +66,10 @@ def model_tensorflow(): def model_torch(): return TorchModel(MODEL_TORCH_ID, MODEL_TORCH_NAME) +@pytest.fixture +def model_llm(): + return LLMModel(MODEL_LLM_ID, MODEL_LLM_NAME) + # input example diff --git a/python/tests/test_constants.py b/python/tests/test_constants.py index 7a923d8d8..3c03263bf 100644 --- a/python/tests/test_constants.py +++ b/python/tests/test_constants.py @@ -38,6 +38,7 @@ def test_model_framework_constants(self): "FRAMEWORK_TORCH": "TORCH", "FRAMEWORK_PYTHON": "PYTHON", "FRAMEWORK_SKLEARN": "SKLEARN", + "FRAMEWORK_LLM": "LLM", } # Assert @@ -52,26 +53,29 @@ def test_model_framework_constants(self): def test_model_registry_constants(self): # Arrange - hopsfs_mount_prefix = {"HOPSFS_MOUNT_PREFIX": "/hopsfs/"} + model_registry = { + "HOPSFS_MOUNT_PREFIX": "/hopsfs/", + "MODEL_FILES_DIR_NAME": "Files", + } # Assert self._check_added_modified_or_removed_values( constants.MODEL_REGISTRY, - num_values=len(hopsfs_mount_prefix), - expected_constants=hopsfs_mount_prefix, + num_values=len(model_registry), + expected_constants=model_registry, ) # MODEL_SERVING def test_model_serving_constants(self): # Arrange - models_dataset = {"MODELS_DATASET": "Models"} + model_serving = {"MODELS_DATASET": "Models", "ARTIFACTS_DIR_NAME": "Artifacts"} # Assert self._check_added_modified_or_removed_values( constants.MODEL_SERVING, - num_values=len(models_dataset), - expected_constants=models_dataset, + num_values=len(model_serving), + expected_constants=model_serving, ) # ARTIFACT_VERSION @@ -193,6 +197,7 @@ def test_predictor_model_server_constants(self): model_servers = { "MODEL_SERVER_PYTHON": "PYTHON", "MODEL_SERVER_TF_SERVING": "TENSORFLOW_SERVING", + "MODEL_SERVER_VLLM": "VLLM", } # Assert diff --git a/python/tests/test_deployment.py b/python/tests/test_deployment.py index 4ff91eadd..d9494fe62 100644 --- a/python/tests/test_deployment.py +++ b/python/tests/test_deployment.py @@ -145,7 +145,7 @@ def test_save_default(self, mocker, backend_fixtures): d.save() # Assert - mock_serving_engine_save.assert_called_once_with(d, 60) + mock_serving_engine_save.assert_called_once_with(d, 120) 
def test_save(self, mocker, backend_fixtures): # Arrange @@ -176,7 +176,7 @@ def test_start_default(self, mocker, backend_fixtures): d.start() # Assert - mock_serving_engine_start.assert_called_once_with(d, await_status=60) + mock_serving_engine_start.assert_called_once_with(d, await_status=120) def test_start(self, mocker, backend_fixtures): # Arrange @@ -207,7 +207,7 @@ def test_stop_default(self, mocker, backend_fixtures): d.stop() # Assert - mock_serving_engine_stop.assert_called_once_with(d, await_status=60) + mock_serving_engine_stop.assert_called_once_with(d, await_status=120) def test_stop(self, mocker, backend_fixtures): # Arrange @@ -617,15 +617,17 @@ def test_download_artifact(self, mocker, backend_fixtures): # Arrange p = self._get_dummy_predictor(mocker, backend_fixtures) d = deployment.Deployment(predictor=p) - mock_serving_engine_download_artifact = mocker.patch( - "hsml.engine.serving_engine.ServingEngine.download_artifact" + mock_serving_engine_download_artifact_files = mocker.patch( + "hsml.engine.serving_engine.ServingEngine.download_artifact_files" ) # Act - d.download_artifact() + d.download_artifact_files() # Assert - mock_serving_engine_download_artifact.assert_called_once_with(d) + mock_serving_engine_download_artifact_files.assert_called_once_with( + d, local_path=None + ) # get logs diff --git a/python/tests/test_feature_group.py b/python/tests/test_feature_group.py index 5e01b5a10..ea25bbff3 100644 --- a/python/tests/test_feature_group.py +++ b/python/tests/test_feature_group.py @@ -928,6 +928,7 @@ def test_prepare_spark_location_with_s3_connector(self, mocker, backend_fixtures # Arrange engine = spark.Engine() engine_instance = mocker.patch("hsfs.engine.get_instance", return_value=engine) + refetch_api = mocker.patch("hsfs.storage_connector.S3Connector.refetch") json = backend_fixtures["feature_group"]["get_basic_info"]["response"] fg = feature_group.FeatureGroup.from_response_json(json) fg._location = f"{fg.name}_{fg.version}" @@ -939,11 +940,13 @@ def test_prepare_spark_location_with_s3_connector(self, mocker, backend_fixtures # Assert assert fg.location == path engine_instance.assert_called_once() + refetch_api.assert_called_once() def test_prepare_spark_location_with_s3_connector_python(self, mocker, backend_fixtures): # Arrange engine = python.Engine() engine_instance = mocker.patch("hsfs.engine.get_instance", return_value=engine) + mocker.patch("hsfs.storage_connector.S3Connector.refetch") json = backend_fixtures["feature_group"]["get_basic_info"]["response"] fg = feature_group.FeatureGroup.from_response_json(json) fg._location = f"{fg.name}_{fg.version}" diff --git a/python/tests/test_model.py b/python/tests/test_model.py index b430afd53..44ec19b5b 100644 --- a/python/tests/test_model.py +++ b/python/tests/test_model.py @@ -138,6 +138,19 @@ def test_constructor_torch(self, mocker, backend_fixtures): # Assert self.assert_model(mocker, m, json, MODEL.FRAMEWORK_TORCH) + def test_constructor_llm(self, mocker, backend_fixtures): + # Arrange + json = backend_fixtures["model"]["get_llm"]["response"]["items"][0] + m_json = copy.deepcopy(json) + id = m_json.pop("id") + name = m_json.pop("name") + + # Act + m = model.Model(id=id, name=name, **m_json) + + # Assert + self.assert_model(mocker, m, json, MODEL.FRAMEWORK_LLM) + # save def test_save(self, mocker, backend_fixtures): @@ -253,7 +266,9 @@ def test_download(self, mocker, backend_fixtures): m.download() # Assert - mock_model_engine_download.assert_called_once_with(model_instance=m) + 
mock_model_engine_download.assert_called_once_with( + model_instance=m, local_path=None + ) # tags @@ -357,7 +372,6 @@ def assert_model(self, mocker, m, m_json, model_framework): assert m.project_name == m_json["project_name"] assert m.training_metrics == m_json["metrics"] assert m._user_full_name == m_json["user_full_name"] - assert m.training_dataset == m_json["training_dataset"] assert m.model_registry_id == m_json["model_registry_id"] if model_framework is None: diff --git a/python/tests/test_predictor.py b/python/tests/test_predictor.py index 658e9d8fc..a48c3d877 100644 --- a/python/tests/test_predictor.py +++ b/python/tests/test_predictor.py @@ -344,6 +344,14 @@ def test_validate_script_file_py_none(self): # Assert assert "Predictor scripts are required" in str(e_info.value) + def test_validate_script_file_llm_none(self): + # Act + with pytest.raises(ValueError) as e_info: + _ = predictor.Predictor._validate_script_file(MODEL.FRAMEWORK_LLM, None) + + # Assert + assert "Predictor scripts are required" in str(e_info.value) + def test_validate_script_file_tf_script_file(self): # Act predictor.Predictor._validate_script_file( @@ -364,6 +372,10 @@ def test_validate_script_file_py_script_file(self): # Act predictor.Predictor._validate_script_file(MODEL.FRAMEWORK_PYTHON, "script_file") + def test_validate_script_file_llm_script_file(self): + # Act + predictor.Predictor._validate_script_file(MODEL.FRAMEWORK_LLM, "script_file") + # infer model server def test_infer_model_server_tf(self): @@ -394,6 +406,13 @@ def test_infer_model_server_py(self): # Assert assert ms == PREDICTOR.MODEL_SERVER_PYTHON + def test_infer_model_server_llm(self): + # Act + ms = predictor.Predictor._infer_model_server(MODEL.FRAMEWORK_LLM) + + # Assert + assert ms == PREDICTOR.MODEL_SERVER_VLLM + # default serving tool def test_get_default_serving_tool_kserve_installed(self, mocker): diff --git a/python/tests/test_util.py b/python/tests/test_util.py index ce29ec34d..f92358755 100644 --- a/python/tests/test_util.py +++ b/python/tests/test_util.py @@ -28,6 +28,8 @@ from hopsworks_common.core.constants import HAS_AIOMYSQL, HAS_SQLALCHEMY from hsfs.embedding import EmbeddingFeature, EmbeddingIndex from hsfs.feature import Feature +from hsml.llm.model import Model as LLMModel +from hsml.llm.predictor import Predictor as LLMPredictor from hsml.model import Model as BaseModel from hsml.predictor import Predictor as BasePredictor from hsml.python.model import Model as PythonModel @@ -105,6 +107,17 @@ def test_set_model_class_torch(self, backend_fixtures): assert isinstance(model, TorchModel) assert model.framework == MODEL.FRAMEWORK_TORCH + def test_set_model_class_llm(self, backend_fixtures): + # Arrange + json = backend_fixtures["model"]["get_llm"]["response"]["items"][0] + + # Act + model = util.set_model_class(json) + + # Assert + assert isinstance(model, LLMModel) + assert model.framework == MODEL.FRAMEWORK_LLM + def test_set_model_class_unsupported(self, backend_fixtures): # Arrange json = backend_fixtures["model"]["get_base"]["response"]["items"][0] @@ -385,6 +398,7 @@ def pred_base_spec(model_framework, model_server): pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_base) @@ -398,6 +412,7 @@ def pred_base_spec(model_framework, 
model_server): pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_python(self, mocker, model_python): # Arrange @@ -408,6 +423,7 @@ def test_get_predictor_for_model_python(self, mocker, model_python): pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_python) @@ -419,6 +435,7 @@ def test_get_predictor_for_model_python(self, mocker, model_python): pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_sklearn(self, mocker, model_sklearn): # Arrange @@ -429,6 +446,7 @@ def test_get_predictor_for_model_sklearn(self, mocker, model_sklearn): ) pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_sklearn) @@ -440,6 +458,7 @@ def test_get_predictor_for_model_sklearn(self, mocker, model_sklearn): pred_sklearn.assert_called_once() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_tensorflow(self, mocker, model_tensorflow): # Arrange @@ -450,6 +469,7 @@ def test_get_predictor_for_model_tensorflow(self, mocker, model_tensorflow): "hsml.tensorflow.predictor.Predictor.__init__", return_value=None ) pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_tensorflow) @@ -461,6 +481,7 @@ def test_get_predictor_for_model_tensorflow(self, mocker, model_tensorflow): pred_sklearn.assert_not_called() pred_tensorflow.assert_called_once() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_predictor_for_model_torch(self, mocker, model_torch): # Arrange @@ -471,6 +492,7 @@ def test_get_predictor_for_model_torch(self, mocker, model_torch): pred_torch = mocker.patch( "hsml.torch.predictor.Predictor.__init__", return_value=None ) + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") # Act predictor = util.get_predictor_for_model(model_torch) @@ -482,6 +504,30 @@ def test_get_predictor_for_model_torch(self, mocker, model_torch): pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_called_once() + pred_llm.assert_not_called() + + def test_get_predictor_for_model_llm(self, mocker, model_llm): + # Arrange + pred_base = mocker.patch("hsml.predictor.Predictor.__init__") + pred_python = mocker.patch("hsml.python.predictor.Predictor.__init__") + pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") + pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") + pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch( + "hsml.llm.predictor.Predictor.__init__", return_value=None + ) + + # Act + predictor = util.get_predictor_for_model(model_llm) + + # Assert + assert isinstance(predictor, LLMPredictor) + pred_base.assert_not_called() + 
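        # Editor's note (hedged): this group of tests assumes util.get_predictor_for_model
        # selects exactly one predictor class from the model's framework, with the new LLM
        # framework mapping to hsml.llm.predictor.Predictor (whose inferred model server,
        # per the predictor tests in this diff, is VLLM). A conceptual sketch of that
        # dispatch; the mapping name and shape are assumptions, not the verified code:
        #
        #     _PREDICTOR_BY_FRAMEWORK = {
        #         MODEL.FRAMEWORK_PYTHON: python_predictor.Predictor,
        #         MODEL.FRAMEWORK_SKLEARN: sklearn_predictor.Predictor,
        #         MODEL.FRAMEWORK_TENSORFLOW: tensorflow_predictor.Predictor,
        #         MODEL.FRAMEWORK_TORCH: torch_predictor.Predictor,
        #         MODEL.FRAMEWORK_LLM: llm_predictor.Predictor,
        #     }
        #
        # The tests only require that the matching predictor's __init__ is called once and
        # that every other predictor's __init__ is never called.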
pred_python.assert_not_called() + pred_sklearn.assert_not_called() + pred_tensorflow.assert_not_called() + pred_torch.assert_not_called() + pred_llm.assert_called_once() def test_get_predictor_for_model_non_base(self, mocker): # Arrange @@ -490,6 +536,7 @@ def test_get_predictor_for_model_non_base(self, mocker): pred_sklearn = mocker.patch("hsml.sklearn.predictor.Predictor.__init__") pred_tensorflow = mocker.patch("hsml.tensorflow.predictor.Predictor.__init__") pred_torch = mocker.patch("hsml.torch.predictor.Predictor.__init__") + pred_llm = mocker.patch("hsml.llm.predictor.Predictor.__init__") class NonBaseModel: pass @@ -506,6 +553,7 @@ class NonBaseModel: pred_sklearn.assert_not_called() pred_tensorflow.assert_not_called() pred_torch.assert_not_called() + pred_llm.assert_not_called() def test_get_hostname_replaced_url(self, mocker): # Arrange diff --git a/python/tests/utils/schema/test_columnar_schema.py b/python/tests/utils/schema/test_columnar_schema.py index c01c3c33d..6ddffea5d 100644 --- a/python/tests/utils/schema/test_columnar_schema.py +++ b/python/tests/utils/schema/test_columnar_schema.py @@ -57,7 +57,7 @@ def test_constructor_default(self, mocker): mock_convert_pandas_series_to_schema.assert_not_called() mock_convert_spark_to_schema.assert_not_called() mock_convert_td_to_schema.assert_not_called() - assert mock_find_spec.call_count == 2 + assert mock_find_spec.call_count == 1 def test_constructor_list(self, mocker): # Arrange @@ -257,7 +257,7 @@ def test_constructor_hsfs_td(self, mocker): mock_convert_pandas_series_to_schema.assert_not_called() mock_convert_spark_to_schema.assert_not_called() mock_convert_td_to_schema.assert_called_once_with(columnar_obj) - assert mock_find_spec.call_count == 2 + assert mock_find_spec.call_count == 1 # convert list to schema diff --git a/utils/java/pom.xml b/utils/java/pom.xml index 196978d6c..a3a3026b4 100644 --- a/utils/java/pom.xml +++ b/utils/java/pom.xml @@ -5,7 +5,7 @@ com.logicalclocks hsfs-utils - 4.1.0-SNAPSHOT + 4.2.0-SNAPSHOT 3.2.0.0-SNAPSHOT diff --git a/utils/python/hsfs_utils.py b/utils/python/hsfs_utils.py index 6b8c49311..3cc1eb615 100644 --- a/utils/python/hsfs_utils.py +++ b/utils/python/hsfs_utils.py @@ -13,12 +13,14 @@ hopsfs = pfs.HadoopFileSystem("default", user=os.environ["HADOOP_USER_NAME"]) from pyspark.sql import SparkSession from pyspark.sql.types import StructField, StructType, _parse_datatype_string +from pyspark.sql.functions import max, expr import hopsworks +from hsfs import engine from hsfs.constructor import query from hsfs.statistics_config import StatisticsConfig -from hsfs.core import feature_monitoring_config_engine, feature_view_engine +from hsfs.core import feature_monitoring_config_engine, feature_view_engine, kafka_engine def read_job_conf(path: str) -> Dict[Any, Any]: @@ -258,6 +260,96 @@ def delta_vacuum_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None: entity.delta_vacuum() +def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], initial_check_point_string: str) -> None: + """ + Run materialization job on a feature group. 
+ """ + feature_store = job_conf.pop("feature_store") + fs = get_feature_store_handle(feature_store) + + entity = fs.get_feature_group(name=job_conf["name"], version=job_conf["version"]) + + read_options = kafka_engine.get_kafka_config( + entity.feature_store_id, {}, engine="spark" + ) + + # get offsets + offset_location = entity.prepare_spark_location() + "/kafka_offsets" + try: + if initial_check_point_string: + offset_string = json.dumps(_build_starting_offsets(initial_check_point_string)) + else: + offset_string = spark.read.json(offset_location).toJSON().first() + except Exception as e: + print(f"An unexpected error occurred: {e}") + # if all else fails read from the beginning + initial_check_point_string = kafka_engine.kafka_get_offsets( + topic_name=entity._online_topic_name, + feature_store_id=entity.feature_store_id, + offline_write_options={}, + high=False, + ) + offset_string = json.dumps(_build_starting_offsets(initial_check_point_string)) + print(f"startingOffsets: {offset_string}") + + # read kafka topic + df = ( + spark.read.format("kafka") + .options(**read_options) + .option("subscribe", entity._online_topic_name) + .option("startingOffsets", offset_string) + .option("includeHeaders", "true") + .load() + .limit(5000000) + ) + + # filter only the necessary entries + df = df.filter(expr("CAST(filter(headers, header -> header.key = 'featureGroupId')[0].value AS STRING)") == str(entity._id)) + df = df.filter(expr("CAST(filter(headers, header -> header.key = 'subjectId')[0].value AS STRING)") == str(entity.subject["id"])) + + # deserialize dataframe so that it can be properly saved + deserialized_df = engine.get_instance()._deserialize_from_avro(entity, df) + + # insert data + entity.stream = False # to make sure we don't write to kafka + entity.insert(deserialized_df, storage="offline") + + # update offsets + df_offsets = df.groupBy('partition').agg(max('offset').alias('offset')).collect() + offset_dict = json.loads(offset_string) + for offset_row in df_offsets: + offset_dict[f"{entity._online_topic_name}"][f"{offset_row.partition}"] = offset_row.offset + 1 + + # save offsets + offset_df = spark.createDataFrame([offset_dict]) + offset_df.coalesce(1).write.mode("overwrite").json(offset_location) + +def update_table_schema_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None: + """ + Run table schema update job on a feature group. 
+ """ + feature_store = job_conf.pop("feature_store") + fs = get_feature_store_handle(feature_store) + + entity = fs.get_feature_group(name=job_conf["name"], version=job_conf["version"]) + + entity.stream = False + engine.get_instance().update_table_schema(entity) + +def _build_starting_offsets(initial_check_point_string: str): + if not initial_check_point_string: + return "" + + # Split the input string into the topic and partition-offset pairs + topic, offsets = initial_check_point_string.split(',', 1) + + # Split the offsets and build a dictionary from them + offsets_dict = {partition: int(offset) for partition, offset in (pair.split(':') for pair in offsets.split(','))} + + # Create the final dictionary structure + result = {topic: offsets_dict} + + return result if __name__ == "__main__": # Setup spark first so it fails faster in case of args errors @@ -278,6 +370,8 @@ def delta_vacuum_fg(spark: SparkSession, job_conf: Dict[Any, Any]) -> None: "import_fg", "run_feature_monitoring", "delta_vacuum_fg", + "offline_fg_materialization", + "update_table_schema_fg", ], help="Operation type", ) @@ -297,6 +391,12 @@ def parse_isoformat_date(da: str) -> datetime: help="Job start time", ) + parser.add_argument( + "-initialCheckPointString", + type=str, + help="Kafka offset to start consuming from", + ) + args = parser.parse_args() job_conf = read_job_conf(args.path) @@ -318,6 +418,10 @@ def parse_isoformat_date(da: str) -> datetime: run_feature_monitoring(job_conf) elif args.op == "delta_vacuum_fg": delta_vacuum_fg(spark, job_conf) + elif args.op == "offline_fg_materialization": + offline_fg_materialization(spark, job_conf, args.initialCheckPointString) + elif args.op == "update_table_schema_fg": + update_table_schema_fg(spark, job_conf) success = True except Exception:
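# Editor's worked example (annotation, not part of the diff): given the parsing logic in
# _build_starting_offsets above, an -initialCheckPointString such as
#
#     "test_topic,0:42,1:17"
#
# is split into the topic name and its partition:offset pairs and becomes
#
#     {"test_topic": {"0": 42, "1": 17}}
#
# which json.dumps turns into the startingOffsets JSON expected by the Spark Kafka source.
# offline_fg_materialization then advances each partition to max(offset) + 1 and persists
# the result under <prepare_spark_location()>/kafka_offsets for the next run to resume from.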