logicalclocks · bubriks · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024 · Dec 9, 2024
diff --git a/utils/python/hsfs_utils.py b/utils/python/hsfs_utils.py
@@ -273,13 +273,13 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
         entity.feature_store_id, {}, engine="spark"
     )
 
-    # get offsets
+    # get starting offsets
     offset_location = entity.prepare_spark_location() + "/kafka_offsets"
     try:
         if initial_check_point_string:
-            offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+            starting_offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
         else:
-            offset_string = spark.read.json(offset_location).toJSON().first()
+            starting_offset_string = spark.read.json(offset_location).toJSON().first()
     except Exception as e:
         print(f"An unexpected error occurred: {e}")
         # if all else fails read from the beggining
@@ -289,15 +289,26 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
             offline_write_options={},
             high=False,
         )
-        offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
-    print(f"startingOffsets: {offset_string}")
+        starting_offset_string = json.dumps(_build_starting_offsets(initial_check_point_string))
+    print(f"startingOffsets: {starting_offset_string}")
+
+    # get ending offsets
+    ending_offset_string = kafka_engine.kafka_get_offsets(
+        topic_name=entity._online_topic_name,
+        feature_store_id=entity.feature_store_id,
+        offline_write_options={},
+        high=True,
+    )
+    ending_offset_string = json.dumps(_build_starting_offsets(ending_offset_string))
+    print(f"endingOffsets: {ending_offset_string}")
 
     # read kafka topic
     df = (
         spark.read.format("kafka")
         .options(**read_options)
         .option("subscribe", entity._online_topic_name)
-        .option("startingOffsets", offset_string)
+        .option("startingOffsets", starting_offset_string)
+        .option("endingOffsets", ending_offset_string)
         .option("includeHeaders", "true")
         .option("failOnDataLoss", "false")
         .load()
@@ -320,7 +331,7 @@ def offline_fg_materialization(spark: SparkSession, job_conf: Dict[Any, Any], in
 
     # update offsets
     df_offsets = (df if limit > filtered_df.count() else filtered_df).groupBy('partition').agg(max('offset').alias('offset')).collect()
-    offset_dict = json.loads(offset_string)
+    offset_dict = json.loads(starting_offset_string)
     for offset_row in df_offsets:
         offset_dict[f"{entity._online_topic_name}"][f"{offset_row.partition}"] = offset_row.offset + 1