dlt-hub · sh-rp · Apr 17, 2024 · Apr 3, 2024 · Apr 3, 2024 · Apr 3, 2024
diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py
@@ -427,6 +427,7 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]:
 
     @abstractmethod
     def get_stored_schema_by_hash(self, version_hash: str) -> StorageSchemaInfo:
+        """Retrieves the stored schema with the given `version_hash`"""
         pass
 
     @abstractmethod

diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py
@@ -1,9 +1,13 @@
 import posixpath
 import os
 from types import TracebackType
-from typing import ClassVar, List, Type, Iterable, Set, Iterator
+from typing import ClassVar, List, Type, Iterable, Set, Iterator, Optional, Tuple
 from fsspec import AbstractFileSystem
 from contextlib import contextmanager
+from dlt.common import json, pendulum
+from dlt.common.typing import DictStrAny
+
+import re
 
 from dlt.common import logger
 from dlt.common.schema import Schema, TSchemaTables, TTableSchema
@@ -16,6 +20,10 @@
     JobClientBase,
     FollowupJob,
     WithStagingDataset,
+    WithStateSync,
+    StorageSchemaInfo,
+    StateInfo,
+    DoNothingJob,
 )
 
 from dlt.destinations.job_impl import EmptyLoadJob
@@ -87,7 +95,7 @@ def create_followup_jobs(self, final_state: TLoadJobState) -> List[NewLoadJob]:
         return jobs
 
 
-class FilesystemClient(JobClientBase, WithStagingDataset):
+class FilesystemClient(JobClientBase, WithStagingDataset, WithStateSync):
     """filesystem client storing jobs in memory"""
 
     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
@@ -171,9 +179,14 @@ def update_stored_schema(
         self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None
     ) -> TSchemaTables:
         # create destination dirs for all tables
+        # TODO we should only create dirs for datatables
         dirs_to_create = self._get_table_dirs(only_tables or self.schema.tables.keys())
         for directory in dirs_to_create:
             self.fs_client.makedirs(directory, exist_ok=True)
+
+        # write schema to destination
+        self.store_current_schema()
+
         return expected_update
 
     def _get_table_dirs(self, table_names: Iterable[str]) -> Set[str]:
@@ -192,6 +205,11 @@ def is_storage_initialized(self) -> bool:
         return self.fs_client.isdir(self.dataset_path)  # type: ignore[no-any-return]
 
     def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
+        # skip the state table, we create a jsonl file in the complete_load
+        # step
+        if table["name"] == self.schema.state_table_name:
+            return DoNothingJob(file_path)
+
         cls = FollowupFilesystemJob if self.config.as_staging else LoadFilesystemJob
         return cls(
             file_path,
@@ -204,12 +222,6 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
     def restore_file_load(self, file_path: str) -> LoadJob:
         return EmptyLoadJob.from_file_path(file_path, "completed")
 
-    def complete_load(self, load_id: str) -> None:
-        schema_name = self.schema.name
-        table_name = self.schema.loads_table_name
-        file_name = f"{schema_name}.{table_name}.{load_id}"
-        self.fs_client.touch(posixpath.join(self.dataset_path, file_name))
-
     def __enter__(self) -> "FilesystemClient":
         return self
 
@@ -220,3 +232,111 @@ def __exit__(
 
     def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool:
         return False
+
+    #
+    # state stuff
+    #
+
+    def _write_to_json_file(self, filepath: str, data: DictStrAny) -> None:
+        """Serializes `data` to json and writes it to `filepath`, creating the parent dir"""
+        self.fs_client.makedirs(os.path.dirname(filepath), exist_ok=True)
+        self.fs_client.write_text(filepath, json.dumps(data), "utf-8")
+
+    def complete_load(self, load_id: str) -> None:
+        """Stores the current pipeline state, then writes the entry for `load_id`
+        as a jsonl file to the loads "table" directory.
+        """
+        self.store_current_state()
+
+        # TODO: this is also duplicate across all destinations. DRY this.
+        loads_dir = f"{self.dataset_path}/{self.schema.loads_table_name}"
+        self._write_to_json_file(
+            f"{loads_dir}/{self.schema.name}.{load_id}.jsonl",
+            {
+                "load_id": load_id,
+                "schema_name": self.schema.name,
+                "status": 0,
+                "inserted_at": pendulum.now().isoformat(),
+                "schema_version_hash": self.schema.version_hash,
+            },
+        )
+
+    #
+    # state read/write
+    #
+
+    def _get_state_file_name(self, pipeline_name: str, hash: str) -> str:
+        """Returns the full path of the state file for `pipeline_name` and `hash`.
+
+        All non-word characters are removed from `hash` before it is used in the file name.
+        """
+        safe_hash = re.sub(r"\W", "", hash)
+        state_dir = f"{self.dataset_path}/{self.schema.state_table_name}"
+        return f"{state_dir}/{pipeline_name}__{safe_hash}.jsonl"
+
+    def store_current_state(self) -> None:
+        """Writes the state doc of the active pipeline to the state "table" directory.
+
+        It is written twice: as the "current" file and under the stored schema version hash.
+        """
+        # imported locally (NOTE(review): presumably to avoid a circular import - confirm)
+        from dlt import current
+        from dlt.pipeline.state_sync import state_doc
+
+        active_pipeline = current.pipeline()
+        doc = state_doc(active_pipeline._get_state())
+        name = active_pipeline.pipeline_name
+
+        # resolve both target paths first, then write the same doc to each of them
+        file_hashes = ("current", self.schema.stored_version_hash)
+        targets = [self._get_state_file_name(name, file_hash) for file_hash in file_hashes]
+        for target in targets:
+            self._write_to_json_file(target, doc)
+
+    def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
+        """Loads the "current" state of `pipeline_name`, returns None if no state file exists"""
+        path = self._get_state_file_name(pipeline_name, "current")
+        if not self.fs_client.exists(path):
+            return None
+        fields = json.loads(self.fs_client.read_text(path))
+        return StateInfo(**{k: v for k, v in fields.items() if k != "version_hash"})
+
+    #
+    # Schema read/write
+    #
+
+    def _get_schema_file_name(self, hash: str) -> str:
+        """Returns the full path of the schema file for the given `hash`"""
+        # keep word characters only, all special chars are removed from the hash
+        safe_hash = re.sub(r"\W", "", hash)
+        schema_dir = f"{self.dataset_path}/{self.schema.version_table_name}"
+        return f"{schema_dir}/{self.schema.name}__{safe_hash}.jsonl"
+
+    def get_stored_schema(self) -> Optional[StorageSchemaInfo]:
+        """Retrieves the schema stored under the "current" name, if there is one"""
+        return self.get_stored_schema_by_hash("current")
+
+    def get_stored_schema_by_hash(self, version_hash: str) -> Optional[StorageSchemaInfo]:
+        """Retrieves the schema stored under `version_hash`, None if there is no such file"""
+        path = self._get_schema_file_name(version_hash)
+        found = self.fs_client.exists(path)
+        return StorageSchemaInfo(**json.loads(self.fs_client.read_text(path))) if found else None
+
+    def store_current_schema(self) -> None:
+        """Writes the version info of the current schema to the version "table" directory
+        twice: as the "current" file and under the stored schema version hash.
+        """
+        schema = self.schema
+        stored_hash = schema.stored_version_hash
+        targets = [self._get_schema_file_name(name) for name in ("current", stored_hash)]
+        # TODO: duplicate of weaviate implementation, should be abstracted out
+        version_info = {
+            "version_hash": stored_hash,
+            "schema_name": schema.name,
+            "version": schema.version,
+            "engine_version": schema.ENGINE_VERSION,
+            "inserted_at": pendulum.now(),
+            "schema": json.dumps(schema.to_dict()),
+        }
+        for target in targets:
+            self._write_to_json_file(target, version_info)
diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py
@@ -521,17 +521,6 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]:
                     state["dlt_load_id"] = state.pop("_dlt_load_id")
                     return StateInfo(**state)
 
-    # def get_stored_states(self, state_table: str) -> List[StateInfo]:
-    #     state_records = self.get_records(state_table,
-    #         sort={
-    #             "path": ["created_at"],
-    #             "order": "desc"
-    #         }, properties=self.state_properties)
-
-    #     for state in state_records:
-    #         state["dlt_load_id"] = state.pop("_dlt_load_id")
-    #     return [StateInfo(**state) for state in state_records]
-
     def get_stored_schema(self) -> Optional[StorageSchemaInfo]:
         """Retrieves newest schema from destination storage"""
         try:

diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py
@@ -371,15 +371,6 @@ def get_stored_state(self, pipeline_name: str) -> StateInfo:
             return None
         return StateInfo(row[0], row[1], row[2], row[3], pendulum.instance(row[4]))
 
-    # def get_stored_states(self, state_table: str) -> List[StateInfo]:
-    #     """Loads list of compressed states from destination storage, optionally filtered by pipeline name"""
-    #     query = f"SELECT {self.STATE_TABLE_COLUMNS} FROM {state_table} AS s ORDER BY created_at DESC"
-    #     result: List[StateInfo] = []
-    #     with self.sql_client.execute_query(query) as cur:
-    #         for row in cur.fetchall():
-    #             result.append(StateInfo(row[0], row[1], row[2], row[3], pendulum.instance(row[4])))
-    #     return result
-
     def get_stored_schema_by_hash(self, version_hash: str) -> StorageSchemaInfo:
         name = self.sql_client.make_qualified_table_name(self.schema.version_table_name)
         query = f"SELECT {self.version_table_schema_columns} FROM {name} WHERE version_hash = %s;"

diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py
@@ -713,7 +713,6 @@ def sync_destination(
                             remote_state["schema_names"], always_download=True
                         )
                         # TODO: we should probably wipe out pipeline here
-
                 # if we didn't full refresh schemas, get only missing schemas
                 if restored_schemas is None:
                     restored_schemas = self._get_schemas_from_destination(

diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py
@@ -115,20 +115,27 @@ def migrate_pipeline_state(
     return cast(TPipelineState, state)
 
 
-def state_resource(state: TPipelineState) -> DltResource:
-    state = copy(state)
-    state.pop("_local")
+def state_doc(state: TPipelineState) -> DictStrAny:
+    """Creates the state document from `state`; `_local` is dropped before compressing"""
+    state = copy(state)
+    state.pop("_local")
     state_str = compress_state(state)
-    state_doc = {
+    doc = {
         "version": state["_state_version"],
         "engine_version": state["_state_engine_version"],
         "pipeline_name": state["pipeline_name"],
         "state": state_str,
         "created_at": pendulum.now(),
         "version_hash": state["_version_hash"],
     }
+    return doc
+
+
+def state_resource(state: TPipelineState) -> DltResource:
+    """Creates a resource that appends the state document of `state` to the state table"""
+    doc = state_doc(state)
     return dlt.resource(
-        [state_doc], name=STATE_TABLE_NAME, write_disposition="append", columns=STATE_TABLE_COLUMNS
+        [doc], name=STATE_TABLE_NAME, write_disposition="append", columns=STATE_TABLE_COLUMNS
     )
 
 

diff --git a/fs_testing_pipe.py b/fs_testing_pipe.py
@@ -0,0 +1,20 @@
+import dlt
+import os
+
+if __name__ == "__main__":
+    # manual test script: runs a small incremental resource against a
+    # filesystem destination configured through environment variables
+    os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://my_files"
+    os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE"
+
+    # resource with incremental for testing restoring of pipeline state
+    @dlt.resource(name="my_table")
+    def my_resource(id=dlt.sources.incremental("id")):
+        yield from ({"id": row_id} for row_id in range(1, 6))
+
+    # run the resource once with the default loader file format
+    pipeline = dlt.pipeline(
+        pipeline_name="dave",
+        destination="filesystem",
+    )
+    pipeline.run(my_resource(), table_name="my_table")  # , loader_file_format="parquet")