Add Delta table support for filesystem destination #1382
Changes from 12 commits
```diff
@@ -214,6 +214,20 @@ def exception(self) -> str:
         pass
 
 
+class DirectoryLoadJob:
+    """Job that loads a directory of files in a single transaction."""
+
+    def __init__(self, dir_name: str) -> None:
+        self._dir_name = dir_name
+
+    def dir_name(self) -> str:
+        """Returns name of directory containing the job files."""
+        return self._dir_name
+
+    def job_id(self) -> str:
+        return "hacked_job_id"
+
+
 class NewLoadJob(LoadJob):
     """Adds a trait that allows to save new job file"""
```
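The single-transaction idea is what makes a directory-level job attractive for Delta: every parquet file produced for one table in a load package can be committed as a single Delta transaction. A rough sketch of the concept using the `deltalake` and `pyarrow` libraries (illustration only, not part of this diff; all paths and names are made up):

```python
import pyarrow.dataset as ds
from deltalake import write_deltalake

# read every parquet file in the (hypothetical) job directory as one dataset
job_dir = ds.dataset("1234/new_jobs/my_table", format="parquet")

# commit all of it to the Delta table in a single transaction
write_deltalake("s3://bucket/my_table", job_dir.to_table(), mode="append")
```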
```diff
@@ -309,8 +323,12 @@ def restore_file_load(self, file_path: str) -> LoadJob:
         """Finds and restores already started loading job identified by `file_path` if destination supports it."""
         pass
 
+    def can_do_logical_replace(self, table: TTableSchema) -> bool:
+        """Returns True if `replace` can be done without physically deleting data."""
+        return table["table_format"] == "delta"
+
     def should_truncate_table_before_load(self, table: TTableSchema) -> bool:
-        return table["write_disposition"] == "replace"
+        return table["write_disposition"] == "replace" and not self.can_do_logical_replace(table)
 
     def create_table_chain_completed_followup_jobs(
         self, table_chain: Sequence[TTableSchema]
```

Review comments on `can_do_logical_replace`:

> **Comment:** Perhaps this can become a destination capability if we turn Delta into a full destination.

> **Comment:** IMO we do not need this on the highest abstraction level. This belongs only to …

> **Reply:** I'm trying to keep the abstract classes as simple as possible. The two methods below are already a stretch (but I do not have an idea where to move them).
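For intuition on "logical replace": overwriting a Delta table writes a new version to the transaction log instead of physically deleting the old data files. A minimal sketch with the `deltalake` package (illustration only, not part of the PR; the table location is made up):

```python
import pyarrow as pa
from deltalake import DeltaTable, write_deltalake

table_uri = "/tmp/delta_demo"  # hypothetical local table location

write_deltalake(table_uri, pa.table({"id": [1, 2]}))                    # version 0
write_deltalake(table_uri, pa.table({"id": [3, 4]}), mode="overwrite")  # version 1

dt = DeltaTable(table_uri)
print(dt.version())                       # 1
print(dt.to_pyarrow_table().to_pydict())  # {'id': [3, 4]} -- old rows gone logically
# the version-0 parquet files still sit on disk until dt.vacuum() removes them
```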
```diff
@@ -64,7 +64,7 @@
     "dedup_sort",
 ]
 """Known hints of a column used to declare hint regexes."""
-TTableFormat = Literal["iceberg", "parquet", "jsonl"]
+TTableFormat = Literal["iceberg", "parquet", "jsonl", "delta"]
 
 TTypeDetections = Literal[
     "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"
 ]
```

Review comments on `TTableFormat`:

> **Comment:** I think we should kick out "parquet" and "jsonl" from here. Why: to have a clear distinction between file format and table format. I see 3 formats now: iceberg, delta, and hive (or pyarrow dataset).

> **Reply:** Done.
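With `"delta"` accepted as a table format, a pipeline could request it per resource. A hypothetical usage sketch (assuming `table_format` is threaded through `dlt.resource` the same way `"iceberg"` already is; the resource and dataset names are made up):

```python
import dlt

@dlt.resource(table_format="delta", write_disposition="replace")
def events():
    yield [{"id": 1}, {"id": 2}]

# bucket_url etc. would come from config/secrets as usual
pipeline = dlt.pipeline(destination="filesystem", dataset_name="demo")
pipeline.run(events())
```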
```diff
@@ -22,6 +22,7 @@
     Any,
     Tuple,
     TypedDict,
+    Union,
 )
 from typing_extensions import NotRequired
```
```diff
@@ -177,6 +178,15 @@ def __str__(self) -> str:
         return self.job_id()
 
 
+class ParsedLoadJobDirectoryName(NamedTuple):
+    table_name: str
+
+    @staticmethod
+    def parse(dir_name: str) -> "ParsedLoadJobDirectoryName":
+        table_name = Path(dir_name).name
+        return ParsedLoadJobDirectoryName(table_name=table_name)
+
+
 class LoadJobInfo(NamedTuple):
     state: TJobState
     file_path: str
```

Review comment on `ParsedLoadJobDirectoryName`:

> **Comment:** Also very minimal. Same as above.
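A quick, self-contained illustration of what `parse` does (restated from the diff above with a usage check added; not additional PR code): a directory job is named after its table, so the table name is just the last component of the job directory path.

```python
from pathlib import Path
from typing import NamedTuple

class ParsedLoadJobDirectoryName(NamedTuple):
    table_name: str

    @staticmethod
    def parse(dir_name: str) -> "ParsedLoadJobDirectoryName":
        # the last path component of the job directory is the table name
        return ParsedLoadJobDirectoryName(table_name=Path(dir_name).name)

assert ParsedLoadJobDirectoryName.parse("new_jobs/my_table").table_name == "my_table"
```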
```diff
@@ -316,11 +326,18 @@ def __init__(self, storage: FileStorage, initial_state: TLoadPackageStatus) -> None:
     def get_package_path(self, load_id: str) -> str:
         return load_id
 
-    def get_job_folder_path(self, load_id: str, folder: TJobState) -> str:
-        return os.path.join(self.get_package_path(load_id), folder)
+    def get_job_folder_path(
+        self, load_id: str, folder: TJobState, subfolder: Optional[str] = None
+    ) -> str:
+        if subfolder is None:
+            return os.path.join(self.get_package_path(load_id), folder)
+        else:
+            return os.path.join(self.get_package_path(load_id), folder, subfolder)
 
-    def get_job_file_path(self, load_id: str, folder: TJobState, file_name: str) -> str:
-        return os.path.join(self.get_job_folder_path(load_id, folder), file_name)
+    def get_job_file_path(
+        self, load_id: str, folder: TJobState, file_name: str, subfolder: Optional[str] = None
+    ) -> str:
+        return os.path.join(self.get_job_folder_path(load_id, folder, subfolder), file_name)
 
     def list_packages(self) -> Sequence[str]:
         """Lists all load ids in storage, earliest first
```
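The effect of the new `subfolder` argument, shown with a made-up load id (a standalone restatement of the join logic, illustration only):

```python
import os

def job_folder_path(load_id: str, folder: str, subfolder: str = None) -> str:
    # mirrors get_job_folder_path outside the class, just to show the shapes
    parts = [load_id, folder] + ([subfolder] if subfolder else [])
    return os.path.join(*parts)

print(job_folder_path("1234", "new_jobs"))              # 1234/new_jobs
print(job_folder_path("1234", "new_jobs", "my_table"))  # 1234/new_jobs/my_table
```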
```diff
@@ -331,11 +348,17 @@ def list_packages(self) -> Sequence[str]:
         # start from the oldest packages
         return sorted(loads)
 
-    def list_new_jobs(self, load_id: str) -> Sequence[str]:
-        new_jobs = self.storage.list_folder_files(
-            self.get_job_folder_path(load_id, PackageStorage.NEW_JOBS_FOLDER)
-        )
-        return new_jobs
+    def list_new_jobs(self, load_id: str, root_only: bool = False) -> Sequence[str]:
+        root_dir = self.get_job_folder_path(load_id, PackageStorage.NEW_JOBS_FOLDER)
+        if root_only:
+            return self.storage.list_folder_files(root_dir)
+        sub_dirs = self.storage.list_folder_dirs(root_dir)
+        dirs = [root_dir] + sub_dirs
+        return [file for dir_ in dirs for file in self.storage.list_folder_files(dir_)]
+
+    def list_new_dir_jobs(self, load_id: str) -> Sequence[str]:
+        root_dir = self.get_job_folder_path(load_id, PackageStorage.NEW_JOBS_FOLDER)
+        return self.storage.list_folder_dirs(root_dir)
 
     def list_started_jobs(self, load_id: str) -> Sequence[str]:
         return self.storage.list_folder_files(
```
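What the two listing methods return for a package that mixes file jobs and a directory job (a hypothetical layout; names are made up):

```python
# 1234/new_jobs/events.abc.0.jsonl       <- a regular file job
# 1234/new_jobs/my_table/part-0.parquet  <- files inside a directory job
# 1234/new_jobs/my_table/part-1.parquet
#
# list_new_jobs("1234")                  -> all three files
# list_new_jobs("1234", root_only=True)  -> only events.abc.0.jsonl
# list_new_dir_jobs("1234")              -> the my_table directory
```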
```diff
@@ -382,17 +405,19 @@ def import_job(
         """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`"""
         self.storage.atomic_import(job_file_path, self.get_job_folder_path(load_id, job_state))
 
-    def start_job(self, load_id: str, file_name: str) -> str:
+    def start_job(self, load_id: str, job: Union["LoadJob", "DirectoryLoadJob"]) -> str:  # type: ignore[name-defined] # noqa: F821
         return self._move_job(
-            load_id, PackageStorage.NEW_JOBS_FOLDER, PackageStorage.STARTED_JOBS_FOLDER, file_name
+            load_id, PackageStorage.NEW_JOBS_FOLDER, PackageStorage.STARTED_JOBS_FOLDER, job
         )
 
-    def fail_job(self, load_id: str, file_name: str, failed_message: Optional[str]) -> str:
+    def fail_job(
+        self, load_id: str, job: Union["LoadJob", "DirectoryLoadJob"], failed_message: Optional[str]  # type: ignore[name-defined] # noqa: F821
+    ) -> str:
         # save the exception to failed jobs
         if failed_message:
             self.storage.save(
                 self.get_job_file_path(
-                    load_id, PackageStorage.FAILED_JOBS_FOLDER, file_name + ".exception"
+                    load_id, PackageStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception"
                 ),
                 failed_message,
             )
```
```diff
@@ -401,28 +426,30 @@ def fail_job(self, load_id: str, file_name: str, failed_message: Optional[str]) -> str:
             load_id,
             PackageStorage.STARTED_JOBS_FOLDER,
             PackageStorage.FAILED_JOBS_FOLDER,
-            file_name,
+            job.file_name(),
         )
 
-    def retry_job(self, load_id: str, file_name: str) -> str:
+    def retry_job(self, load_id: str, job: Union["LoadJob", "DirectoryLoadJob"]) -> str:  # type: ignore[name-defined] # noqa: F821
         # when retrying job we must increase the retry count
-        source_fn = ParsedLoadJobFileName.parse(file_name)
+        source_fn = ParsedLoadJobFileName.parse(job.file_name())
         dest_fn = source_fn.with_retry()
         # move it directly to new file name
         return self._move_job(
             load_id,
             PackageStorage.STARTED_JOBS_FOLDER,
             PackageStorage.NEW_JOBS_FOLDER,
-            file_name,
+            job.file_name(),
             dest_fn.file_name(),
         )
 
-    def complete_job(self, load_id: str, file_name: str) -> str:
+    def complete_job(
+        self, load_id: str, job: Union["LoadJob", "DirectoryLoadJob"]  # type: ignore[name-defined] # noqa: F821
+    ) -> str:
         return self._move_job(
             load_id,
             PackageStorage.STARTED_JOBS_FOLDER,
             PackageStorage.COMPLETED_JOBS_FOLDER,
-            file_name,
+            job,
         )
 
     #
```
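For context on `retry_job` (an assumption about the existing dlt name scheme, not something this diff changes): job file names embed a retry count, and `with_retry` bumps it so the job moved back to `new_jobs` gets a fresh name.

```python
# assuming the usual dlt job file name layout
# {table_name}.{file_id}.{retry_count}.{file_format}:
#
#   "events.abc123.0.jsonl"  --with_retry-->  "events.abc123.1.jsonl"
```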
```diff
@@ -601,15 +628,25 @@ def _move_job(
         load_id: str,
         source_folder: TJobState,
         dest_folder: TJobState,
-        file_name: str,
+        job: Union["LoadJob", "DirectoryLoadJob"],  # type: ignore[name-defined] # noqa: F821
         new_file_name: str = None,
     ) -> str:
-        # ensure we move file names, not paths
-        assert file_name == FileStorage.get_file_name_from_file_path(file_name)
+        from dlt.common.destination.reference import LoadJob, DirectoryLoadJob
 
         load_path = self.get_package_path(load_id)
-        dest_path = os.path.join(load_path, dest_folder, new_file_name or file_name)
-        self.storage.atomic_rename(os.path.join(load_path, source_folder, file_name), dest_path)
-        # print(f"{join(load_path, source_folder, file_name)} -> {dest_path}")
 
+        if isinstance(job, LoadJob):
+            source_name = job.file_name()
+            # ensure we move file names, not paths
+            assert source_name == FileStorage.get_file_name_from_file_path(source_name)
+            dest_name = new_file_name or source_name
+        elif isinstance(job, DirectoryLoadJob):
+            source_name = job.dir_name()
+            dest_name = job.dir_name()
+
+        source_path = os.path.join(load_path, source_folder, source_name)
+        dest_path = os.path.join(load_path, dest_folder, dest_name)
+        self.storage.atomic_rename(source_path, dest_path)
         return self.storage.make_full_path(dest_path)
 
     def _load_schema(self, load_id: str) -> DictStrAny:
```
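One detail worth noting (a side observation, not from the PR): plain `os.rename` moves directories just as atomically as files on a local filesystem, which is why the same `atomic_rename` call can shuttle a whole directory job between package folders.

```python
import os
import tempfile

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "new_jobs", "my_table"))
os.makedirs(os.path.join(root, "started_jobs"))

# one rename moves the whole directory job, files and all
os.rename(
    os.path.join(root, "new_jobs", "my_table"),
    os.path.join(root, "started_jobs", "my_table"),
)
print(os.listdir(os.path.join(root, "started_jobs")))  # ['my_table']
```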
> **Comment:** Very minimal for now. Want to get some feedback before further polishing.