From 321be274659cd7c64e7f649ed9997a83ca162a3a Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Tue, 10 Oct 2023 00:13:12 +0200 Subject: [PATCH] fixes globbing on windows --- dlt/common/storages/configuration.py | 7 ++++++- dlt/common/storages/fsspec_filesystem.py | 7 ++++++- tests/common/storages/test_local_filesystem.py | 13 +++++++------ 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 8931a461d0..699465ce4a 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -1,3 +1,4 @@ +import os from urllib.parse import urlparse from typing import TYPE_CHECKING, Any, Literal, Optional, Type, get_args, ClassVar, Dict, Union @@ -74,7 +75,11 @@ class FilesystemConfiguration(BaseConfiguration): def protocol(self) -> str: """`bucket_url` protocol""" url = urlparse(self.bucket_url) - return url.scheme or "file" + # this prevents Windows absolute paths from being recognized as URL schemes + if not url.scheme or (os.path.isabs(self.bucket_url) and "\\" in self.bucket_url): + return "file" + else: + return url.scheme def on_resolved(self) -> None: url = urlparse(self.bucket_url) diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index 93d3d37bbc..c084fcc12e 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -190,8 +190,10 @@ def glob_files( Returns: Iterable[FileItem]: The list of files. 
""" + import os bucket_url_parsed = urlparse(bucket_url) - if not bucket_url_parsed.scheme: + # if this is a file path without a scheme + if not bucket_url_parsed.scheme or (os.path.isabs(bucket_url) and "\\" in bucket_url): # this is a file so create a proper file url bucket_url = pathlib.Path(bucket_url).absolute().as_uri() bucket_url_parsed = urlparse(bucket_url) @@ -207,6 +209,9 @@ def glob_files( for file, md in glob_result.items(): if md["type"] != "file": continue + # make this an absolute path for the file:// url + if bucket_url_parsed.scheme == "file" and not file.startswith("/"): + file = "/" + file file_name = posixpath.relpath(file, bucket_path) file_url = bucket_url_parsed.scheme + "://" + file yield FileItem( diff --git a/tests/common/storages/test_local_filesystem.py b/tests/common/storages/test_local_filesystem.py index a8cdc96458..e9550a3173 100644 --- a/tests/common/storages/test_local_filesystem.py +++ b/tests/common/storages/test_local_filesystem.py @@ -1,8 +1,7 @@ import os import itertools import pytest - - +import pathlib from dlt.common.storages import fsspec_from_config, FilesystemConfiguration from dlt.common.storages.fsspec_filesystem import glob_files @@ -14,13 +13,15 @@ @pytest.mark.parametrize("bucket_url,load_content", itertools.product(["file:///", "/", ""], [True, False])) def test_filesystem_dict_local(bucket_url: str, load_content: bool) -> None: - if bucket_url in ["file://", ""]: + if bucket_url in [""]: # relative paths - bucket_url += TEST_SAMPLE_FILES + bucket_url = TEST_SAMPLE_FILES else: - bucket_url += os.path.abspath(TEST_SAMPLE_FILES)[1:] + if bucket_url == "/": + bucket_url = os.path.abspath(TEST_SAMPLE_FILES) + else: + bucket_url = pathlib.Path(TEST_SAMPLE_FILES).absolute().as_uri() - print(bucket_url) config = FilesystemConfiguration(bucket_url=bucket_url) filesystem, _ = fsspec_from_config(config) # use glob to get data