From 46d9bda86ea93922897e3957eb0a91a25772850c Mon Sep 17 00:00:00 2001 From: Emma Turetsky Date: Thu, 12 Dec 2024 15:50:03 -0600 Subject: [PATCH 1/4] Split get_dirlist_url from _set_director_url --- src/pelicanfs/core.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/pelicanfs/core.py b/src/pelicanfs/core.py index de39359..0bd2466 100644 --- a/src/pelicanfs/core.py +++ b/src/pelicanfs/core.py @@ -384,10 +384,7 @@ async def get_origin_url(self, fileloc: str) -> str: raise NoAvailableSource() return origin - async def get_dirlist_url(self, fileloc: str) -> str: - """ - Returns a dirlist host url for the given namespace locations - """ + async def _set_director_url(self) -> str: if not self.director_url: metadata_json = await self._discover_federation_metadata(self.discovery_url) # Ensure the director url has a '/' at the end @@ -399,6 +396,12 @@ async def get_dirlist_url(self, fileloc: str) -> str: director_url = director_url + "/" self.director_url = director_url + async def get_dirlist_url(self, fileloc: str) -> str: + """ + Returns a dirlist host url for the given namespace locations + """ + await self._set_director_url() + url = urllib.parse.urljoin(self.director_url, fileloc) # Timeout response in seconds - the default response is 5 minutes From 6d0a3c696935a947cd0fa5e2282ac0e892dd1bc8 Mon Sep 17 00:00:00 2001 From: Emma Turetsky Date: Thu, 12 Dec 2024 16:15:26 -0600 Subject: [PATCH 2/4] Added an _ls_real function into core.py -- This function will use WebDAV listings for the base ls call --- src/pelicanfs/core.py | 50 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) diff --git a/src/pelicanfs/core.py b/src/pelicanfs/core.py index 0bd2466..79c171c 100644 --- a/src/pelicanfs/core.py +++ b/src/pelicanfs/core.py @@ -24,6 +24,7 @@ import aiohttp import cachetools import fsspec.implementations.http as fshttp +from aiowebdav.client import Client from fsspec.asyn import AsyncFileSystem,
sync from fsspec.utils import glob_translate @@ -473,8 +474,53 @@ async def wrapper(self, *args, **kwargs): @_dirlist_dec async def _ls(self, path, detail=True, **kwargs): - results = await self.http_file_system._ls(path, detail, **kwargs) - return self._remove_host_from_paths(results) + """ + This _ls call will mimic the httpfs _ls call and call our version of _ls_real + """ + # TODO: Add the optional listings cache for pelicanfs - removed for now in order to keep things simple for + # the re-implementation of ls + # if self.use_listings_cache and path in self.dircache: + # out = self.dircache[path] + # else: + out = await self._ls_real(path, detail=detail, **kwargs) + # self.dircache[path] = out + return self._remove_host_from_paths(out) + + async def _ls_real(self, url, detail=True, **kwargs): + """ + This _ls_real uses a webdavclient listing rather than an https call. This lets pelicanfs identify whether an object + is a file or a collection. This is important for functions which are expected to recurse or walk the collection url + such as find/glob/walk + """ + # ignoring URL-encoded arguments + logger.debug(url) + parts = urllib.parse.urlparse(url) + base_url = f"{parts.scheme}://{parts.netloc}" + + # Create the options for the webdavclient + + options = { + "hostname": base_url, + } + + async with Client(options) as client: + remote_dir = parts.path + try: + items = await client.list(remote_dir, get_info=True) + if detail: + return [ + { + "name": f"{base_url}{item['path']}", # use the base url in order for httpfs find/walk to be able to call its info + "size": None, + "type": "directory" if item["isdir"] else "file", + } + for item in items + ] + else: + return sorted([item["path"] for item in items]) # TODO: Check to see if this needs to match the name scheme + except Exception: + # TODO: Check for if the top level is a file and not a directory and handle accordingly + raise @_dirlist_dec async def _isdir(self, path): From 
7281466f0c4df7970aeed3ed64db73b9ba4a5137 Mon Sep 17 00:00:00 2001 From: Emma Turetsky Date: Thu, 12 Dec 2024 16:19:09 -0600 Subject: [PATCH 3/4] Overwrote the httpfs _ls_real with ours -- This is to avoid having to create find and walk which would simply be duplicates of the httpfs find/walk --- src/pelicanfs/core.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/pelicanfs/core.py b/src/pelicanfs/core.py index 79c171c..2f32359 100644 --- a/src/pelicanfs/core.py +++ b/src/pelicanfs/core.py @@ -222,6 +222,9 @@ def __init__( self._mkdir = self.http_file_system._mkdir self._makedirs = self.http_file_system._makedirs + # Overwrite the httpfs _ls_real call with ours + self.http_file_system._ls_real = self._ls_real + # Note this is a class method because it's overwriting a class method for the AbstractFileSystem @classmethod def _strip_protocol(cls, path): From 86218f56d0c11a132932fe48afffac1b0be7d908 Mon Sep 17 00:00:00 2001 From: Emma Turetsky Date: Thu, 12 Dec 2024 16:38:32 -0600 Subject: [PATCH 4/4] Add aiowebdav to setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index cc3c6a8..bc70e44 100644 --- a/setup.py +++ b/setup.py @@ -35,6 +35,7 @@ install_requires=[ "aiohttp~=3.9.4", "aiosignal~=1.3.1", + "aiowebdav~=0.1.0rc5", "async-timeout~=4.0.3", "attrs~=23.2.0", "frozenlist~=1.4.1",