From 682c885c9ebd4caf588dba05d4c72dd04121b3c6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 22:07:58 -0300 Subject: [PATCH 01/24] FIX: Remove GTEST_SKIP() that would skip the second call with (false) --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index f6af9f722dbac..410dbf1538cdc 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -662,7 +662,7 @@ class TestAzureFileSystem : public ::testing::Test { void TestAzureFileSystem::TestDetectHierarchicalNamespace(bool trip_up_azurite) { EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); if (trip_up_azurite && env->backend() != AzureBackend::kAzurite) { - GTEST_SKIP() << "trip_up_azurite=true is only for Azurite."; + return; } auto data = SetUpPreexistingData(); From 899a724b7acb75a7c2d7021ea679649bbc3917ce Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 19:34:11 -0300 Subject: [PATCH 02/24] Implement GetFileInfo with upfront HNS checking --- cpp/src/arrow/filesystem/azurefs.cc | 151 ++++++++++++++--------- cpp/src/arrow/filesystem/azurefs_test.cc | 95 ++++++++------ 2 files changed, 148 insertions(+), 98 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 21350a490411a..5e72e2242c143 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -882,10 +882,12 @@ namespace { const char kDelimiter[] = {internal::kSep, '\0'}; +/// \pre location.container is not empty. template -Result GetContainerPropsAsFileInfo(const std::string& container_name, +Result GetContainerPropsAsFileInfo(const AzureLocation& location, ContainerClient& container_client) { - FileInfo info{container_name}; + DCHECK(!location.container.empty()); + FileInfo info{location.path.empty() ? location.all : location.container}; try { auto properties = container_client.GetProperties(); info.set_type(FileType::Directory); @@ -946,7 +948,16 @@ class AzureFileSystem::Impl { io::IOContext& io_context() { return io_context_; } const AzureOptions& options() const { return options_; } - private: + Blobs::BlobContainerClient GetBlobContainerClient(const std::string& container_name) { + return blob_service_client_->GetBlobContainerClient(container_name); + } + + /// \param container_name Also known as "filesystem" in the ADLS Gen2 API. + DataLake::DataLakeFileSystemClient GetFileSystemClient( + const std::string& container_name) { + return datalake_service_client_->GetFileSystemClient(container_name); + } + /// \brief Memoized version of CheckIfHierarchicalNamespaceIsEnabled. /// /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never returned). @@ -978,7 +989,6 @@ class AzureFileSystem::Impl { return hns_support; } - public: /// This is used from unit tests to ensure we perform operations on all the /// possible states of cached_hns_support_. void ForceCachedHierarchicalNamespaceSupport(int support) { @@ -995,33 +1005,20 @@ class AzureFileSystem::Impl { DCHECK(false) << "Invalid enum HierarchicalNamespaceSupport value."; } - Result GetFileInfo(const AzureLocation& location) { - if (location.container.empty()) { - DCHECK(location.path.empty()); - // Root directory of the storage account. - return FileInfo{"", FileType::Directory}; - } - if (location.path.empty()) { - // We have a container, but no path within the container. - // The container itself represents a directory. - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - return GetContainerPropsAsFileInfo(location.container, container_client); - } - // There is a path to search within the container. - FileInfo info{location.all}; - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); + /// \pre location.path is not empty. + Result GetFileInfo(DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location) { auto file_client = adlfs_client.GetFileClient(location.path); try { + FileInfo info{location.all}; auto properties = file_client.GetProperties(); if (properties.Value.IsDirectory) { info.set_type(FileType::Directory); } else if (internal::HasTrailingSlash(location.path)) { - // For a path with a trailing slash a hierarchical namespace may return a blob - // with that trailing slash removed. For consistency with flat namespace and - // other filesystems we chose to return NotFound. - // - // NOTE(felipecrv): could this be an empty directory marker? + // For a path with a trailing slash, a Hierarchical Namespace storage account + // may recognize a file (path with trailing slash removed). For consistency + // with other arrow::FileSystem implementations we chose to return NotFound + // because the trailing slash means the user was looking for a directory. info.set_type(FileType::NotFound); return info; } else { @@ -1033,43 +1030,55 @@ class AzureFileSystem::Impl { return info; } catch (const Storage::StorageException& exception) { if (exception.StatusCode == Http::HttpStatusCode::NotFound) { - ARROW_ASSIGN_OR_RAISE(auto hns_support, - HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound || - hns_support == HNSSupport::kEnabled) { - // If the hierarchical namespace is enabled, then the storage account will - // have explicit directories. Neither a file nor a directory was found. - info.set_type(FileType::NotFound); + return FileInfo{location.all, FileType::NotFound}; + } + return ExceptionToStatus( + exception, "GetProperties for '", file_client.GetUrl(), + "' failed. GetFileInfo is unable to determine whether the path exists."); + } + } + + /// On flat namespace accounts there are no real directories. Directories are + /// implied by empty directory marker blobs with names ending in "/" or there + /// being blobs with names starting with the directory path. + /// + /// \pre location.path is not empty. + Result GetFileInfo(Blobs::BlobContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.path.empty()); + Blobs::ListBlobsOptions options; + options.Prefix = internal::RemoveTrailingSlash(location.path); + options.PageSizeHint = 1; + + try { + FileInfo info{location.all}; + auto list_response = container_client.ListBlobsByHierarchy(kDelimiter, options); + if (!list_response.BlobPrefixes.empty()) { + const auto& blob_prefix = list_response.BlobPrefixes[0]; + if (blob_prefix == internal::EnsureTrailingSlash(location.path)) { + info.set_type(FileType::Directory); return info; } - // On flat namespace accounts there are no real directories. Directories are only - // implied by using `/` in the blob name. - Blobs::ListBlobsOptions list_blob_options; - // If listing the prefix `path.path_to_file` with trailing slash returns at least - // one result then `path` refers to an implied directory. - list_blob_options.Prefix = internal::EnsureTrailingSlash(location.path); - // We only need to know if there is at least one result, so minimise page size - // for efficiency. - list_blob_options.PageSizeHint = 1; - - try { - auto paged_list_result = - blob_service_client_->GetBlobContainerClient(location.container) - .ListBlobs(list_blob_options); - auto file_type = paged_list_result.Blobs.size() > 0 ? FileType::Directory - : FileType::NotFound; - info.set_type(file_type); + } + if (!list_response.Blobs.empty()) { + const auto& blob = list_response.Blobs[0]; + if (blob.Name == location.path) { + info.set_type(FileType::File); + info.set_size(blob.BlobSize); + info.set_mtime( + std::chrono::system_clock::time_point{blob.Details.LastModified}); return info; - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "ListBlobs failed for prefix='", *list_blob_options.Prefix, - "' failed. GetFileInfo is unable to determine whether the path should " - "be considered an implied directory."); } } + info.set_type(FileType::NotFound); + return info; + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + return FileInfo{location.all, FileType::NotFound}; + } return ExceptionToStatus( - exception, "GetProperties failed for '", file_client.GetUrl(), - "' GetFileInfo is unable to determine whether the path exists."); + exception, "ListBlobsByHierarchy failed for prefix='", *options.Prefix, + "'. GetFileInfo is unable to determine whether the path exists."); } } @@ -1314,9 +1323,8 @@ class AzureFileSystem::Impl { return PathNotFound(location); } if (hns_support == HNSSupport::kDisabled) { - ARROW_ASSIGN_OR_RAISE( - auto container_info, - GetContainerPropsAsFileInfo(location.container, container_client)); + ARROW_ASSIGN_OR_RAISE(auto container_info, + GetContainerPropsAsFileInfo(location, container_client)); if (container_info.type() == FileType::NotFound) { return PathNotFound(location); } @@ -1631,7 +1639,30 @@ bool AzureFileSystem::Equals(const FileSystem& other) const { Result AzureFileSystem::GetFileInfo(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->GetFileInfo(location); + if (location.container.empty()) { + DCHECK(location.path.empty()); + // Root directory of the storage account. + return FileInfo{"", FileType::Directory}; + } + if (location.path.empty()) { + // We have a container, but no path within the container. + // The container itself represents a directory. + auto container_client = impl_->GetBlobContainerClient(location.container); + return GetContainerPropsAsFileInfo(location, container_client); + } + // There is a path to search within the container. Check HNS support to proceed. + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return FileInfo{location.all, FileType::NotFound}; + } + if (hns_support == HNSSupport::kEnabled) { + return impl_->GetFileInfo(adlfs_client, location); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->GetFileInfo(container_client, location); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 410dbf1538cdc..5f69b2f55e111 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -473,6 +473,14 @@ class TestAzureFileSystem : public ::testing::Test { return blob_client; } + Blobs::Models::BlobProperties GetBlobProperties(const std::string& container_name, + const std::string& blob_name) { + return blob_service_client_->GetBlobContainerClient(container_name) + .GetBlobClient(blob_name) + .GetProperties() + .Value; + } + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); @@ -570,7 +578,39 @@ class TestAzureFileSystem : public ::testing::Test { void TestDetectHierarchicalNamespace(bool trip_up_azurite); void TestDetectHierarchicalNamespaceOnMissingContainer(); - void TestGetFileInfoObject(); + + void TestGetFileInfoOfRoot() { + AssertFileInfo(fs(), "", FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); + } + + void TestGetFileInfoOnExistingContainer() { + auto data = SetUpPreexistingData(); + AssertFileInfo(fs(), data.container_name, FileType::Directory); + AssertFileInfo(fs(), data.container_name + "/", FileType::Directory); + auto props = GetBlobProperties(data.container_name, data.kObjectName); + AssertFileInfo(fs(), data.ObjectPath(), FileType::File, + std::chrono::system_clock::time_point{props.LastModified}, + static_cast(props.BlobSize)); + AssertFileInfo(fs(), data.NotFoundObjectPath(), FileType::NotFound); + AssertFileInfo(fs(), data.ObjectPath() + "/", FileType::NotFound); + AssertFileInfo(fs(), data.NotFoundObjectPath() + "/", FileType::NotFound); + + // URIs + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); + ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.ObjectPath())); + } + + void TestGetFileInfoOnMissingContainer() { + auto data = SetUpPreexistingData(); + AssertFileInfo(fs(), "nonexistent", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent/object", FileType::NotFound); + AssertFileInfo(fs(), "nonexistent/object/", FileType::NotFound); + } + void TestGetFileInfoObjectWithNestedStructure(); void TestDeleteDirSuccessEmpty() { @@ -704,22 +744,6 @@ void TestAzureFileSystem::TestDetectHierarchicalNamespaceOnMissingContainer() { } } -void TestAzureFileSystem::TestGetFileInfoObject() { - auto data = SetUpPreexistingData(); - auto object_properties = - blob_service_client_->GetBlobContainerClient(data.container_name) - .GetBlobClient(data.kObjectName) - .GetProperties() - .Value; - - AssertFileInfo(fs(), data.ObjectPath(), FileType::File, - std::chrono::system_clock::time_point{object_properties.LastModified}, - static_cast(object_properties.BlobSize)); - - // URI - ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + std::string{data.kObjectName})); -} - void TestAzureFileSystem::TestGetFileInfoObjectWithNestedStructure() { auto data = SetUpPreexistingData(); // Adds detailed tests to handle cases of different edge cases @@ -855,6 +879,14 @@ TYPED_TEST(TestAzureFileSystemOnAllEnvs, DetectHierarchicalNamespaceOnMissingCon this->TestDetectHierarchicalNamespaceOnMissingContainer(); } +TYPED_TEST(TestAzureFileSystemOnAllEnvs, GetFileInfoOfRoot) { + this->TestGetFileInfoOfRoot(); +} + +TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirWithEmptyPath) { + ASSERT_RAISES(Invalid, this->fs()->CreateDir("", false)); +} + // Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) // combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and // known according to the environment. @@ -869,18 +901,22 @@ using AllScenarios = ::testing::Types< TYPED_TEST_SUITE(TestAzureFileSystemOnAllScenarios, AllScenarios); -TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObject) { - this->TestGetFileInfoObject(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoOnExistingContainer) { + this->TestGetFileInfoOnExistingContainer(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { - this->TestDeleteDirSuccessEmpty(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoOnMissingContainer) { + this->TestGetFileInfoOnMissingContainer(); } TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObjectWithNestedStructure) { this->TestGetFileInfoObjectWithNestedStructure(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { + this->TestDeleteDirSuccessEmpty(); +} + TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirSuccessContainerAndDirectory) { this->TestCreateDirSuccessContainerAndDirectory(); } @@ -951,23 +987,6 @@ TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { // Tests using Azurite (the local Azure emulator) -TEST_F(TestAzuriteFileSystem, GetFileInfoAccount) { - AssertFileInfo(fs(), "", FileType::Directory); - - // URI - ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://")); -} - -TEST_F(TestAzuriteFileSystem, GetFileInfoContainer) { - auto data = SetUpPreexistingData(); - AssertFileInfo(fs(), data.container_name, FileType::Directory); - - AssertFileInfo(fs(), "nonexistent-container", FileType::NotFound); - - // URI - ASSERT_RAISES(Invalid, fs()->GetFileInfo("abfs://" + data.container_name)); -} - TEST_F(TestAzuriteFileSystem, GetFileInfoSelector) { SetUpSmallFileSystemTree(); From edc803536c921932d2b340c84827140918ab9881 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 18:21:22 -0300 Subject: [PATCH 03/24] Implement CreateDir with upfront HNS checks and directory semantics on flat ns accounts --- cpp/src/arrow/filesystem/azurefs.cc | 223 ++++++++++++++--------- cpp/src/arrow/filesystem/azurefs_test.cc | 53 ++---- 2 files changed, 149 insertions(+), 127 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 5e72e2242c143..6fe8f5980620a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -903,6 +903,18 @@ Result GetContainerPropsAsFileInfo(const AzureLocation& location, } } +template +Status CreateContainerIfNotExists(const std::string& container_name, + ContainerClient& container_client) { + try { + container_client.CreateIfNotExists(); + return Status::OK(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to create a container: ", container_name, + ": ", container_client.GetUrl()); + } +} + FileInfo DirectoryFileInfoFromPath(std::string_view path) { return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, FileType::Directory}; } @@ -1082,6 +1094,25 @@ class AzureFileSystem::Impl { } } + /// \pref location.container is not empty. + template + Status CheckDirExists(ContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + FileInfo info; + if (location.path.empty()) { + ARROW_ASSIGN_OR_RAISE(info, + GetContainerPropsAsFileInfo(location, container_client)); + } else { + ARROW_ASSIGN_OR_RAISE(info, GetFileInfo(container_client, location)); + } + if (info.type() == FileType::NotFound) { + return PathNotFound(location); + } + DCHECK_EQ(info.type(), FileType::Directory); + return Status::OK(); + } + private: template Status VisitContainers(const Core::Context& context, OnContainer&& on_container) const { @@ -1297,96 +1328,80 @@ class AzureFileSystem::Impl { return ptr; } - Status CreateDir(const AzureLocation& location) { - if (location.container.empty()) { - return Status::Invalid("CreateDir requires a non-empty path."); - } - - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - if (location.path.empty()) { - try { - auto response = container_client.Create(); - return response.Value.Created - ? Status::OK() - : Status::AlreadyExists("Directory already exists: " + location.all); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to create a container: ", location.container, - ": ", container_client.GetUrl()); - } - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return PathNotFound(location); - } - if (hns_support == HNSSupport::kDisabled) { - ARROW_ASSIGN_OR_RAISE(auto container_info, - GetContainerPropsAsFileInfo(location, container_client)); - if (container_info.type() == FileType::NotFound) { - return PathNotFound(location); + /// This function cannot assume the filesystem/container already exists. + /// + /// \pre location.container is not empty. + /// \pre location.path is not empty. + template + Status CreateDirTemplate(ContainerClient& container_client, + GetDirectoryClient&& get_directory_client, + CreateDirIfNotExists&& create_if_not_exists, + const AzureLocation& location, bool recursive) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + // Non-recursive CreateDir calls require the parent directory to exist. + if (!recursive) { + auto parent = location.parent(); + if (!parent.path.empty()) { + RETURN_NOT_OK(CheckDirExists(container_client, parent)); } - // Without hierarchical namespace enabled Azure blob storage has no directories. - // Therefore we can't, and don't need to create one. Simply creating a blob with `/` - // in the name implies directories. - return Status::OK(); + // If the parent location is just the container, we don't need to check if it + // exists because the operation we perform below will fail if the container + // doesn't exist and we can handle that error according to the recursive flag. } - - auto directory_client = adlfs_client.GetDirectoryClient(location.path); + auto directory_client = get_directory_client(container_client, location); try { - auto response = directory_client.Create(); - if (response.Value.Created) { - return Status::OK(); - } else { - return StatusFromErrorResponse(directory_client.GetUrl(), *response.RawResponse, - "Failed to create a directory: " + location.path); - } - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to create a directory: ", location.path, - ": ", directory_client.GetUrl()); - } - } - - Status CreateDirRecursive(const AzureLocation& location) { - if (location.container.empty()) { - return Status::Invalid("CreateDir requires a non-empty path."); - } - - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - try { - container_client.CreateIfNotExists(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to create a container: ", location.container, " (", - container_client.GetUrl(), ")"); - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kDisabled) { - // Without hierarchical namespace enabled Azure blob storage has no directories. - // Therefore we can't, and don't need to create one. Simply creating a blob with `/` - // in the name implies directories. + create_if_not_exists(directory_client); return Status::OK(); - } - // Don't handle HNSSupport::kContainerNotFound, just assume it still exists (because - // it was created above) and try to create the directory. - - if (!location.path.empty()) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - directory_client.CreateIfNotExists(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to create a directory: ", location.path, " (", - directory_client.GetUrl(), ")"); + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + try { + if (recursive) { + container_client.CreateIfNotExists(); + create_if_not_exists(directory_client); + return Status::OK(); + } else { + auto parent = location.parent(); + return PathNotFound(parent); + } + } catch (const Storage::StorageException& second_exception) { + return ExceptionToStatus(second_exception); + } } + return ExceptionToStatus(exception); } + } - return Status::OK(); + public: + /// This function cannot assume the filesystem already exists. + /// + /// \pre location.container is not empty. + /// \pre location.path is not empty. + Status CreateDirOnFileSystem(DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location, bool recursive) { + return CreateDirTemplate( + adlfs_client, + [](auto& adlfs_client, auto& location) { + return adlfs_client.GetDirectoryClient(location.path); + }, + [](auto& directory_client) { directory_client.CreateIfNotExists(); }, location, + recursive); + } + + /// This function cannot assume the container already exists. + /// + /// \pre location.container is not empty. + /// \pre location.path is not empty. + Status CreateDirOnContainer(Blobs::BlobContainerClient& container_client, + const AzureLocation& location, bool recursive) { + return CreateDirTemplate( + container_client, + [](auto& container_client, auto& location) { + auto dir_marker_blob_path = internal::EnsureTrailingSlash(location.path); + return container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); + }, + [](auto& block_blob_client) { block_blob_client.UploadFrom(nullptr, 0); }, + location, recursive); } Result> OpenAppendStream( @@ -1676,11 +1691,45 @@ Result AzureFileSystem::GetFileInfo(const FileSelector& select) Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - if (recursive) { - return impl_->CreateDirRecursive(location); - } else { - return impl_->CreateDir(location); + if (location.container.empty()) { + return Status::Invalid("CreateDir requires a non-empty path."); + } + + auto container_client = impl_->GetBlobContainerClient(location.container); + if (location.path.empty()) { + // If the path is just the container, the parent (root) trivially exists, + // and the CreateDir operation comes down to just creating the container. + return CreateContainerIfNotExists(location.container, container_client); + } + + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + if (!recursive) { + auto parent = location.parent(); + return PathNotFound(parent); + } + RETURN_NOT_OK(CreateContainerIfNotExists(location.container, container_client)); + // Perform a second check for HNS support after creating the container. + ARROW_ASSIGN_OR_RAISE(hns_support, impl_->HierarchicalNamespaceSupport(adlfs_client)); + } + if (hns_support == HNSSupport::kContainerNotFound) { + // We only get kContainerNotFound if we are unable to read the properties of the + // container we just created. This is very unlikely, but theoretically possible in + // a concurrent system, so the error is handled to avoid infinite recursion. + return Status::IOError("Unable to read properties of a newly created container: ", + location.container, ": " + container_client.GetUrl()); + } + // CreateDirOnFileSystem and CreateDirOnContainer can handle the container + // not existing which is useful and necessary here since the only reason + // a container was created above was to check for HNS support when it wasn't + // cached yet. + if (hns_support == HNSSupport::kEnabled) { + return impl_->CreateDirOnFileSystem(adlfs_client, location, recursive); } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + return impl_->CreateDirOnContainer(container_client, location, recursive); } Status AzureFileSystem::DeleteDir(const std::string& path) { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 5f69b2f55e111..d1ae49aefb878 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -617,32 +617,20 @@ class TestAzureFileSystem : public ::testing::Test { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - if (WithHierarchicalNamespace()) { - ASSERT_OK(fs()->CreateDir(directory_path, true)); - AssertFileInfo(fs(), directory_path, FileType::Directory); - ASSERT_OK(fs()->DeleteDir(directory_path)); - AssertFileInfo(fs(), directory_path, FileType::NotFound); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() and DeleteDir() do nothing. - ASSERT_OK(fs()->CreateDir(directory_path)); - AssertFileInfo(fs(), directory_path, FileType::NotFound); - ASSERT_OK(fs()->DeleteDir(directory_path)); - AssertFileInfo(fs(), directory_path, FileType::NotFound); - } + AssertFileInfo(fs(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(directory_path, true)); + AssertFileInfo(fs(), directory_path, FileType::Directory); + // XXX: implement DeleteDir on flat namespace storage accounts + // ASSERT_OK(fs()->DeleteDir(directory_path)); + // AssertFileInfo(fs(), directory_path, FileType::NotFound); } void TestCreateDirSuccessContainerAndDirectory() { auto data = SetUpPreexistingData(); const auto path = data.RandomDirectoryPath(rng_); + AssertFileInfo(fs(), path, FileType::NotFound); ASSERT_OK(fs()->CreateDir(path, false)); - if (WithHierarchicalNamespace()) { - AssertFileInfo(fs(), path, FileType::Directory); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - AssertFileInfo(fs(), path, FileType::NotFound); - } + AssertFileInfo(fs(), path, FileType::Directory); } void TestCreateDirRecursiveSuccessContainerOnly() { @@ -656,15 +644,8 @@ class TestAzureFileSystem : public ::testing::Test { const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); ASSERT_OK(fs()->CreateDir(path, true)); - if (WithHierarchicalNamespace()) { - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - AssertFileInfo(fs(), path, FileType::NotFound); - AssertFileInfo(fs(), parent, FileType::NotFound); - } + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); } void TestCreateDirRecursiveSuccessContainerAndDirectory() { @@ -672,17 +653,9 @@ class TestAzureFileSystem : public ::testing::Test { const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); ASSERT_OK(fs()->CreateDir(path, true)); - if (WithHierarchicalNamespace()) { - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - AssertFileInfo(fs(), data.container_name, FileType::Directory); - } else { - // There is only virtual directory without hierarchical namespace - // support. So the CreateDir() does nothing. - AssertFileInfo(fs(), path, FileType::NotFound); - AssertFileInfo(fs(), parent, FileType::NotFound); - AssertFileInfo(fs(), data.container_name, FileType::Directory); - } + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); + AssertFileInfo(fs(), data.container_name, FileType::Directory); } void TestDeleteDirContentsSuccessNonexistent() { From e7a36e1c707bb6507b7f0e2224b5af234ff56143 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 20 Dec 2023 22:43:59 -0300 Subject: [PATCH 04/24] Add comprehensive tests to CreateDir and delete tiny tests that would be duplicates --- cpp/src/arrow/filesystem/azurefs_test.cc | 195 ++++++++++++++++------- 1 file changed, 137 insertions(+), 58 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index d1ae49aefb878..92e725ea87b3d 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -613,6 +613,135 @@ class TestAzureFileSystem : public ::testing::Test { void TestGetFileInfoObjectWithNestedStructure(); + void TestCreateDirOnRoot() { + auto dir1 = PreexistingData::RandomContainerName(rng_); + auto dir2 = PreexistingData::RandomContainerName(rng_); + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + + AssertFileInfo(fs(), dir2, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir2, true)); + AssertFileInfo(fs(), dir1, FileType::Directory); + + // Should not fail if the directory already exists. + ASSERT_OK(fs()->CreateDir(dir1, false)); + ASSERT_OK(fs()->CreateDir(dir1, true)); + AssertFileInfo(fs(), dir1, FileType::Directory); + } + + void TestCreateDirOnExistingContainer() { + auto data = SetUpPreexistingData(); + auto dir1 = data.RandomDirectoryPath(rng_); + auto dir2 = data.RandomDirectoryPath(rng_); + + AssertFileInfo(fs(), dir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir1, /*recursive=*/false)); + AssertFileInfo(fs(), dir1, FileType::Directory); + + AssertFileInfo(fs(), dir2, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(dir2, /*recursive=*/true)); + AssertFileInfo(fs(), dir2, FileType::Directory); + + auto subdir1 = ConcatAbstractPath(dir1, "subdir"); + auto subdir2 = ConcatAbstractPath(dir2, "subdir"); + AssertFileInfo(fs(), subdir1, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(subdir1, /*recursive=*/false)); + AssertFileInfo(fs(), subdir1, FileType::Directory); + AssertFileInfo(fs(), subdir2, FileType::NotFound); + ASSERT_OK(fs()->CreateDir(subdir2, /*recursive=*/true)); + AssertFileInfo(fs(), subdir2, FileType::Directory); + + auto dir3 = data.RandomDirectoryPath(rng_); + AssertFileInfo(fs(), dir3, FileType::NotFound); + auto subdir3 = ConcatAbstractPath(dir3, "subdir"); + AssertFileInfo(fs(), subdir3, FileType::NotFound); + // Creating subdir3 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + dir3 + "'"), + fs()->CreateDir(subdir3, /*recursive=*/false)); + AssertFileInfo(fs(), dir3, FileType::NotFound); + AssertFileInfo(fs(), subdir3, FileType::NotFound); + // Creating subdir3 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(subdir3, /*recursive=*/true)); + AssertFileInfo(fs(), dir3, FileType::Directory); + AssertFileInfo(fs(), subdir3, FileType::Directory); + + auto dir4 = data.RandomDirectoryPath(rng_); + auto subdir4 = ConcatAbstractPath(dir4, "subdir4"); + auto subdir5 = ConcatAbstractPath(dir4, "subdir4/subdir5"); + // Creating subdir4 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + dir4 + "'"), + fs()->CreateDir(subdir4, /*recursive=*/false)); + AssertFileInfo(fs(), dir4, FileType::NotFound); + AssertFileInfo(fs(), subdir4, FileType::NotFound); + // Creating subdir5 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + subdir4 + "'"), + fs()->CreateDir(subdir5, /*recursive=*/false)); + AssertFileInfo(fs(), dir4, FileType::NotFound); + AssertFileInfo(fs(), subdir4, FileType::NotFound); + AssertFileInfo(fs(), subdir5, FileType::NotFound); + // Creating subdir5 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(subdir5, /*recursive=*/true)); + AssertFileInfo(fs(), dir4, FileType::Directory); + AssertFileInfo(fs(), subdir4, FileType::Directory); + AssertFileInfo(fs(), subdir5, FileType::Directory); + } + + void TestCreateDirOnMissingContainer() { + auto container1 = PreexistingData::RandomContainerName(rng_); + auto container2 = PreexistingData::RandomContainerName(rng_); + AssertFileInfo(fs(), container1, FileType::NotFound); + AssertFileInfo(fs(), container2, FileType::NotFound); + + auto dir1 = ConcatAbstractPath(container1, "dir"); + AssertFileInfo(fs(), dir1, FileType::NotFound); + // Creating dir1 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + container1 + "'"), + fs()->CreateDir(dir1, /*recursive=*/false)); + AssertFileInfo(fs(), container1, FileType::NotFound); + AssertFileInfo(fs(), dir1, FileType::NotFound); + // Creating dir1 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(dir1, /*recursive=*/true)); + AssertFileInfo(fs(), container1, FileType::Directory); + AssertFileInfo(fs(), dir1, FileType::Directory); + + auto dir2 = ConcatAbstractPath(container2, "dir"); + auto subdir2 = ConcatAbstractPath(dir2, "subdir2"); + auto subdir3 = ConcatAbstractPath(dir2, "subdir2/subdir3"); + // Creating dir2 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + container2 + "'"), + fs()->CreateDir(dir2, /*recursive=*/false)); + AssertFileInfo(fs(), container2, FileType::NotFound); + AssertFileInfo(fs(), dir2, FileType::NotFound); + // Creating subdir2 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + dir2 + "'"), + fs()->CreateDir(subdir2, /*recursive=*/false)); + AssertFileInfo(fs(), container2, FileType::NotFound); + AssertFileInfo(fs(), dir2, FileType::NotFound); + AssertFileInfo(fs(), subdir2, FileType::NotFound); + // Creating subdir3 with recursive=false should fail. + EXPECT_RAISES_WITH_MESSAGE_THAT( + IOError, ::testing::HasSubstr("Path does not exist '" + subdir2 + "'"), + fs()->CreateDir(subdir3, /*recursive=*/false)); + AssertFileInfo(fs(), container2, FileType::NotFound); + AssertFileInfo(fs(), dir2, FileType::NotFound); + AssertFileInfo(fs(), subdir2, FileType::NotFound); + AssertFileInfo(fs(), subdir3, FileType::NotFound); + // Creating subdir3 with recursive=true should work. + ASSERT_OK(fs()->CreateDir(subdir3, /*recursive=*/true)); + AssertFileInfo(fs(), container2, FileType::Directory); + AssertFileInfo(fs(), dir2, FileType::Directory); + AssertFileInfo(fs(), subdir2, FileType::Directory); + AssertFileInfo(fs(), subdir3, FileType::Directory); + } + void TestDeleteDirSuccessEmpty() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); @@ -625,39 +754,6 @@ class TestAzureFileSystem : public ::testing::Test { // AssertFileInfo(fs(), directory_path, FileType::NotFound); } - void TestCreateDirSuccessContainerAndDirectory() { - auto data = SetUpPreexistingData(); - const auto path = data.RandomDirectoryPath(rng_); - AssertFileInfo(fs(), path, FileType::NotFound); - ASSERT_OK(fs()->CreateDir(path, false)); - AssertFileInfo(fs(), path, FileType::Directory); - } - - void TestCreateDirRecursiveSuccessContainerOnly() { - auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs()->CreateDir(container_name, true)); - AssertFileInfo(fs(), container_name, FileType::Directory); - } - - void TestCreateDirRecursiveSuccessDirectoryOnly() { - auto data = SetUpPreexistingData(); - const auto parent = data.RandomDirectoryPath(rng_); - const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs()->CreateDir(path, true)); - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - } - - void TestCreateDirRecursiveSuccessContainerAndDirectory() { - auto data = SetUpPreexistingData(); - const auto parent = data.RandomDirectoryPath(rng_); - const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs()->CreateDir(path, true)); - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - AssertFileInfo(fs(), data.container_name, FileType::Directory); - } - void TestDeleteDirContentsSuccessNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); @@ -860,6 +956,8 @@ TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirWithEmptyPath) { ASSERT_RAISES(Invalid, this->fs()->CreateDir("", false)); } +TYPED_TEST(TestAzureFileSystemOnAllEnvs, CreateDirOnRoot) { this->TestCreateDirOnRoot(); } + // Tests using all the 3 environments (Azurite, Azure w/o HNS (flat), Azure w/ HNS) // combined with the two scenarios for AzureFileSystem::cached_hns_support_ -- unknown and // known according to the environment. @@ -886,25 +984,16 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, GetFileInfoObjectWithNestedStructu this->TestGetFileInfoObjectWithNestedStructure(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { - this->TestDeleteDirSuccessEmpty(); -} - -TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirSuccessContainerAndDirectory) { - this->TestCreateDirSuccessContainerAndDirectory(); -} - -TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessContainerOnly) { - this->TestCreateDirRecursiveSuccessContainerOnly(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnExistingContainer) { + this->TestCreateDirOnExistingContainer(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirRecursiveSuccessDirectoryOnly) { - this->TestCreateDirRecursiveSuccessDirectoryOnly(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateDirOnMissingContainer) { + this->TestCreateDirOnMissingContainer(); } -TYPED_TEST(TestAzureFileSystemOnAllScenarios, - CreateDirRecursiveSuccessContainerAndDirectory) { - this->TestCreateDirRecursiveSuccessContainerAndDirectory(); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { + this->TestDeleteDirSuccessEmpty(); } // Tests using a real storage account *with Hierarchical Namespace enabled* @@ -1133,16 +1222,6 @@ TEST_F(TestAzuriteFileSystem, GetFileInfoSelectorExplicitImplicitDirDedup) { AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); } -TEST_F(TestAzuriteFileSystem, CreateDirFailureNoContainer) { - ASSERT_RAISES(Invalid, fs()->CreateDir("", false)); -} - -TEST_F(TestAzuriteFileSystem, CreateDirSuccessContainerOnly) { - auto container_name = PreexistingData::RandomContainerName(rng_); - ASSERT_OK(fs()->CreateDir(container_name, false)); - AssertFileInfo(fs(), container_name, FileType::Directory); -} - TEST_F(TestAzuriteFileSystem, CreateDirFailureDirectoryWithMissingContainer) { const auto path = std::string("not-a-container/new-directory"); ASSERT_RAISES(IOError, fs()->CreateDir(path, false)); From 045eb4d3909416c76398a01c25c2d71ea8b8fbf8 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 15:25:32 -0300 Subject: [PATCH 05/24] Add HasSubmitBatchBug() to TestAzureFileSystem --- cpp/src/arrow/filesystem/azurefs_test.cc | 38 ++++++++++++++++-------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 92e725ea87b3d..02d9b15694184 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -574,6 +574,23 @@ class TestAzureFileSystem : public ::testing::Test { return env->WithHierarchicalNamespace(); } + constexpr static const char* const kSubmitBatchBugMessage = + "This test is affected by an Azurite isse: " + "https://github.com/Azure/Azurite/pull/2302"; + + /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. + /// SubmitBatch is used by: + /// - AzureFileSystem::DeleteDir + /// - AzureFileSystem::DeleteDirContents + bool HasSubmitBatchBug() const { + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); +#ifdef __APPLE__ + return env->backend() == AzureBackend::kAzurite; +#else + return false; +#endif + } + // Tests that are called from more than one implementation of TestAzureFileSystem void TestDetectHierarchicalNamespace(bool trip_up_azurite); @@ -1255,10 +1272,9 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessHaveBlobs) { -#ifdef __APPLE__ - GTEST_SKIP() << "This test fails by an Azurite problem: " - "https://github.com/Azure/Azurite/pull/2302"; -#endif + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); // We must use 257 or more blobs here to test pagination of ListBlobs(). @@ -1284,10 +1300,9 @@ TEST_F(TestAzuriteFileSystem, DeleteDirUri) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { -#ifdef __APPLE__ - GTEST_SKIP() << "This test fails by an Azurite problem: " - "https://github.com/Azure/Azurite/pull/2302"; -#endif + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); @@ -1300,10 +1315,9 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { -#ifdef __APPLE__ - GTEST_SKIP() << "This test fails by an Azurite problem: " - "https://github.com/Azure/Azurite/pull/2302"; -#endif + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); From 6e039849beba2bd398bca5f0ff3a1a2b71cf7a0b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 19:51:07 -0300 Subject: [PATCH 06/24] Move all the HNS-only tests to all scenarios --- cpp/src/arrow/filesystem/azurefs_test.cc | 90 ++++++++++++++---------- 1 file changed, 52 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 02d9b15694184..f5029793da333 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -771,6 +771,47 @@ class TestAzureFileSystem : public ::testing::Test { // AssertFileInfo(fs(), directory_path, FileType::NotFound); } + void TestDeleteDirFailureNonexistent() { + auto data = SetUpPreexistingData(); + const auto path = data.RandomDirectoryPath(rng_); + ASSERT_RAISES(IOError, fs()->DeleteDir(path)); + } + + void TestDeleteDirSuccessHaveBlob() { + auto data = SetUpPreexistingData(); + const auto directory_path = data.RandomDirectoryPath(rng_); + const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); + ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); + ASSERT_OK(output->Write("hello")); + ASSERT_OK(output->Close()); + AssertFileInfo(fs(), blob_path, FileType::File); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), blob_path, FileType::NotFound); + } + + void TestDeleteDirSuccessHaveDirectory() { + auto data = SetUpPreexistingData(); + const auto parent = data.RandomDirectoryPath(rng_); + const auto path = ConcatAbstractPath(parent, "new-sub"); + ASSERT_OK(fs()->CreateDir(path, true)); + AssertFileInfo(fs(), path, FileType::Directory); + AssertFileInfo(fs(), parent, FileType::Directory); + ASSERT_OK(fs()->DeleteDir(parent)); + AssertFileInfo(fs(), path, FileType::NotFound); + AssertFileInfo(fs(), parent, FileType::NotFound); + } + + void TestDeleteDirContentsSuccessExist() { + auto preexisting_data = SetUpPreexistingData(); + HierarchicalPaths paths; + CreateHierarchicalData(&paths); + ASSERT_OK(fs()->DeleteDirContents(paths.directory)); + AssertFileInfo(fs(), paths.directory, FileType::Directory); + for (const auto& sub_path : paths.sub_paths) { + AssertFileInfo(fs(), sub_path, FileType::NotFound); + } + } + void TestDeleteDirContentsSuccessNonexistent() { auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); @@ -1013,54 +1054,27 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessEmpty) { this->TestDeleteDirSuccessEmpty(); } -// Tests using a real storage account *with Hierarchical Namespace enabled* +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirFailureNonexistent) { + this->TestDeleteDirFailureNonexistent(); +} -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirFailureNonexistent) { - auto data = SetUpPreexistingData(); - const auto path = data.RandomDirectoryPath(rng_); - ASSERT_RAISES(IOError, fs()->DeleteDir(path)); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveBlob) { + this->TestDeleteDirSuccessHaveBlob(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveBlob) { - auto data = SetUpPreexistingData(); - const auto directory_path = data.RandomDirectoryPath(rng_); - const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); - ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(blob_path)); - ASSERT_OK(output->Write(std::string_view("hello"))); - ASSERT_OK(output->Close()); - AssertFileInfo(fs(), blob_path, FileType::File); - ASSERT_OK(fs()->DeleteDir(directory_path)); - AssertFileInfo(fs(), blob_path, FileType::NotFound); +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirSuccessHaveDirectory) { + this->TestDeleteDirSuccessHaveDirectory(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirSuccessHaveDirectory) { - auto data = SetUpPreexistingData(); - const auto parent = data.RandomDirectoryPath(rng_); - const auto path = ConcatAbstractPath(parent, "new-sub"); - ASSERT_OK(fs()->CreateDir(path, true)); - AssertFileInfo(fs(), path, FileType::Directory); - AssertFileInfo(fs(), parent, FileType::Directory); - ASSERT_OK(fs()->DeleteDir(parent)); - AssertFileInfo(fs(), path, FileType::NotFound); - AssertFileInfo(fs(), parent, FileType::NotFound); -} - -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessExist) { - auto preexisting_data = SetUpPreexistingData(); - HierarchicalPaths paths; - CreateHierarchicalData(&paths); - ASSERT_OK(fs()->DeleteDirContents(paths.directory)); - AssertFileInfo(fs(), paths.directory, FileType::Directory); - for (const auto& sub_path : paths.sub_paths) { - AssertFileInfo(fs(), sub_path, FileType::NotFound); - } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessExist) { + this->TestDeleteDirContentsSuccessExist(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsSuccessNonexistent) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsSuccessNonexistent) { this->TestDeleteDirContentsSuccessNonexistent(); } -TEST_F(TestAzureHierarchicalNSFileSystem, DeleteDirContentsFailureNonexistent) { +TYPED_TEST(TestAzureFileSystemOnAllScenarios, DeleteDirContentsFailureNonexistent) { this->TestDeleteDirContentsFailureNonexistent(); } From 715781a71f36ffce7dbb07145ffb70aef681429f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 20:00:28 -0300 Subject: [PATCH 07/24] Fix DeleteDir bugs on Flat namespace storage accounts --- cpp/src/arrow/filesystem/azurefs.cc | 91 ++++++++++++++++++------ cpp/src/arrow/filesystem/azurefs_test.cc | 23 +++++- 2 files changed, 91 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6fe8f5980620a..8526dec3c218c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1428,10 +1428,32 @@ class AzureFileSystem::Impl { } private: - Status DeleteDirContentsWithoutHierarchicalNamespace(const AzureLocation& location, - bool missing_dir_ok) { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); + /// This function assumes the container already exists. So it can only be + /// called after that has been verified. + /// + /// \pre location.container is not empty. + /// \pre The location.container container already exists. + Status EnsureEmptyDirExists(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + if (location.path.empty()) { + // Nothing to do. The container already exists per the preconditions. + return Status::OK(); + } + auto dir_marker_blob_path = internal::EnsureTrailingSlash(location.path); + auto block_blob_client = + container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); + try { + block_blob_client.UploadFrom(nullptr, 0); + return Status::OK(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception); + } + } + + std::pair DeleteDirContentsWithoutHierarchicalNamespace( + const Blobs::BlobContainerClient& container_client, const AzureLocation& location, + bool missing_dir_ok) { Blobs::ListBlobsOptions options; if (!location.path.empty()) { options.Prefix = internal::EnsureTrailingSlash(location.path); @@ -1442,10 +1464,11 @@ class AzureFileSystem::Impl { // size of the body for a batch request can't exceed 4 MB. const int32_t kNumMaxRequestsInBatch = 256; options.PageSizeHint = kNumMaxRequestsInBatch; + int num_potentially_deleted_blobs = 0; try { auto list_response = container_client.ListBlobs(options); if (!missing_dir_ok && list_response.Blobs.empty()) { - return PathNotFound(location); + return {num_potentially_deleted_blobs, PathNotFound(location)}; } for (; list_response.HasPage(); list_response.MoveToNextPage()) { if (list_response.Blobs.empty()) { @@ -1455,13 +1478,15 @@ class AzureFileSystem::Impl { std::vector> deferred_responses; for (const auto& blob_item : list_response.Blobs) { + num_potentially_deleted_blobs += 1; deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); } try { container_client.SubmitBatch(batch); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", - location.path, ": ", container_client.GetUrl()); + return {num_potentially_deleted_blobs, + ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", + location.path, ": ", container_client.GetUrl())}; } std::vector failed_blob_names; for (size_t i = 0; i < deferred_responses.size(); ++i) { @@ -1469,7 +1494,12 @@ class AzureFileSystem::Impl { bool success = true; try { auto delete_result = deferred_response.GetResponse(); - success = delete_result.Value.Deleted; + if (!delete_result.Value.Deleted) { + // We know for sure the blob was not deleted, so + // we can decrement num_potentially_deleted_blobs. + num_potentially_deleted_blobs -= 1; + success = false; + } } catch (const Storage::StorageException& exception) { success = false; } @@ -1480,21 +1510,23 @@ class AzureFileSystem::Impl { } if (!failed_blob_names.empty()) { if (failed_blob_names.size() == 1) { - return Status::IOError("Failed to delete a blob: ", failed_blob_names[0], - ": " + container_client.GetUrl()); + return {num_potentially_deleted_blobs, + Status::IOError("Failed to delete a blob: ", failed_blob_names[0], + ": " + container_client.GetUrl())}; } else { - return Status::IOError("Failed to delete blobs: [", - arrow::internal::JoinStrings(failed_blob_names, ", "), - "]: " + container_client.GetUrl()); + return {num_potentially_deleted_blobs, + Status::IOError("Failed to delete blobs: [", + arrow::internal::JoinStrings(failed_blob_names, ", "), + "]: " + container_client.GetUrl())}; } } } } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to list blobs in a directory: ", location.path, - ": ", container_client.GetUrl()); + return {num_potentially_deleted_blobs, + ExceptionToStatus(exception, "Failed to list blobs in a directory: ", + location.path, ": ", container_client.GetUrl())}; } - return Status::OK(); + return {num_potentially_deleted_blobs, Status::OK()}; } public: @@ -1545,8 +1577,13 @@ class AzureFileSystem::Impl { directory_client.GetUrl()); } } else { - return DeleteDirContentsWithoutHierarchicalNamespace(location, - /*missing_dir_ok=*/true); + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + Status status; + std::tie(std::ignore, status) = + DeleteDirContentsWithoutHierarchicalNamespace(container_client, location, + /*missing_dir_ok=*/false); + return status; } } @@ -1599,7 +1636,21 @@ class AzureFileSystem::Impl { } return Status::OK(); } else { - return DeleteDirContentsWithoutHierarchicalNamespace(location, missing_dir_ok); + auto container_client = + blob_service_client_->GetBlobContainerClient(location.container); + auto [num_potentially_deleted_blobs, status] = + DeleteDirContentsWithoutHierarchicalNamespace(container_client, location, + missing_dir_ok); + if (num_potentially_deleted_blobs > 0) { + // If we potentially deleted some blobs, we need to ensure the empty directory + // marker blob exists. Otherwise, we don't need to do anything because the + // directory marker blob may not exist from the beginning and should remain that + // way. EnsureEmptyDirExists requires the container to exist as a pre-condition + // and having num_potentially_deleted_blobs > 0 implies the container exists. + auto second_status = EnsureEmptyDirExists(container_client, location); + return status.ok() ? second_status : status; + } + return status; } } diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index f5029793da333..49b6a9e3859fd 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -760,15 +760,17 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessEmpty() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); AssertFileInfo(fs(), directory_path, FileType::NotFound); ASSERT_OK(fs()->CreateDir(directory_path, true)); AssertFileInfo(fs(), directory_path, FileType::Directory); - // XXX: implement DeleteDir on flat namespace storage accounts - // ASSERT_OK(fs()->DeleteDir(directory_path)); - // AssertFileInfo(fs(), directory_path, FileType::NotFound); + ASSERT_OK(fs()->DeleteDir(directory_path)); + AssertFileInfo(fs(), directory_path, FileType::NotFound); } void TestDeleteDirFailureNonexistent() { @@ -778,6 +780,9 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessHaveBlob() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); const auto blob_path = ConcatAbstractPath(directory_path, "hello.txt"); @@ -790,6 +795,9 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirSuccessHaveDirectory() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto parent = data.RandomDirectoryPath(rng_); const auto path = ConcatAbstractPath(parent, "new-sub"); @@ -802,6 +810,9 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessExist() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto preexisting_data = SetUpPreexistingData(); HierarchicalPaths paths; CreateHierarchicalData(&paths); @@ -813,6 +824,9 @@ class TestAzureFileSystem : public ::testing::Test { } void TestDeleteDirContentsSuccessNonexistent() { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); ASSERT_OK(fs()->DeleteDirContents(directory_path, true)); @@ -1277,6 +1291,9 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessContainer) { } TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { + if (HasSubmitBatchBug()) { + GTEST_SKIP() << kSubmitBatchBugMessage; + } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); // There is only virtual directory without hierarchical namespace From e9ea375fab8184cdef4c124ec7933f8248e31029 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 21 Dec 2023 00:00:19 -0300 Subject: [PATCH 08/24] Split HNS and non-HNS code for the implementation of DeleteDir --- cpp/src/arrow/filesystem/azurefs.cc | 138 +++++++++++++++------------- 1 file changed, 75 insertions(+), 63 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 8526dec3c218c..bb36ff6bd8752 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1094,6 +1094,7 @@ class AzureFileSystem::Impl { } } + private: /// \pref location.container is not empty. template Status CheckDirExists(ContainerClient& container_client, @@ -1113,7 +1114,6 @@ class AzureFileSystem::Impl { return Status::OK(); } - private: template Status VisitContainers(const Core::Context& context, OnContainer&& on_container) const { Blobs::ListBlobContainersOptions options; @@ -1328,6 +1328,7 @@ class AzureFileSystem::Impl { return ptr; } + private: /// This function cannot assume the filesystem/container already exists. /// /// \pre location.container is not empty. @@ -1427,7 +1428,6 @@ class AzureFileSystem::Impl { return stream; } - private: /// This function assumes the container already exists. So it can only be /// called after that has been verified. /// @@ -1451,9 +1451,36 @@ class AzureFileSystem::Impl { } } - std::pair DeleteDirContentsWithoutHierarchicalNamespace( + /// \pre location.container is not empty. + /// \pre location.path is empty. + Status DeleteContainer(Blobs::BlobContainerClient& container_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + DCHECK(location.path.empty()); + try { + auto response = container_client.Delete(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse( + container_client.GetUrl(), *response.RawResponse, + "Failed to delete a container: " + location.container); + } + } catch (const Storage::StorageException& exception) { + if (IsContainerNotFound(exception)) { + return PathNotFound(location); + } + return ExceptionToStatus(exception, + "Failed to delete a container: ", location.container, ": ", + container_client.GetUrl()); + } + } + + /// \pre location.container is not empty. + std::pair DeleteDirContentsOnContainer( const Blobs::BlobContainerClient& container_client, const AzureLocation& location, bool missing_dir_ok) { + DCHECK(!location.container.empty()); Blobs::ListBlobsOptions options; if (!location.path.empty()) { options.Prefix = internal::EnsureTrailingSlash(location.path); @@ -1521,69 +1548,33 @@ class AzureFileSystem::Impl { } } } + return {num_potentially_deleted_blobs, Status::OK()}; } catch (const Storage::StorageException& exception) { return {num_potentially_deleted_blobs, ExceptionToStatus(exception, "Failed to list blobs in a directory: ", location.path, ": ", container_client.GetUrl())}; } - return {num_potentially_deleted_blobs, Status::OK()}; } - public: - Status DeleteDir(const AzureLocation& location) { - if (location.container.empty()) { - return Status::Invalid("DeleteDir requires a non-empty path."); - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return PathNotFound(location); - } - - if (location.path.empty()) { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - try { - auto response = container_client.Delete(); - if (response.Value.Deleted) { - return Status::OK(); - } else { - return StatusFromErrorResponse( - container_client.GetUrl(), *response.RawResponse, - "Failed to delete a container: " + location.container); - } - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to delete a container: ", location.container, - ": ", container_client.GetUrl()); - } - } - - if (hns_support == HNSSupport::kEnabled) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - auto response = directory_client.DeleteRecursive(); - if (response.Value.Deleted) { - return Status::OK(); - } else { - return StatusFromErrorResponse( - directory_client.GetUrl(), *response.RawResponse, - "Failed to delete a directory: " + location.path); - } - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, - "Failed to delete a directory: ", location.path, ": ", - directory_client.GetUrl()); + /// \pre location.container is not empty. + /// \pre location.path is not empty. + Status DeleteDirOnFileSystem(DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location) { + DCHECK(!location.container.empty()); + DCHECK(!location.path.empty()); + auto directory_client = adlfs_client.GetDirectoryClient(location.path); + // XXX: should "directory not found" be considered an error? + try { + auto response = directory_client.DeleteRecursive(); + if (response.Value.Deleted) { + return Status::OK(); + } else { + return StatusFromErrorResponse(directory_client.GetUrl(), *response.RawResponse, + "Failed to delete a directory: " + location.path); } - } else { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); - Status status; - std::tie(std::ignore, status) = - DeleteDirContentsWithoutHierarchicalNamespace(container_client, location, - /*missing_dir_ok=*/false); - return status; + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus(exception, "Failed to delete a directory: ", location.path, + ": ", directory_client.GetUrl()); } } @@ -1636,11 +1627,9 @@ class AzureFileSystem::Impl { } return Status::OK(); } else { - auto container_client = - blob_service_client_->GetBlobContainerClient(location.container); + auto container_client = GetBlobContainerClient(location.container); auto [num_potentially_deleted_blobs, status] = - DeleteDirContentsWithoutHierarchicalNamespace(container_client, location, - missing_dir_ok); + DeleteDirContentsOnContainer(container_client, location, missing_dir_ok); if (num_potentially_deleted_blobs > 0) { // If we potentially deleted some blobs, we need to ensure the empty directory // marker blob exists. Otherwise, we don't need to do anything because the @@ -1785,7 +1774,30 @@ Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { Status AzureFileSystem::DeleteDir(const std::string& path) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->DeleteDir(location); + if (location.container.empty()) { + return Status::Invalid("DeleteDir requires a non-empty path."); + } + if (location.path.empty()) { + auto container_client = impl_->GetBlobContainerClient(location.container); + return impl_->DeleteContainer(container_client, location); + } + + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return PathNotFound(location); + } + if (hns_support == HNSSupport::kEnabled) { + return impl_->DeleteDirOnFileSystem(adlfs_client, location); + } + DCHECK_EQ(hns_support, HNSSupport::kDisabled); + auto container_client = impl_->GetBlobContainerClient(location.container); + Status status; + std::tie(std::ignore, status) = + impl_->DeleteDirContentsOnContainer(container_client, location, + /*missing_dir_ok=*/false); + return status; } Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { From 2748798f387888b5ad8fd8acf6d6625f1f77fbb2 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 22 Dec 2023 16:04:40 -0300 Subject: [PATCH 09/24] Extract DeleteDirContentsOnFileSystem from DeleteDirContents --- cpp/src/arrow/filesystem/azurefs.cc | 116 ++++++++++++++-------------- 1 file changed, 59 insertions(+), 57 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index bb36ff6bd8752..9ae1157a7dbeb 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1578,68 +1578,44 @@ class AzureFileSystem::Impl { } } - Status DeleteDirContents(const AzureLocation& location, bool missing_dir_ok) { - if (location.container.empty()) { - return internal::InvalidDeleteDirContents(location.all); - } - - auto adlfs_client = datalake_service_client_->GetFileSystemClient(location.container); - ARROW_ASSIGN_OR_RAISE(auto hns_support, HierarchicalNamespaceSupport(adlfs_client)); - if (hns_support == HNSSupport::kContainerNotFound) { - return missing_dir_ok ? Status::OK() : PathNotFound(location); - } - - if (hns_support == HNSSupport::kEnabled) { - auto directory_client = adlfs_client.GetDirectoryClient(location.path); - try { - auto list_response = directory_client.ListPaths(false); - for (; list_response.HasPage(); list_response.MoveToNextPage()) { - for (const auto& path : list_response.Paths) { - if (path.IsDirectory) { - auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); - try { - sub_directory_client.DeleteRecursive(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "Failed to delete a sub directory: ", location.container, - kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); - } - } else { - auto sub_file_client = adlfs_client.GetFileClient(path.Name); - try { - sub_file_client.Delete(); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus( - exception, "Failed to delete a sub file: ", location.container, - kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); - } + /// \pre location.container is not empty. + Status DeleteDirContentsOnFileSystem( + const DataLake::DataLakeFileSystemClient& adlfs_client, + const AzureLocation& location, bool missing_dir_ok) { + auto directory_client = adlfs_client.GetDirectoryClient(location.path); + try { + auto list_response = directory_client.ListPaths(false); + for (; list_response.HasPage(); list_response.MoveToNextPage()) { + for (const auto& path : list_response.Paths) { + if (path.IsDirectory) { + auto sub_directory_client = adlfs_client.GetDirectoryClient(path.Name); + try { + sub_directory_client.DeleteRecursive(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, "Failed to delete a sub directory: ", location.container, + kDelimiter, path.Name, ": ", sub_directory_client.GetUrl()); + } + } else { + auto sub_file_client = adlfs_client.GetFileClient(path.Name); + try { + sub_file_client.Delete(); + } catch (const Storage::StorageException& exception) { + return ExceptionToStatus( + exception, "Failed to delete a sub file: ", location.container, + kDelimiter, path.Name, ": ", sub_file_client.GetUrl()); } } } - } catch (const Storage::StorageException& exception) { - if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { - return Status::OK(); - } else { - return ExceptionToStatus(exception, - "Failed to delete directory contents: ", location.path, - ": ", directory_client.GetUrl()); - } } return Status::OK(); - } else { - auto container_client = GetBlobContainerClient(location.container); - auto [num_potentially_deleted_blobs, status] = - DeleteDirContentsOnContainer(container_client, location, missing_dir_ok); - if (num_potentially_deleted_blobs > 0) { - // If we potentially deleted some blobs, we need to ensure the empty directory - // marker blob exists. Otherwise, we don't need to do anything because the - // directory marker blob may not exist from the beginning and should remain that - // way. EnsureEmptyDirExists requires the container to exist as a pre-condition - // and having num_potentially_deleted_blobs > 0 implies the container exists. - auto second_status = EnsureEmptyDirExists(container_client, location); - return status.ok() ? second_status : status; + } catch (const Storage::StorageException& exception) { + if (missing_dir_ok && exception.StatusCode == Http::HttpStatusCode::NotFound) { + return Status::OK(); } - return status; + return ExceptionToStatus(exception, + "Failed to delete directory contents: ", location.path, + ": ", directory_client.GetUrl()); } } @@ -1802,7 +1778,33 @@ Status AzureFileSystem::DeleteDir(const std::string& path) { Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { ARROW_ASSIGN_OR_RAISE(auto location, AzureLocation::FromString(path)); - return impl_->DeleteDirContents(location, missing_dir_ok); + if (location.container.empty()) { + return internal::InvalidDeleteDirContents(location.all); + } + + auto adlfs_client = impl_->GetFileSystemClient(location.container); + ARROW_ASSIGN_OR_RAISE(auto hns_support, + impl_->HierarchicalNamespaceSupport(adlfs_client)); + if (hns_support == HNSSupport::kContainerNotFound) { + return missing_dir_ok ? Status::OK() : PathNotFound(location); + } + + if (hns_support == HNSSupport::kEnabled) { + return impl_->DeleteDirContentsOnFileSystem(adlfs_client, location, missing_dir_ok); + } + auto container_client = impl_->GetBlobContainerClient(location.container); + auto [num_potentially_deleted_blobs, status] = + impl_->DeleteDirContentsOnContainer(container_client, location, missing_dir_ok); + if (num_potentially_deleted_blobs > 0) { + // If we potentially deleted some blobs, we need to ensure the empty directory + // marker blob exists. Otherwise, we don't need to do anything because the + // directory marker blob may not exist from the beginning and should remain that + // way. EnsureEmptyDirExists requires the container to exist as a pre-condition + // and having num_potentially_deleted_blobs > 0 implies the container exists. + auto second_status = impl_->EnsureEmptyDirExists(container_client, location); + return status.ok() ? second_status : status; + } + return status; } Status AzureFileSystem::DeleteRootDirContents() { From e9a1705e9f70e4c353e9094b1592afb1687547db Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 23 Dec 2023 16:03:39 -0300 Subject: [PATCH 10/24] Flip name and meaning of boolean passed to DeleteDirContentsOnContainer This will fit better together with another boolean I'm adding to DeleteDirContentsOnContainer. --- cpp/src/arrow/filesystem/azurefs.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9ae1157a7dbeb..1cdfb9f716527 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1479,7 +1479,7 @@ class AzureFileSystem::Impl { /// \pre location.container is not empty. std::pair DeleteDirContentsOnContainer( const Blobs::BlobContainerClient& container_client, const AzureLocation& location, - bool missing_dir_ok) { + bool require_dir_to_exist) { DCHECK(!location.container.empty()); Blobs::ListBlobsOptions options; if (!location.path.empty()) { @@ -1494,7 +1494,7 @@ class AzureFileSystem::Impl { int num_potentially_deleted_blobs = 0; try { auto list_response = container_client.ListBlobs(options); - if (!missing_dir_ok && list_response.Blobs.empty()) { + if (require_dir_to_exist && list_response.Blobs.empty()) { return {num_potentially_deleted_blobs, PathNotFound(location)}; } for (; list_response.HasPage(); list_response.MoveToNextPage()) { @@ -1772,7 +1772,7 @@ Status AzureFileSystem::DeleteDir(const std::string& path) { Status status; std::tie(std::ignore, status) = impl_->DeleteDirContentsOnContainer(container_client, location, - /*missing_dir_ok=*/false); + /*require_dir_to_exist=*/true); return status; } @@ -1793,8 +1793,8 @@ Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_ return impl_->DeleteDirContentsOnFileSystem(adlfs_client, location, missing_dir_ok); } auto container_client = impl_->GetBlobContainerClient(location.container); - auto [num_potentially_deleted_blobs, status] = - impl_->DeleteDirContentsOnContainer(container_client, location, missing_dir_ok); + auto [num_potentially_deleted_blobs, status] = impl_->DeleteDirContentsOnContainer( + container_client, location, /*require_dir_to_exist=*/!missing_dir_ok); if (num_potentially_deleted_blobs > 0) { // If we potentially deleted some blobs, we need to ensure the empty directory // marker blob exists. Otherwise, we don't need to do anything because the From ca28c0f6b702ba2563fdb48619f40ddaf99e7bc2 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 23 Dec 2023 17:33:08 -0300 Subject: [PATCH 11/24] Add a local type alias for convenience --- cpp/src/arrow/filesystem/azurefs.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 1cdfb9f716527..eb85dfa17def5 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1480,6 +1480,7 @@ class AzureFileSystem::Impl { std::pair DeleteDirContentsOnContainer( const Blobs::BlobContainerClient& container_client, const AzureLocation& location, bool require_dir_to_exist) { + using DeleteBlobResponse = Storage::DeferredResponse; DCHECK(!location.container.empty()); Blobs::ListBlobsOptions options; if (!location.path.empty()) { @@ -1502,8 +1503,7 @@ class AzureFileSystem::Impl { continue; } auto batch = container_client.CreateBatch(); - std::vector> - deferred_responses; + std::vector deferred_responses; for (const auto& blob_item : list_response.Blobs) { num_potentially_deleted_blobs += 1; deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); From ee12b7c7ee1fe3acc8cb5188a7890c4fbf6b0fd4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 23 Dec 2023 21:56:52 -0300 Subject: [PATCH 12/24] Simplify the use of DeleteDirContentsOnContainer by adding a bool param --- cpp/src/arrow/filesystem/azurefs.cc | 115 ++++++++++++++++------------ 1 file changed, 65 insertions(+), 50 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index eb85dfa17def5..7737a6ce603ef 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1476,10 +1476,21 @@ class AzureFileSystem::Impl { } } + /// Deletes and contents of a directory and possibly the directory itself + /// depending on the value of preserve_dir_marker_blob. + /// /// \pre location.container is not empty. - std::pair DeleteDirContentsOnContainer( - const Blobs::BlobContainerClient& container_client, const AzureLocation& location, - bool require_dir_to_exist) { + /// + /// \param require_dir_to_exist Require the directory to exist *before* this + /// operation, otherwise return PathNotFound. + /// \param preserve_dir_marker_blob Ensure the empty directory marker blob + /// is preserved (not deleted) or created (before the contents are deleted) if it + /// doesn't exist explicitly but is implied by the existence of blobs with names + /// starting with the directory path. + Status DeleteDirContentsOnContainer(const Blobs::BlobContainerClient& container_client, + const AzureLocation& location, + bool require_dir_to_exist, + bool preserve_dir_marker_blob) { using DeleteBlobResponse = Storage::DeferredResponse; DCHECK(!location.container.empty()); Blobs::ListBlobsOptions options; @@ -1492,67 +1503,82 @@ class AzureFileSystem::Impl { // size of the body for a batch request can't exceed 4 MB. const int32_t kNumMaxRequestsInBatch = 256; options.PageSizeHint = kNumMaxRequestsInBatch; - int num_potentially_deleted_blobs = 0; + // trusted only if preserve_dir_marker_blob is true. + bool found_dir_marker_blob = false; try { auto list_response = container_client.ListBlobs(options); if (require_dir_to_exist && list_response.Blobs.empty()) { - return {num_potentially_deleted_blobs, PathNotFound(location)}; + return PathNotFound(location); } for (; list_response.HasPage(); list_response.MoveToNextPage()) { if (list_response.Blobs.empty()) { continue; } auto batch = container_client.CreateBatch(); - std::vector deferred_responses; + std::vector> deferred_responses; for (const auto& blob_item : list_response.Blobs) { - num_potentially_deleted_blobs += 1; - deferred_responses.push_back(batch.DeleteBlob(blob_item.Name)); + if (preserve_dir_marker_blob && !found_dir_marker_blob) { + const bool is_dir_marker_blob = + options.Prefix.HasValue() && blob_item.Name == *options.Prefix; + if (is_dir_marker_blob) { + // Skip deletion of the existing directory marker blob, + // but take note that it exists. + found_dir_marker_blob = true; + continue; + } + } + deferred_responses.emplace_back(blob_item.Name, + batch.DeleteBlob(blob_item.Name)); } try { - container_client.SubmitBatch(batch); + // Before submitting the batch deleting directory contents, ensure + // the empty directory marker blob exists. Doing this first, means that + // directory doesn't "stop existing" during the duration of the batch delete + // operation. + if (preserve_dir_marker_blob && !found_dir_marker_blob) { + // Only create an empty directory marker blob if the directory's + // existence is implied by the existence of blobs with names + // starting with the directory path. + if (!deferred_responses.empty()) { + RETURN_NOT_OK(EnsureEmptyDirExists(container_client, location)); + } + } + if (!deferred_responses.empty()) { + container_client.SubmitBatch(batch); + } } catch (const Storage::StorageException& exception) { - return {num_potentially_deleted_blobs, - ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", - location.path, ": ", container_client.GetUrl())}; + return ExceptionToStatus(exception, "Failed to delete blobs in a directory: ", + location.path, ": ", container_client.GetUrl()); } std::vector failed_blob_names; - for (size_t i = 0; i < deferred_responses.size(); ++i) { - const auto& deferred_response = deferred_responses[i]; + for (auto& [blob_name_view, deferred_response] : deferred_responses) { bool success = true; try { auto delete_result = deferred_response.GetResponse(); - if (!delete_result.Value.Deleted) { - // We know for sure the blob was not deleted, so - // we can decrement num_potentially_deleted_blobs. - num_potentially_deleted_blobs -= 1; - success = false; - } + success = delete_result.Value.Deleted; } catch (const Storage::StorageException& exception) { success = false; } if (!success) { - const auto& blob_item = list_response.Blobs[i]; - failed_blob_names.push_back(blob_item.Name); + failed_blob_names.emplace_back(blob_name_view); } } if (!failed_blob_names.empty()) { if (failed_blob_names.size() == 1) { - return {num_potentially_deleted_blobs, - Status::IOError("Failed to delete a blob: ", failed_blob_names[0], - ": " + container_client.GetUrl())}; + return Status::IOError("Failed to delete a blob: ", failed_blob_names[0], + ": " + container_client.GetUrl()); } else { - return {num_potentially_deleted_blobs, - Status::IOError("Failed to delete blobs: [", - arrow::internal::JoinStrings(failed_blob_names, ", "), - "]: " + container_client.GetUrl())}; + return Status::IOError("Failed to delete blobs: [", + arrow::internal::JoinStrings(failed_blob_names, ", "), + "]: " + container_client.GetUrl()); } } } - return {num_potentially_deleted_blobs, Status::OK()}; + return Status::OK(); } catch (const Storage::StorageException& exception) { - return {num_potentially_deleted_blobs, - ExceptionToStatus(exception, "Failed to list blobs in a directory: ", - location.path, ": ", container_client.GetUrl())}; + return ExceptionToStatus(exception, + "Failed to list blobs in a directory: ", location.path, + ": ", container_client.GetUrl()); } } @@ -1769,11 +1795,9 @@ Status AzureFileSystem::DeleteDir(const std::string& path) { } DCHECK_EQ(hns_support, HNSSupport::kDisabled); auto container_client = impl_->GetBlobContainerClient(location.container); - Status status; - std::tie(std::ignore, status) = - impl_->DeleteDirContentsOnContainer(container_client, location, - /*require_dir_to_exist=*/true); - return status; + return impl_->DeleteDirContentsOnContainer(container_client, location, + /*require_dir_to_exist=*/true, + /*preserve_dir_marker_blob=*/false); } Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { @@ -1793,18 +1817,9 @@ Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_ return impl_->DeleteDirContentsOnFileSystem(adlfs_client, location, missing_dir_ok); } auto container_client = impl_->GetBlobContainerClient(location.container); - auto [num_potentially_deleted_blobs, status] = impl_->DeleteDirContentsOnContainer( - container_client, location, /*require_dir_to_exist=*/!missing_dir_ok); - if (num_potentially_deleted_blobs > 0) { - // If we potentially deleted some blobs, we need to ensure the empty directory - // marker blob exists. Otherwise, we don't need to do anything because the - // directory marker blob may not exist from the beginning and should remain that - // way. EnsureEmptyDirExists requires the container to exist as a pre-condition - // and having num_potentially_deleted_blobs > 0 implies the container exists. - auto second_status = impl_->EnsureEmptyDirExists(container_client, location); - return status.ok() ? second_status : status; - } - return status; + return impl_->DeleteDirContentsOnContainer(container_client, location, + /*require_dir_to_exist=*/!missing_dir_ok, + /*preserve_dir_marker_blob=*/true); } Status AzureFileSystem::DeleteRootDirContents() { From 59e113d61b3eaaa98085a138c4eeb16eef221769 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 23 Dec 2023 22:01:04 -0300 Subject: [PATCH 13/24] Document a pre-condition of DeleteDirContentsOnContainer --- cpp/src/arrow/filesystem/azurefs.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 7737a6ce603ef..222ff4a3bd459 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1480,6 +1480,8 @@ class AzureFileSystem::Impl { /// depending on the value of preserve_dir_marker_blob. /// /// \pre location.container is not empty. + /// \pre preserve_dir_marker_blob=false implies location.path is not empty + /// because we can't *not preserve* the root directory of a container. /// /// \param require_dir_to_exist Require the directory to exist *before* this /// operation, otherwise return PathNotFound. @@ -1493,6 +1495,9 @@ class AzureFileSystem::Impl { bool preserve_dir_marker_blob) { using DeleteBlobResponse = Storage::DeferredResponse; DCHECK(!location.container.empty()); + DCHECK(preserve_dir_marker_blob || !location.path.empty()) + << "Must pass preserve_dir_marker_blob=true when location.path is empty " + "(i.e. deleting the contents of a container)."; Blobs::ListBlobsOptions options; if (!location.path.empty()) { options.Prefix = internal::EnsureTrailingSlash(location.path); From 7b663b9c1c1a043fba5883156fb14c795e753ebb Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Sat, 23 Dec 2023 22:10:28 -0300 Subject: [PATCH 14/24] Declare all client parameters as const & --- cpp/src/arrow/filesystem/azurefs.cc | 24 ++++++++++----------- cpp/src/arrow/filesystem/azurefs_internal.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 222ff4a3bd459..6d3c85dc26ba3 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -819,7 +819,7 @@ bool IsDfsEmulator(const AzureOptions& options) { namespace internal { Result CheckIfHierarchicalNamespaceIsEnabled( - DataLake::DataLakeFileSystemClient& adlfs_client, const AzureOptions& options) { + const DataLake::DataLakeFileSystemClient& adlfs_client, const AzureOptions& options) { try { auto directory_client = adlfs_client.GetDirectoryClient(""); // GetAccessControlList will fail on storage accounts @@ -885,7 +885,7 @@ const char kDelimiter[] = {internal::kSep, '\0'}; /// \pre location.container is not empty. template Result GetContainerPropsAsFileInfo(const AzureLocation& location, - ContainerClient& container_client) { + const ContainerClient& container_client) { DCHECK(!location.container.empty()); FileInfo info{location.path.empty() ? location.all : location.container}; try { @@ -905,7 +905,7 @@ Result GetContainerPropsAsFileInfo(const AzureLocation& location, template Status CreateContainerIfNotExists(const std::string& container_name, - ContainerClient& container_client) { + const ContainerClient& container_client) { try { container_client.CreateIfNotExists(); return Status::OK(); @@ -974,7 +974,7 @@ class AzureFileSystem::Impl { /// /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never returned). Result HierarchicalNamespaceSupport( - DataLake::DataLakeFileSystemClient& adlfs_client) { + const DataLake::DataLakeFileSystemClient& adlfs_client) { switch (cached_hns_support_) { case HNSSupport::kEnabled: case HNSSupport::kDisabled: @@ -1018,7 +1018,7 @@ class AzureFileSystem::Impl { } /// \pre location.path is not empty. - Result GetFileInfo(DataLake::DataLakeFileSystemClient& adlfs_client, + Result GetFileInfo(const DataLake::DataLakeFileSystemClient& adlfs_client, const AzureLocation& location) { auto file_client = adlfs_client.GetFileClient(location.path); try { @@ -1055,7 +1055,7 @@ class AzureFileSystem::Impl { /// being blobs with names starting with the directory path. /// /// \pre location.path is not empty. - Result GetFileInfo(Blobs::BlobContainerClient& container_client, + Result GetFileInfo(const Blobs::BlobContainerClient& container_client, const AzureLocation& location) { DCHECK(!location.path.empty()); Blobs::ListBlobsOptions options; @@ -1097,7 +1097,7 @@ class AzureFileSystem::Impl { private: /// \pref location.container is not empty. template - Status CheckDirExists(ContainerClient& container_client, + Status CheckDirExists(const ContainerClient& container_client, const AzureLocation& location) { DCHECK(!location.container.empty()); FileInfo info; @@ -1334,7 +1334,7 @@ class AzureFileSystem::Impl { /// \pre location.container is not empty. /// \pre location.path is not empty. template - Status CreateDirTemplate(ContainerClient& container_client, + Status CreateDirTemplate(const ContainerClient& container_client, GetDirectoryClient&& get_directory_client, CreateDirIfNotExists&& create_if_not_exists, const AzureLocation& location, bool recursive) { @@ -1378,7 +1378,7 @@ class AzureFileSystem::Impl { /// /// \pre location.container is not empty. /// \pre location.path is not empty. - Status CreateDirOnFileSystem(DataLake::DataLakeFileSystemClient& adlfs_client, + Status CreateDirOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, const AzureLocation& location, bool recursive) { return CreateDirTemplate( adlfs_client, @@ -1393,7 +1393,7 @@ class AzureFileSystem::Impl { /// /// \pre location.container is not empty. /// \pre location.path is not empty. - Status CreateDirOnContainer(Blobs::BlobContainerClient& container_client, + Status CreateDirOnContainer(const Blobs::BlobContainerClient& container_client, const AzureLocation& location, bool recursive) { return CreateDirTemplate( container_client, @@ -1453,7 +1453,7 @@ class AzureFileSystem::Impl { /// \pre location.container is not empty. /// \pre location.path is empty. - Status DeleteContainer(Blobs::BlobContainerClient& container_client, + Status DeleteContainer(const Blobs::BlobContainerClient& container_client, const AzureLocation& location) { DCHECK(!location.container.empty()); DCHECK(location.path.empty()); @@ -1589,7 +1589,7 @@ class AzureFileSystem::Impl { /// \pre location.container is not empty. /// \pre location.path is not empty. - Status DeleteDirOnFileSystem(DataLake::DataLakeFileSystemClient& adlfs_client, + Status DeleteDirOnFileSystem(const DataLake::DataLakeFileSystemClient& adlfs_client, const AzureLocation& location) { DCHECK(!location.container.empty()); DCHECK(!location.path.empty()); diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h index 13d84c9b542b4..5642e16bcfb05 100644 --- a/cpp/src/arrow/filesystem/azurefs_internal.h +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -71,7 +71,7 @@ enum class HierarchicalNamespaceSupport { /// \return kEnabled/kDisabled/kContainerNotFound (kUnknown is never /// returned). Result CheckIfHierarchicalNamespaceIsEnabled( - Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, + const Azure::Storage::Files::DataLake::DataLakeFileSystemClient& adlfs_client, const arrow::fs::AzureOptions& options); } // namespace internal From 85184766a8d02bc7a80eff91f79b3ee1dfad5df6 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Dec 2023 12:59:43 -0300 Subject: [PATCH 15/24] Fix typo --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 49b6a9e3859fd..d1871247d9ee5 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -575,7 +575,7 @@ class TestAzureFileSystem : public ::testing::Test { } constexpr static const char* const kSubmitBatchBugMessage = - "This test is affected by an Azurite isse: " + "This test is affected by an Azurite issue: " "https://github.com/Azure/Azurite/pull/2302"; /// Azurite has a bug that causes BlobContainerClient::SubmitBatch to fail on macOS. From 2a6499629284a97a29c8f46a5adfb029a20ddfec Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Dec 2023 13:00:34 -0300 Subject: [PATCH 16/24] Only call GetAzureEnv if env is used --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index d1871247d9ee5..be6a0c7b54a49 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -583,8 +583,8 @@ class TestAzureFileSystem : public ::testing::Test { /// - AzureFileSystem::DeleteDir /// - AzureFileSystem::DeleteDirContents bool HasSubmitBatchBug() const { - EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); #ifdef __APPLE__ + EXPECT_OK_AND_ASSIGN(auto env, GetAzureEnv()); return env->backend() == AzureBackend::kAzurite; #else return false; From de2a76814802a5172f1dd20bd6d759dddf44946b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 26 Dec 2023 13:22:35 -0300 Subject: [PATCH 17/24] Document the interpretation of the ListBlobByHierarchy response --- cpp/src/arrow/filesystem/azurefs.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6d3c85dc26ba3..54e2dcf4f130a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1065,7 +1065,17 @@ class AzureFileSystem::Impl { try { FileInfo info{location.all}; auto list_response = container_client.ListBlobsByHierarchy(kDelimiter, options); + // Since PageSizeHint=1, we expect at most one entry in either Blobs or + // BlobPrefixes. A BlobPrefix always ends with kDelimiter ("/"), so we can + // distinguish between a directory and a file by checking if we received a + // prefix or a blob. if (!list_response.BlobPrefixes.empty()) { + // Ensure the returned BlobPrefixes[0] string doesn't contain more characters than + // the requested Prefix. For instance, if we request with Prefix="dir/abra" and + // the container contains "dir/abracadabra/" but not "dir/abra/", we will get back + // "dir/abracadabra/" in the BlobPrefixes list. If "dir/abra/" existed, + // it would be returned instead because it comes before "dir/abracadabra/" in the + // lexicographic order guaranteed by ListBlobsByHierarchy. const auto& blob_prefix = list_response.BlobPrefixes[0]; if (blob_prefix == internal::EnsureTrailingSlash(location.path)) { info.set_type(FileType::Directory); From f091ad5195e4bbd599d7ee16bec850d4c2ccd15a Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 16:38:24 -0300 Subject: [PATCH 18/24] Add more information to some error messages --- cpp/src/arrow/filesystem/azurefs.cc | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 54e2dcf4f130a..de990af75c005 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1376,10 +1376,12 @@ class AzureFileSystem::Impl { return PathNotFound(parent); } } catch (const Storage::StorageException& second_exception) { - return ExceptionToStatus(second_exception); + return ExceptionToStatus(second_exception, "Failed to create directory '", + location.all, "': ", container_client.GetUrl()); } } - return ExceptionToStatus(exception); + return ExceptionToStatus(exception, "Failed to create directory '", location.all, + "': ", container_client.GetUrl()); } } @@ -1444,7 +1446,7 @@ class AzureFileSystem::Impl { /// \pre location.container is not empty. /// \pre The location.container container already exists. Status EnsureEmptyDirExists(const Blobs::BlobContainerClient& container_client, - const AzureLocation& location) { + const AzureLocation& location, const char* operation_name) { DCHECK(!location.container.empty()); if (location.path.empty()) { // Nothing to do. The container already exists per the preconditions. @@ -1457,7 +1459,9 @@ class AzureFileSystem::Impl { block_blob_client.UploadFrom(nullptr, 0); return Status::OK(); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception); + return ExceptionToStatus( + exception, operation_name, " failed to ensure empty directory marker '", + location.path, "' exists in container: ", container_client.GetUrl()); } } @@ -1499,10 +1503,12 @@ class AzureFileSystem::Impl { /// is preserved (not deleted) or created (before the contents are deleted) if it /// doesn't exist explicitly but is implied by the existence of blobs with names /// starting with the directory path. + /// \param operation_name Used in error messages to accurately describe the operation Status DeleteDirContentsOnContainer(const Blobs::BlobContainerClient& container_client, const AzureLocation& location, bool require_dir_to_exist, - bool preserve_dir_marker_blob) { + bool preserve_dir_marker_blob, + const char* operation_name) { using DeleteBlobResponse = Storage::DeferredResponse; DCHECK(!location.container.empty()); DCHECK(preserve_dir_marker_blob || !location.path.empty()) @@ -1555,7 +1561,8 @@ class AzureFileSystem::Impl { // existence is implied by the existence of blobs with names // starting with the directory path. if (!deferred_responses.empty()) { - RETURN_NOT_OK(EnsureEmptyDirExists(container_client, location)); + RETURN_NOT_OK( + EnsureEmptyDirExists(container_client, location, operation_name)); } } if (!deferred_responses.empty()) { @@ -1812,7 +1819,8 @@ Status AzureFileSystem::DeleteDir(const std::string& path) { auto container_client = impl_->GetBlobContainerClient(location.container); return impl_->DeleteDirContentsOnContainer(container_client, location, /*require_dir_to_exist=*/true, - /*preserve_dir_marker_blob=*/false); + /*preserve_dir_marker_blob=*/false, + "DeleteDir"); } Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { @@ -1834,7 +1842,8 @@ Status AzureFileSystem::DeleteDirContents(const std::string& path, bool missing_ auto container_client = impl_->GetBlobContainerClient(location.container); return impl_->DeleteDirContentsOnContainer(container_client, location, /*require_dir_to_exist=*/!missing_dir_ok, - /*preserve_dir_marker_blob=*/true); + /*preserve_dir_marker_blob=*/true, + "DeleteDirContents"); } Status AzureFileSystem::DeleteRootDirContents() { From 2e4d4e0df540d7764fba699497abe78ff8ee1bc4 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 16:58:42 -0300 Subject: [PATCH 19/24] Remove GetDirectoryClient tparam from CreateDirTemplate --- cpp/src/arrow/filesystem/azurefs.cc | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index de990af75c005..6ef19ef320f77 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1343,9 +1343,8 @@ class AzureFileSystem::Impl { /// /// \pre location.container is not empty. /// \pre location.path is not empty. - template + template Status CreateDirTemplate(const ContainerClient& container_client, - GetDirectoryClient&& get_directory_client, CreateDirIfNotExists&& create_if_not_exists, const AzureLocation& location, bool recursive) { DCHECK(!location.container.empty()); @@ -1360,16 +1359,15 @@ class AzureFileSystem::Impl { // exists because the operation we perform below will fail if the container // doesn't exist and we can handle that error according to the recursive flag. } - auto directory_client = get_directory_client(container_client, location); try { - create_if_not_exists(directory_client); + create_if_not_exists(container_client, location); return Status::OK(); } catch (const Storage::StorageException& exception) { if (IsContainerNotFound(exception)) { try { if (recursive) { container_client.CreateIfNotExists(); - create_if_not_exists(directory_client); + create_if_not_exists(container_client, location); return Status::OK(); } else { auto parent = location.parent(); @@ -1394,11 +1392,11 @@ class AzureFileSystem::Impl { const AzureLocation& location, bool recursive) { return CreateDirTemplate( adlfs_client, - [](auto& adlfs_client, auto& location) { - return adlfs_client.GetDirectoryClient(location.path); + [](const auto& adlfs_client, const auto& location) { + auto directory_client = adlfs_client.GetDirectoryClient(location.path); + directory_client.CreateIfNotExists(); }, - [](auto& directory_client) { directory_client.CreateIfNotExists(); }, location, - recursive); + location, recursive); } /// This function cannot assume the container already exists. @@ -1409,11 +1407,12 @@ class AzureFileSystem::Impl { const AzureLocation& location, bool recursive) { return CreateDirTemplate( container_client, - [](auto& container_client, auto& location) { + [](const auto& container_client, const auto& location) { auto dir_marker_blob_path = internal::EnsureTrailingSlash(location.path); - return container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); + auto block_blob_client = + container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); + block_blob_client.UploadFrom(nullptr, 0); }, - [](auto& block_blob_client) { block_blob_client.UploadFrom(nullptr, 0); }, location, recursive); } From 52030717f860f03c54bd6ec84b583d419f8e64ff Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 17:04:54 -0300 Subject: [PATCH 20/24] Extract EnsureEmptyDirExistsImplThatThrows and use it in two places --- cpp/src/arrow/filesystem/azurefs.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6ef19ef320f77..78993db4ff108 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1407,11 +1407,8 @@ class AzureFileSystem::Impl { const AzureLocation& location, bool recursive) { return CreateDirTemplate( container_client, - [](const auto& container_client, const auto& location) { - auto dir_marker_blob_path = internal::EnsureTrailingSlash(location.path); - auto block_blob_client = - container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); - block_blob_client.UploadFrom(nullptr, 0); + [this](const auto& container_client, const auto& location) { + EnsureEmptyDirExistsImplThatThrows(container_client, location.path); }, location, recursive); } @@ -1439,6 +1436,17 @@ class AzureFileSystem::Impl { return stream; } + private: + void EnsureEmptyDirExistsImplThatThrows( + const Blobs::BlobContainerClient& container_client, + const std::string& path_within_container) { + auto dir_marker_blob_path = internal::EnsureTrailingSlash(path_within_container); + auto block_blob_client = + container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); + block_blob_client.UploadFrom(nullptr, 0); + } + + public: /// This function assumes the container already exists. So it can only be /// called after that has been verified. /// @@ -1451,11 +1459,8 @@ class AzureFileSystem::Impl { // Nothing to do. The container already exists per the preconditions. return Status::OK(); } - auto dir_marker_blob_path = internal::EnsureTrailingSlash(location.path); - auto block_blob_client = - container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); try { - block_blob_client.UploadFrom(nullptr, 0); + EnsureEmptyDirExistsImplThatThrows(container_client, location.path); return Status::OK(); } catch (const Storage::StorageException& exception) { return ExceptionToStatus( From 8e2e9ffc0b91a549bd9493624349fcfdd10647d9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 17:07:56 -0300 Subject: [PATCH 21/24] Fix typo --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 78993db4ff108..4f3953045561c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1494,7 +1494,7 @@ class AzureFileSystem::Impl { } } - /// Deletes and contents of a directory and possibly the directory itself + /// Deletes contents of a directory and possibly the directory itself /// depending on the value of preserve_dir_marker_blob. /// /// \pre location.container is not empty. From 9957577cc8c82a1c90b44d00e92e34de69470f32 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 17:21:30 -0300 Subject: [PATCH 22/24] Nest second kContainerNotFound check --- cpp/src/arrow/filesystem/azurefs.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 4f3953045561c..3c18399eb6426 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1781,13 +1781,13 @@ Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { RETURN_NOT_OK(CreateContainerIfNotExists(location.container, container_client)); // Perform a second check for HNS support after creating the container. ARROW_ASSIGN_OR_RAISE(hns_support, impl_->HierarchicalNamespaceSupport(adlfs_client)); - } - if (hns_support == HNSSupport::kContainerNotFound) { - // We only get kContainerNotFound if we are unable to read the properties of the - // container we just created. This is very unlikely, but theoretically possible in - // a concurrent system, so the error is handled to avoid infinite recursion. - return Status::IOError("Unable to read properties of a newly created container: ", - location.container, ": " + container_client.GetUrl()); + if (hns_support == HNSSupport::kContainerNotFound) { + // We only get kContainerNotFound if we are unable to read the properties of the + // container we just created. This is very unlikely, but theoretically possible in + // a concurrent system, so the error is handled to avoid infinite recursion. + return Status::IOError("Unable to read properties of a newly created container: ", + location.container, ": " + container_client.GetUrl()); + } } // CreateDirOnFileSystem and CreateDirOnContainer can handle the container // not existing which is useful and necessary here since the only reason From 96bdba2b3f0f59e440221618a6836c2ecfcedf23 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 18:58:24 -0300 Subject: [PATCH 23/24] Attach metadata to directory marker blobs --- cpp/src/arrow/filesystem/azurefs.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 3c18399eb6426..c6e8e3131406c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -1443,7 +1443,12 @@ class AzureFileSystem::Impl { auto dir_marker_blob_path = internal::EnsureTrailingSlash(path_within_container); auto block_blob_client = container_client.GetBlobClient(dir_marker_blob_path).AsBlockBlobClient(); - block_blob_client.UploadFrom(nullptr, 0); + // Attach metadata that other filesystem implementations expect to be present + // on directory marker blobs. + // https://github.com/fsspec/adlfs/blob/32132c4094350fca2680155a5c236f2e9f991ba5/adlfs/spec.py#L855-L870 + Blobs::UploadBlockBlobFromOptions blob_options; + blob_options.Metadata.emplace("is_directory", "true"); + block_blob_client.UploadFrom(nullptr, 0, blob_options); } public: From 432dd613b7aae888573d073f6473b8fe5099359d Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 4 Jan 2024 19:25:32 -0300 Subject: [PATCH 24/24] Fix test-code that doesn't match the expectations --- cpp/src/arrow/filesystem/azurefs_test.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index be6a0c7b54a49..ff94578b041dc 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -1296,9 +1296,8 @@ TEST_F(TestAzuriteFileSystem, DeleteDirSuccessNonexistent) { } auto data = SetUpPreexistingData(); const auto directory_path = data.RandomDirectoryPath(rng_); - // There is only virtual directory without hierarchical namespace - // support. So the DeleteDir() for nonexistent directory does nothing. - ASSERT_OK(fs()->DeleteDir(directory_path)); + // DeleteDir() fails if the directory doesn't exist. + ASSERT_RAISES(IOError, fs()->DeleteDir(directory_path)); AssertFileInfo(fs(), directory_path, FileType::NotFound); } @@ -1353,8 +1352,7 @@ TEST_F(TestAzuriteFileSystem, DeleteDirContentsSuccessDirectory) { HierarchicalPaths paths; CreateHierarchicalData(&paths); ASSERT_OK(fs()->DeleteDirContents(paths.directory)); - // GH-38772: We may change this to FileType::Directory. - AssertFileInfo(fs(), paths.directory, FileType::NotFound); + AssertFileInfo(fs(), paths.directory, FileType::Directory); for (const auto& sub_path : paths.sub_paths) { AssertFileInfo(fs(), sub_path, FileType::NotFound); }