diff --git a/LICENSE.bin b/LICENSE.bin
index 738687a6a29..9ab5edbd6fc 100644
--- a/LICENSE.bin
+++ b/LICENSE.bin
@@ -285,6 +285,7 @@ Apache Hadoop Aliyun connector
   Apache Hadoop GCS connector
   Apache Hadoop AWS connector
+  Apache Hadoop Azure connector
   Apache Hadoop Annotatations
   Apache Hadoop Auth
   Apache Hadoop Client Aggregator
diff --git a/build.gradle.kts b/build.gradle.kts
index 23074cbe028..d290bd61eb3 100644
--- a/build.gradle.kts
+++ b/build.gradle.kts
@@ -774,7 +774,7 @@ tasks {
         !it.name.startsWith("client") && !it.name.startsWith("filesystem") && !it.name.startsWith("spark") &&
         !it.name.startsWith("iceberg") && it.name != "trino-connector" && it.name != "integration-test" && it.name != "bundled-catalog" &&
         !it.name.startsWith("flink") && it.name != "integration-test" && it.name != "hive-metastore-common" && !it.name.startsWith("flink") &&
-        it.name != "gcp-bundle" && it.name != "aliyun-bundle" && it.name != "aws-bundle"
+        it.name != "gcp-bundle" && it.name != "aliyun-bundle" && it.name != "aws-bundle" && it.name != "azure-bundle"
       ) {
         from(it.configurations.runtimeClasspath)
         into("distribution/package/libs")
@@ -796,7 +796,7 @@ tasks {
       !it.name.startsWith("trino-connector") &&
       it.name != "bundled-catalog" && it.name != "hive-metastore-common" &&
      it.name != "gcp-bundle" &&
-      it.name != "aliyun-bundle" && it.name != "aws-bundle"
+      it.name != "aliyun-bundle" && it.name != "aws-bundle" && it.name != "azure-bundle"
    ) {
      dependsOn("${it.name}:build")
      from("${it.name}/build/libs")
diff --git a/bundles/azure-bundle/build.gradle.kts b/bundles/azure-bundle/build.gradle.kts
new file mode 100644
index 00000000000..fa6a68d1af5
--- /dev/null
+++ b/bundles/azure-bundle/build.gradle.kts
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
+
+plugins {
+  `maven-publish`
+  id("java")
+  alias(libs.plugins.shadow)
+}
+
+dependencies {
+  compileOnly(project(":api"))
+  compileOnly(project(":core"))
+  compileOnly(project(":catalogs:catalog-hadoop"))
+
+  compileOnly(libs.hadoop3.common)
+
+  implementation(libs.commons.lang3)
+  // runtime used
+  implementation(libs.commons.logging)
+  implementation(libs.hadoop3.abs)
+  implementation(project(":catalogs:catalog-common")) {
+    exclude("*")
+  }
+}
+
+tasks.withType(ShadowJar::class.java) {
+  isZip64 = true
+  configurations = listOf(project.configurations.runtimeClasspath.get())
+  archiveClassifier.set("")
+
+  // Relocate dependencies to avoid conflicts
+  relocate("org.apache.httpcomponents", "org.apache.gravitino.azure.shaded.org.apache.httpcomponents")
+  relocate("org.apache.commons", "org.apache.gravitino.azure.shaded.org.apache.commons")
+  relocate("com.fasterxml", "org.apache.gravitino.azure.shaded.com.fasterxml")
+  relocate("com.google.guava", "org.apache.gravitino.azure.shaded.com.google.guava")
+}
+
+tasks.jar {
+  dependsOn(tasks.named("shadowJar"))
+  archiveClassifier.set("empty")
+}
+
+tasks.compileJava {
+  dependsOn(":catalogs:catalog-hadoop:runtimeJars")
+}
diff --git a/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java
new file mode 100644
index 00000000000..cad38e14c96
--- /dev/null
+++ b/bundles/azure-bundle/src/main/java/org/apache/gravitino/abs/fs/AzureFileSystemProvider.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.gravitino.abs.fs;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
+import java.io.IOException;
+import java.util.Map;
+import javax.annotation.Nonnull;
+import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider;
+import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils;
+import org.apache.gravitino.storage.ABSProperties;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public class AzureFileSystemProvider implements FileSystemProvider {
+
+  @VisibleForTesting public static final String ABS_PROVIDER_SCHEME = "abfss";
+
+  @VisibleForTesting public static final String ABS_PROVIDER_NAME = "abs";
+
+  private static final String ABFS_IMPL = "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem";
+
+  private static final String ABFS_IMPL_KEY = "fs.abfss.impl";
+
+  @Override
+  public FileSystem getFileSystem(@Nonnull Path path, @Nonnull Map<String, String> config)
+      throws IOException {
+    Configuration configuration = new Configuration();
+
+    Map<String, String> hadoopConfMap =
+        FileSystemUtils.toHadoopConfigMap(config, ImmutableMap.of());
+
+    if (config.containsKey(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME)
+        && config.containsKey(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY)) {
+      hadoopConfMap.put(
+          String.format(
+              "fs.azure.account.key.%s.dfs.core.windows.net",
+              config.get(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME)),
+          config.get(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY));
+    }
+
+    if (!config.containsKey(ABFS_IMPL_KEY)) {
+      configuration.set(ABFS_IMPL_KEY, ABFS_IMPL);
+    }
+
+    hadoopConfMap.forEach(configuration::set);
+
+    return FileSystem.get(path.toUri(), configuration);
+  }
+
+  @Override
+  public String scheme() {
+    return ABS_PROVIDER_SCHEME;
+  }
+
+  @Override
+  public String name() {
+    return ABS_PROVIDER_NAME;
+  }
+}
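Reviewer note: a minimal sketch of how this provider can be invoked directly, for orientation only — in normal use the Hadoop catalog calls it for you, and the account values, container, and class name `AzureProviderSketch` below are hypothetical placeholders:

```java
import com.google.common.collect.ImmutableMap;
import java.util.Map;
import org.apache.gravitino.abs.fs.AzureFileSystemProvider;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class AzureProviderSketch {
  public static void main(String[] args) throws Exception {
    AzureFileSystemProvider provider = new AzureFileSystemProvider();
    // Gravitino-style keys; the provider maps them to the Hadoop property
    // fs.azure.account.key.<account>.dfs.core.windows.net shown above.
    Map<String, String> config =
        ImmutableMap.of(
            "abs-account-name", "my-account", // hypothetical
            "abs-account-key", "my-account-key"); // hypothetical
    FileSystem fs =
        provider.getFileSystem(
            new Path("abfss://container@my-account.dfs.core.windows.net/dir"), config);
    System.out.println(fs.getUri());
  }
}
```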
diff --git a/bundles/azure-bundle/src/main/resources/META-INF/services/org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider b/bundles/azure-bundle/src/main/resources/META-INF/services/org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider
new file mode 100644
index 00000000000..ab864341cce
--- /dev/null
+++ b/bundles/azure-bundle/src/main/resources/META-INF/services/org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider
@@ -0,0 +1,20 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+org.apache.gravitino.abs.fs.AzureFileSystemProvider
\ No newline at end of file
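Reviewer note: the service registration above is what lets the Hadoop catalog pick up this provider at runtime through the standard JDK `ServiceLoader` mechanism. A minimal sketch of that discovery, assuming the bundle jar is on the classpath (the class name `ProviderDiscoverySketch` is hypothetical):

```java
import java.util.ServiceLoader;
import org.apache.gravitino.catalog.hadoop.fs.FileSystemProvider;

public class ProviderDiscoverySketch {
  public static void main(String[] args) {
    // Iterates every FileSystemProvider implementation registered under
    // META-INF/services; the azure-bundle contributes name "abs", scheme "abfss".
    for (FileSystemProvider provider : ServiceLoader.load(FileSystemProvider.class)) {
      System.out.printf("provider=%s scheme=%s%n", provider.name(), provider.scheme());
    }
  }
}
```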
diff --git a/catalogs/catalog-common/src/main/java/org/apache/gravitino/storage/ABSProperties.java b/catalogs/catalog-common/src/main/java/org/apache/gravitino/storage/ABSProperties.java
new file mode 100644
index 00000000000..a76ece32ba5
--- /dev/null
+++ b/catalogs/catalog-common/src/main/java/org/apache/gravitino/storage/ABSProperties.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.gravitino.storage;
+
+public class ABSProperties {
+
+  // The account name of Azure Blob Storage.
+  public static final String GRAVITINO_ABS_ACCOUNT_NAME = "abs-account-name";
+
+  // The account key of Azure Blob Storage.
+  public static final String GRAVITINO_ABS_ACCOUNT_KEY = "abs-account-key";
+}
diff --git a/catalogs/catalog-hadoop/build.gradle.kts b/catalogs/catalog-hadoop/build.gradle.kts
index c925d1b92df..409a87fb103 100644
--- a/catalogs/catalog-hadoop/build.gradle.kts
+++ b/catalogs/catalog-hadoop/build.gradle.kts
@@ -80,6 +80,7 @@ dependencies {
   testImplementation(project(":bundles:aws-bundle"))
   testImplementation(project(":bundles:gcp-bundle"))
   testImplementation(project(":bundles:aliyun-bundle"))
+  testImplementation(project(":bundles:azure-bundle"))
 
   testImplementation(libs.minikdc)
   testImplementation(libs.hadoop3.minicluster)
diff --git a/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java
new file mode 100644
index 00000000000..0da915a7d4b
--- /dev/null
+++ b/catalogs/catalog-hadoop/src/test/java/org/apache/gravitino/catalog/hadoop/integration/test/HadoopABSCatalogIT.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.gravitino.catalog.hadoop.integration.test;
+
+import static org.apache.gravitino.catalog.hadoop.HadoopCatalogPropertiesMetadata.FILESYSTEM_PROVIDERS;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Maps;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Map;
+import org.apache.gravitino.Catalog;
+import org.apache.gravitino.NameIdentifier;
+import org.apache.gravitino.Schema;
+import org.apache.gravitino.abs.fs.AzureFileSystemProvider;
+import org.apache.gravitino.file.Fileset;
+import org.apache.gravitino.integration.test.util.GravitinoITUtils;
+import org.apache.gravitino.storage.ABSProperties;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.condition.EnabledIf;
+import org.junit.platform.commons.util.StringUtils;
+
+@EnabledIf("absIsConfigured")
+public class HadoopABSCatalogIT extends HadoopCatalogIT {
+
+  public static final String ABS_ACCOUNT_NAME = System.getenv("ABS_ACCOUNT_NAME");
+  public static final String ABS_ACCOUNT_KEY = System.getenv("ABS_ACCOUNT_KEY");
+  public static final String ABS_CONTAINER_NAME = System.getenv("ABS_CONTAINER_NAME");
+
+  @Override
+  public void startIntegrationTest() throws Exception {
+    // Just overwrite super, do nothing.
+  }
+
+  @BeforeAll
+  public void setup() throws IOException {
+    copyBundleJarsToHadoop("azure-bundle");
+
+    try {
+      super.startIntegrationTest();
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+
+    metalakeName = GravitinoITUtils.genRandomName("CatalogFilesetIT_metalake");
+    catalogName = GravitinoITUtils.genRandomName("CatalogFilesetIT_catalog");
+    schemaName = GravitinoITUtils.genRandomName(SCHEMA_PREFIX);
+
+    Configuration conf = new Configuration();
+
+    conf.set(
+        String.format("fs.azure.account.key.%s.dfs.core.windows.net", ABS_ACCOUNT_NAME),
+        ABS_ACCOUNT_KEY);
+
+    fileSystem =
+        FileSystem.get(
+            URI.create(
+                String.format(
+                    "abfs://%s@%s.dfs.core.windows.net", ABS_CONTAINER_NAME, ABS_ACCOUNT_NAME)),
+            conf);
+
+    createMetalake();
+    createCatalog();
+    createSchema();
+  }
+
+  protected String defaultBaseLocation() {
+    if (defaultBaseLocation == null) {
+      try {
+        Path bucket =
+            new Path(
+                String.format(
+                    "%s://%s@%s.dfs.core.windows.net/%s",
+                    AzureFileSystemProvider.ABS_PROVIDER_SCHEME,
+                    ABS_CONTAINER_NAME,
+                    ABS_ACCOUNT_NAME,
+                    GravitinoITUtils.genRandomName("CatalogFilesetIT")));
+
+        if (!fileSystem.exists(bucket)) {
+          fileSystem.mkdirs(bucket);
+        }
+
+        defaultBaseLocation = bucket.toString();
+      } catch (IOException e) {
+        throw new RuntimeException("Failed to create default base location", e);
+      }
+    }
+
+    return defaultBaseLocation;
+  }
+
+  protected void createCatalog() {
+    Map<String, String> map = Maps.newHashMap();
+    map.put(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME, ABS_ACCOUNT_NAME);
+    map.put(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY, ABS_ACCOUNT_KEY);
+    map.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME);
+    metalake.createCatalog(catalogName, Catalog.Type.FILESET, provider, "comment", map);
+
+    catalog = metalake.loadCatalog(catalogName);
+  }
+
+  protected String generateLocation(String filesetName) {
+    return String.format("%s/%s", defaultBaseLocation, filesetName);
+  }
+
+  @Test
+  public void testCreateSchemaAndFilesetWithSpecialLocation() {
+    String localCatalogName = GravitinoITUtils.genRandomName("local_catalog");
+
+    String absLocation =
+        String.format(
+            "%s://%s@%s.dfs.core.windows.net/%s",
+            AzureFileSystemProvider.ABS_PROVIDER_SCHEME,
+            ABS_CONTAINER_NAME,
+            ABS_ACCOUNT_NAME,
+            GravitinoITUtils.genRandomName("CatalogCatalogIT"));
+    Map<String, String> catalogProps = Maps.newHashMap();
+    catalogProps.put("location", absLocation);
+    catalogProps.put(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME, ABS_ACCOUNT_NAME);
+    catalogProps.put(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY, ABS_ACCOUNT_KEY);
+    catalogProps.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME);
+
+    Catalog localCatalog =
+        metalake.createCatalog(
+            localCatalogName, Catalog.Type.FILESET, provider, "comment", catalogProps);
+    Assertions.assertEquals(absLocation, localCatalog.properties().get("location"));
+
+    // Create schema without specifying location.
+    Schema localSchema =
+        localCatalog
+            .asSchemas()
+            .createSchema("local_schema", "comment", ImmutableMap.of("key1", "val1"));
+
+    Fileset localFileset =
+        localCatalog
+            .asFilesetCatalog()
+            .createFileset(
+                NameIdentifier.of(localSchema.name(), "local_fileset"),
+                "fileset comment",
+                Fileset.Type.MANAGED,
+                null,
+                ImmutableMap.of("k1", "v1"));
+    Assertions.assertEquals(
+        absLocation + "/local_schema/local_fileset", localFileset.storageLocation());
+
+    // Delete schema
+    localCatalog.asSchemas().dropSchema(localSchema.name(), true);
+
+    // Create schema with a specified location.
+    Map<String, String> schemaProps = ImmutableMap.of("location", absLocation);
+    Schema localSchema2 =
+        localCatalog.asSchemas().createSchema("local_schema2", "comment", schemaProps);
+    Assertions.assertEquals(absLocation, localSchema2.properties().get("location"));
+
+    Fileset localFileset2 =
+        localCatalog
+            .asFilesetCatalog()
+            .createFileset(
+                NameIdentifier.of(localSchema2.name(), "local_fileset2"),
+                "fileset comment",
+                Fileset.Type.MANAGED,
+                null,
+                ImmutableMap.of("k1", "v1"));
+    Assertions.assertEquals(absLocation + "/local_fileset2", localFileset2.storageLocation());
+
+    // Delete schema
+    localCatalog.asSchemas().dropSchema(localSchema2.name(), true);
+
+    // Delete catalog
+    metalake.dropCatalog(localCatalogName, true);
+  }
+
+  private static boolean absIsConfigured() {
+    return StringUtils.isNotBlank(System.getenv("ABS_ACCOUNT_NAME"))
+        && StringUtils.isNotBlank(System.getenv("ABS_ACCOUNT_KEY"))
+        && StringUtils.isNotBlank(System.getenv("ABS_CONTAINER_NAME"));
+  }
+}
diff --git a/clients/filesystem-hadoop3/build.gradle.kts b/clients/filesystem-hadoop3/build.gradle.kts
index 9836c35147d..55c0f59a05d 100644
--- a/clients/filesystem-hadoop3/build.gradle.kts
+++ b/clients/filesystem-hadoop3/build.gradle.kts
@@ -45,6 +45,7 @@ dependencies {
   testImplementation(project(":bundles:gcp-bundle"))
   testImplementation(project(":bundles:aliyun-bundle"))
   testImplementation(project(":bundles:aws-bundle"))
+  testImplementation(project(":bundles:azure-bundle"))
   testImplementation(libs.awaitility)
   testImplementation(libs.bundles.jetty)
   testImplementation(libs.bundles.jersey)
diff --git a/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSIT.java b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSIT.java
new file mode 100644
index 00000000000..cc16ce920ae
--- /dev/null
+++ b/clients/filesystem-hadoop3/src/test/java/org/apache/gravitino/filesystem/hadoop/integration/test/GravitinoVirtualFileSystemABSIT.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.gravitino.filesystem.hadoop.integration.test;
+
+import static org.apache.gravitino.catalog.hadoop.HadoopCatalogPropertiesMetadata.FILESYSTEM_PROVIDERS;
+
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Maps;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+import org.apache.gravitino.Catalog;
+import org.apache.gravitino.abs.fs.AzureFileSystemProvider;
+import org.apache.gravitino.catalog.hadoop.fs.FileSystemUtils;
+import org.apache.gravitino.integration.test.util.GravitinoITUtils;
+import org.apache.gravitino.storage.ABSProperties;
+import org.apache.hadoop.conf.Configuration;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.condition.EnabledIf;
+import org.junit.platform.commons.util.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@EnabledIf("absIsConfigured")
+public class GravitinoVirtualFileSystemABSIT extends GravitinoVirtualFileSystemIT {
+  private static final Logger LOG = LoggerFactory.getLogger(GravitinoVirtualFileSystemABSIT.class);
+
+  public static final String ABS_ACCOUNT_NAME = System.getenv("ABS_ACCOUNT_NAME");
+  public static final String ABS_ACCOUNT_KEY = System.getenv("ABS_ACCOUNT_KEY");
+  public static final String ABS_CONTAINER_NAME = System.getenv("ABS_CONTAINER_NAME");
+
+  @BeforeAll
+  public void startIntegrationTest() {
+    // Do nothing
+  }
+
+  @BeforeAll
+  public void startUp() throws Exception {
+    // Copy the Azure jars to the gravitino server if in deploy mode.
+    copyBundleJarsToHadoop("azure-bundle");
+    // Need to download jars to gravitino server
+    super.startIntegrationTest();
+
+    // This value can be tuned by the user; please change it accordingly.
+    defaultBockSize = 32 * 1024 * 1024;
+
+    // This value is 1 for ABS, 3 for GCS, and 1 for S3A.
+    defaultReplication = 1;
+
+    metalakeName = GravitinoITUtils.genRandomName("gvfs_it_metalake");
+    catalogName = GravitinoITUtils.genRandomName("catalog");
+    schemaName = GravitinoITUtils.genRandomName("schema");
+
+    Assertions.assertFalse(client.metalakeExists(metalakeName));
+    metalake = client.createMetalake(metalakeName, "metalake comment", Collections.emptyMap());
+    Assertions.assertTrue(client.metalakeExists(metalakeName));
+
+    Map<String, String> properties = Maps.newHashMap();
+
+    properties.put(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME, ABS_ACCOUNT_NAME);
+    properties.put(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY, ABS_ACCOUNT_KEY);
+    properties.put(FILESYSTEM_PROVIDERS, AzureFileSystemProvider.ABS_PROVIDER_NAME);
+
+    Catalog catalog =
+        metalake.createCatalog(
+            catalogName, Catalog.Type.FILESET, "hadoop", "catalog comment", properties);
+    Assertions.assertTrue(metalake.catalogExists(catalogName));
+
+    catalog.asSchemas().createSchema(schemaName, "schema comment", properties);
+    Assertions.assertTrue(catalog.asSchemas().schemaExists(schemaName));
+
+    conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+    conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs");
+    conf.set("fs.gvfs.impl.disable.cache", "true");
+    conf.set("fs.gravitino.server.uri", serverUri);
+    conf.set("fs.gravitino.client.metalake", metalakeName);
+
+    conf.set("fs.gvfs.filesystem.providers", AzureFileSystemProvider.ABS_PROVIDER_NAME);
+    // Pass this configuration to the real file system
+    conf.set(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME, ABS_ACCOUNT_NAME);
+    conf.set(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY, ABS_ACCOUNT_KEY);
+    conf.set("fs.abfss.impl", "org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem");
+  }
+
+  @AfterAll
+  public void tearDown() throws IOException {
+    Catalog catalog = metalake.loadCatalog(catalogName);
+    catalog.asSchemas().dropSchema(schemaName, true);
+    metalake.dropCatalog(catalogName, true);
+    client.dropMetalake(metalakeName, true);
+
+    if (client != null) {
+      client.close();
+      client = null;
+    }
+
+    try {
+      closer.close();
+    } catch (Exception e) {
+      LOG.error("Exception in closing CloseableGroup", e);
+    }
+  }
+
+  /**
+   * Remove the `gravitino.bypass` prefix from the configuration and pass it to the real file
+   * system. This method corresponds to the method
+   * org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem#getConfigMap(Configuration)
+   * in the original code.
+   */
+  protected Configuration convertGvfsConfigToRealFileSystemConfig(Configuration gvfsConf) {
+    Configuration absConf = new Configuration();
+    Map<String, String> map = Maps.newHashMap();
+
+    gvfsConf.forEach(entry -> map.put(entry.getKey(), entry.getValue()));
+
+    Map<String, String> hadoopConfMap = FileSystemUtils.toHadoopConfigMap(map, ImmutableMap.of());
+
+    if (gvfsConf.get(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME) != null
+        && gvfsConf.get(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY) != null) {
+      hadoopConfMap.put(
+          String.format(
+              "fs.azure.account.key.%s.dfs.core.windows.net",
+              gvfsConf.get(ABSProperties.GRAVITINO_ABS_ACCOUNT_NAME)),
+          gvfsConf.get(ABSProperties.GRAVITINO_ABS_ACCOUNT_KEY));
+    }
+
+    hadoopConfMap.forEach(absConf::set);
+
+    return absConf;
+  }
+
+  protected String genStorageLocation(String fileset) {
+    return String.format(
+        "%s://%s@%s.dfs.core.windows.net/%s",
+        AzureFileSystemProvider.ABS_PROVIDER_SCHEME, ABS_CONTAINER_NAME, ABS_ACCOUNT_NAME, fileset);
+  }
+
+  @Disabled("java.lang.UnsupportedOperationException: Append Support not enabled")
+  public void testAppend() throws IOException {}
+
+  private static boolean absIsConfigured() {
+    return StringUtils.isNotBlank(System.getenv("ABS_ACCOUNT_NAME"))
+        && StringUtils.isNotBlank(System.getenv("ABS_ACCOUNT_KEY"))
+        && StringUtils.isNotBlank(System.getenv("ABS_CONTAINER_NAME"));
+  }
+}
diff --git a/docs/hadoop-catalog.md b/docs/hadoop-catalog.md
index 0622574d4d9..f0fb9bb171a 100644
--- a/docs/hadoop-catalog.md
+++ b/docs/hadoop-catalog.md
@@ -76,6 +76,18 @@ In the meantime, you need to place the corresponding bundle jar [`gravitino-gcp-
 In the meantime, you need to place the corresponding bundle jar [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/aliyun-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`.
+
+#### Azure Blob Storage fileset
+
+| Configuration item            | Description                                                                                                                                                                                                                                                    | Default value   | Required                                    | Since version    |
+|-------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|---------------------------------------------|------------------|
+| `filesystem-providers`        | The file system providers to add. Set it to `abs` if it's an Azure Blob Storage fileset, or a comma-separated string that contains `abs`, like `oss,abs,s3`, to support multiple kinds of filesets including `abs`.                                           | (none)          | Yes                                         | 0.8.0-incubating |
+| `default-filesystem-provider` | The name of the default filesystem provider of this Hadoop catalog if users do not specify the scheme in the URI. The default value is `builtin-local`; for Azure Blob Storage, if this is set to `abs`, the prefix `abfss://` can be omitted in the location. | `builtin-local` | No                                          | 0.8.0-incubating |
+| `abs-account-name`            | The account name of Azure Blob Storage.                                                                                                                                                                                                                        | (none)          | Yes if it's an Azure Blob Storage fileset.  | 0.8.0-incubating |
+| `abs-account-key`             | The account key of Azure Blob Storage.                                                                                                                                                                                                                         | (none)          | Yes if it's an Azure Blob Storage fileset.  | 0.8.0-incubating |
+
+Similar to the above, you need to place the corresponding bundle jar [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/azure-bundle/) in the directory `${GRAVITINO_HOME}/catalogs/hadoop/libs`.
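+
+The following is a minimal sketch of creating an Azure Blob Storage fileset catalog with the Java client; it mirrors the integration tests in this change, and `metalake`, the catalog name, and the credential values are placeholders to replace with your own:
+
+```java
+// Assumes `metalake` is an already-loaded GravitinoMetalake handle.
+Map<String, String> properties = ImmutableMap.of(
+    "abs-account-name", "my-account",       // placeholder
+    "abs-account-key", "my-account-key",    // placeholder
+    "filesystem-providers", "abs");
+Catalog catalog =
+    metalake.createCatalog("test_catalog", Catalog.Type.FILESET, "hadoop", "comment", properties);
+```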
+
 :::note
 - Gravitino contains builtin file system providers for local file system(`builtin-local`) and HDFS(`builtin-hdfs`), that is to say if `filesystem-providers` is not set, Gravitino will still support local file system and HDFS. Apart from that, you can set the `filesystem-providers` to support other file systems like S3, GCS, OSS or custom file system.
 - `default-filesystem-provider` is used to set the default file system provider for the Hadoop catalog. If the user does not specify the scheme in the URI, Gravitino will use the default file system provider to access the fileset. For example, if the default file system provider is set to `builtin-local`, the user can omit the prefix `file://` in the location.
diff --git a/docs/how-to-use-gvfs.md b/docs/how-to-use-gvfs.md
index 7a3373092c0..6ea3a972d02 100644
--- a/docs/how-to-use-gvfs.md
+++ b/docs/how-to-use-gvfs.md
@@ -102,6 +102,16 @@ In the meantime, you need to place the corresponding bundle jar [`gravitino-gcp-
 In the meantime, you need to place the corresponding bundle jar [`gravitino-aliyun-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/aliyun-bundle/) in the Hadoop environment(typically located in `${HADOOP_HOME}/share/hadoop/common/lib/`).
+
+#### Azure Blob Storage fileset
+
+| Configuration item             | Description                                                                                                                                                                                                          | Default value | Required                                   | Since version    |
+|--------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------|--------------------------------------------|------------------|
+| `fs.gvfs.filesystem.providers` | The file system providers to add. Set it to `abs` if it's an Azure Blob Storage fileset, or a comma-separated string that contains `abs`, like `oss,abs,s3`, to support multiple kinds of filesets including `abs`. | (none)        | Yes                                        | 0.8.0-incubating |
+| `abs-account-name`             | The account name of Azure Blob Storage.                                                                                                                                                                              | (none)        | Yes if it's an Azure Blob Storage fileset. | 0.8.0-incubating |
+| `abs-account-key`              | The account key of Azure Blob Storage.                                                                                                                                                                               | (none)        | Yes if it's an Azure Blob Storage fileset. | 0.8.0-incubating |
+
+Similar to the above, you need to place the corresponding bundle jar [`gravitino-azure-bundle-${version}.jar`](https://repo1.maven.org/maven2/org/apache/gravitino/azure-bundle/) in the Hadoop environment (typically located in `${HADOOP_HOME}/share/hadoop/common/lib/`).
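+
+A minimal sketch of the client-side configuration for reading an Azure Blob Storage fileset through GVFS follows; the keys mirror the test setup in this change, while the server URI, metalake, fileset path, and credential values are placeholders:
+
+```java
+Configuration conf = new Configuration();
+conf.set("fs.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.GravitinoVirtualFileSystem");
+conf.set("fs.AbstractFileSystem.gvfs.impl", "org.apache.gravitino.filesystem.hadoop.Gvfs");
+conf.set("fs.gravitino.server.uri", "http://localhost:8090"); // placeholder
+conf.set("fs.gravitino.client.metalake", "test_metalake");    // placeholder
+conf.set("fs.gvfs.filesystem.providers", "abs");
+// Passed through to the underlying ABFS file system.
+conf.set("abs-account-name", "my-account");                   // placeholder
+conf.set("abs-account-key", "my-account-key");                // placeholder
+Path filesetPath = new Path("gvfs://fileset/test_catalog/test_schema/test_fileset/dir");
+FileSystem fs = filesetPath.getFileSystem(conf);
+```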
 
 #### Custom fileset
 Since 0.7.0-incubating, users can define their own fileset type and configure the corresponding properties, for more, please refer to [Custom Fileset](./hadoop-catalog.md#how-to-custom-your-own-hcfs-file-system-fileset).
 So, if you want to access the custom fileset through GVFS, you need to configure the corresponding properties.
 
@@ -111,8 +121,6 @@
 | `fs.gvfs.filesystem.providers` | The file system providers. please set it to the value of `YourCustomFileSystemProvider#name` | (none) | Yes | 0.7.0-incubating |
 | `your-custom-properties` | The properties will be used to create a FileSystem instance in `CustomFileSystemProvider#getFileSystem` | (none) | No | - |
-
-
 You can configure these properties in two ways:
 
 1. Before obtaining the `FileSystem` in the code, construct a `Configuration` object and set its properties:
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 79f953fa9a0..4b7441ea297 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -34,6 +34,7 @@ hive2 = "2.3.9"
 hadoop2 = "2.10.2"
 hadoop3 = "3.3.0"
 hadoop3-gcs = "1.9.4-hadoop3"
+hadoop3-abs = "3.3.0"
 hadoop3-aliyun = "3.3.0"
 hadoop-minikdc = "3.3.0"
 htrace-core4 = "4.1.0-incubating"
@@ -170,6 +171,7 @@ hadoop3-client = { group = "org.apache.hadoop", name = "hadoop-client", version.
 hadoop3-minicluster = { group = "org.apache.hadoop", name = "hadoop-minicluster", version.ref = "hadoop-minikdc"}
 hadoop3-gcs = { group = "com.google.cloud.bigdataoss", name = "gcs-connector", version.ref = "hadoop3-gcs"}
 hadoop3-oss = { group = "org.apache.hadoop", name = "hadoop-aliyun", version.ref = "hadoop3-aliyun"}
+hadoop3-abs = { group = "org.apache.hadoop", name = "hadoop-azure", version.ref = "hadoop3-abs"}
 htrace-core4 = { group = "org.apache.htrace", name = "htrace-core4", version.ref = "htrace-core4" }
 airlift-json = { group = "io.airlift", name = "json", version.ref = "airlift-json"}
 airlift-resolver = { group = "io.airlift.resolver", name = "resolver", version.ref = "airlift-resolver"}
diff --git a/settings.gradle.kts b/settings.gradle.kts
index 1f3efb49544..2cde39c222b 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -74,3 +74,4 @@ include("integration-test-common")
 include(":bundles:aws-bundle")
 include(":bundles:gcp-bundle")
 include(":bundles:aliyun-bundle")
+include(":bundles:azure-bundle")