Skip to content

Commit

Permalink
Merge branch 'main' into ARROW-42030
Browse files Browse the repository at this point in the history
  • Loading branch information
llama90 authored Jun 10, 2024
2 parents 64f8b60 + 0ff17bf commit 2ad2717
Showing 223 changed files with 5,703 additions and 4,293 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -102,8 +102,8 @@ __debug_bin
.envrc

# Develocity
.mvn/.gradle-enterprise/
.mvn/.develocity/
java/.mvn/.gradle-enterprise/
java/.mvn/.develocity/

# rat
filtered_rat.txt
14 changes: 14 additions & 0 deletions cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
@@ -117,6 +117,8 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
credential_kind = CredentialKind::kDefault;
} else if (kv.second == "anonymous") {
credential_kind = CredentialKind::kAnonymous;
} else if (kv.second == "cli") {
credential_kind = CredentialKind::kCLI;
} else if (kv.second == "workload_identity") {
credential_kind = CredentialKind::kWorkloadIdentity;
} else if (kv.second == "environment") {
@@ -170,6 +172,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
case CredentialKind::kAnonymous:
RETURN_NOT_OK(ConfigureAnonymousCredential());
break;
case CredentialKind::kCLI:
RETURN_NOT_OK(ConfigureCLICredential());
break;
case CredentialKind::kWorkloadIdentity:
RETURN_NOT_OK(ConfigureWorkloadIdentityCredential());
break;
@@ -255,6 +260,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const {
return storage_shared_key_credential_->AccountName ==
other.storage_shared_key_credential_->AccountName;
case CredentialKind::kClientSecret:
case CredentialKind::kCLI:
case CredentialKind::kManagedIdentity:
case CredentialKind::kWorkloadIdentity:
case CredentialKind::kEnvironment:
@@ -337,6 +343,12 @@ Status AzureOptions::ConfigureManagedIdentityCredential(const std::string& clien
return Status::OK();
}

/// Select Azure CLI based authentication for these options.
///
/// Marks the credential kind as CredentialKind::kCLI and replaces the token
/// credential with a default-constructed Azure::Identity::AzureCliCredential.
///
/// \return Status::OK() once both members have been updated.
Status AzureOptions::ConfigureCLICredential() {
  using CliCredential = Azure::Identity::AzureCliCredential;

  credential_kind_ = CredentialKind::kCLI;
  token_credential_ = std::make_shared<CliCredential>();
  return Status::OK();
}

Status AzureOptions::ConfigureWorkloadIdentityCredential() {
credential_kind_ = CredentialKind::kWorkloadIdentity;
token_credential_ = std::make_shared<Azure::Identity::WorkloadIdentityCredential>();
@@ -364,6 +376,7 @@ Result<std::unique_ptr<Blobs::BlobServiceClient>> AzureOptions::MakeBlobServiceC
[[fallthrough]];
case CredentialKind::kClientSecret:
case CredentialKind::kManagedIdentity:
case CredentialKind::kCLI:
case CredentialKind::kWorkloadIdentity:
case CredentialKind::kEnvironment:
return std::make_unique<Blobs::BlobServiceClient>(AccountBlobUrl(account_name),
@@ -391,6 +404,7 @@ AzureOptions::MakeDataLakeServiceClient() const {
[[fallthrough]];
case CredentialKind::kClientSecret:
case CredentialKind::kManagedIdentity:
case CredentialKind::kCLI:
case CredentialKind::kWorkloadIdentity:
case CredentialKind::kEnvironment:
return std::make_unique<DataLake::DataLakeServiceClient>(
11 changes: 7 additions & 4 deletions cpp/src/arrow/filesystem/azurefs.h
Original file line number Diff line number Diff line change
@@ -119,6 +119,7 @@ struct ARROW_EXPORT AzureOptions {
kStorageSharedKey,
kClientSecret,
kManagedIdentity,
kCLI,
kWorkloadIdentity,
kEnvironment,
} credential_kind_ = CredentialKind::kDefault;
@@ -160,14 +161,15 @@ struct ARROW_EXPORT AzureOptions {
/// * blob_storage_authority: Set AzureOptions::blob_storage_authority
/// * dfs_storage_authority: Set AzureOptions::dfs_storage_authority
/// * enable_tls: If it's "false" or "0", HTTP not HTTPS is used.
/// * credential_kind: One of "default", "anonymous",
/// "workload_identity" or "environment". If "default" is specified, it's
/// * credential_kind: One of "default", "anonymous", "workload_identity",
/// "environment" or "cli". If "default" is specified, it's
/// just ignored. If "anonymous" is specified,
/// AzureOptions::ConfigureAnonymousCredential() is called. If
/// "workload_identity" is specified,
/// AzureOptions::ConfigureWorkloadIdentityCredential() is called, If
/// AzureOptions::ConfigureWorkloadIdentityCredential() is called. If
/// "environment" is specified,
/// AzureOptions::ConfigureEnvironmentCredential() is called.
/// AzureOptions::ConfigureEnvironmentCredential() is called. If "cli" is
/// specified, AzureOptions::ConfigureCLICredential() is called.
/// * tenant_id: You must specify "client_id" and "client_secret"
/// too. AzureOptions::ConfigureClientSecretCredential() is called.
/// * client_id: If you don't specify "tenant_id" and
@@ -190,6 +192,7 @@ struct ARROW_EXPORT AzureOptions {
const std::string& client_id,
const std::string& client_secret);
Status ConfigureManagedIdentityCredential(const std::string& client_id = std::string());
Status ConfigureCLICredential();
Status ConfigureWorkloadIdentityCredential();
Status ConfigureEnvironmentCredential();

17 changes: 17 additions & 0 deletions cpp/src/arrow/filesystem/azurefs_test.cc
Original file line number Diff line number Diff line change
@@ -521,6 +521,13 @@ TEST(AzureFileSystem, InitializeWithManagedIdentityCredential) {
EXPECT_OK_AND_ASSIGN(fs, AzureFileSystem::Make(options));
}

// Checks that ConfigureCLICredential() reports success and that
// AzureFileSystem::Make() accepts options configured that way.
TEST(AzureFileSystem, InitializeWithCLICredential) {
  AzureOptions cli_options;
  cli_options.account_name = "dummy-account-name";
  ARROW_EXPECT_OK(cli_options.ConfigureCLICredential());
  EXPECT_OK_AND_ASSIGN(auto filesystem, AzureFileSystem::Make(cli_options));
}

TEST(AzureFileSystem, InitializeWithWorkloadIdentityCredential) {
AzureOptions options;
options.account_name = "dummy-account-name";
@@ -667,6 +674,15 @@ class TestAzureOptions : public ::testing::Test {
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kManagedIdentity);
}

// A URI whose query string carries "credential_kind=cli" must produce options
// whose credential kind is CredentialKind::kCLI.
void TestFromUriCredentialCLI() {
  const std::string uri_with_cli_credential =
      "abfs://account.blob.core.windows.net/container/dir/blob?"
      "credential_kind=cli";
  ASSERT_OK_AND_ASSIGN(auto options,
                       AzureOptions::FromUri(uri_with_cli_credential, nullptr));
  ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kCLI);
}

void TestFromUriCredentialWorkloadIdentity() {
ASSERT_OK_AND_ASSIGN(
auto options,
@@ -733,6 +749,7 @@ TEST_F(TestAzureOptions, FromUriCredentialClientSecret) {
TEST_F(TestAzureOptions, FromUriCredentialManagedIdentity) {
TestFromUriCredentialManagedIdentity();
}
// Runs the fixture's TestFromUriCredentialCLI() check as its own gtest case
// on the TestAzureOptions fixture.
TEST_F(TestAzureOptions, FromUriCredentialCLI) { TestFromUriCredentialCLI(); }
TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) {
TestFromUriCredentialWorkloadIdentity();
}
46 changes: 44 additions & 2 deletions docs/source/developers/java/development.rst
Original file line number Diff line number Diff line change
@@ -110,7 +110,46 @@ integration tests, you would do:
Code Style
==========

Java code style is enforced with Checkstyle. The configuration is located at `checkstyle`_.
The current Java code follows the `Google Java Style`_ with Apache license headers.

Java code style is checked by `Spotless`_ during the build, and the continuous integration build will verify
that changes adhere to the style guide.

Automatically fixing code style issues
--------------------------------------

- You can check the style without building the project with ``mvn spotless:check``.
- You can autoformat the source with ``mvn spotless:apply``.

Example:

.. code-block:: bash
The following files had format violations:
src/main/java/org/apache/arrow/algorithm/rank/VectorRank.java
@@ -15,7 +15,6 @@
·*·limitations·under·the·License.
·*/
-
package·org.apache.arrow.algorithm.rank;
import·java.util.stream.IntStream;
Run 'mvn spotless:apply' to fix these violations.
Code Formatter for IntelliJ IDEA and Eclipse
--------------------------------------------

Follow the instructions to set up google-java-format for:

- `Eclipse`_
- `IntelliJ`_


Checkstyle
----------

Checkstyle is also used for general linting. The configuration is located at `checkstyle`_.
You can also just check the style without building the project.
This checks the code style of all source code under the current directory or from within an individual module.

@@ -137,7 +176,10 @@ This applies the style to all pom.xml files under the current directory or from
.. _conbench: https://github.com/conbench/conbench
.. _checkstyle: https://github.com/apache/arrow/blob/main/java/dev/checkstyle/checkstyle.xml
.. _Apache Maven pom.xml guidelines: https://maven.apache.org/developers/conventions/code.html#pom-code-convention

.. _Spotless: https://github.com/diffplug/spotless
.. _Google Java Style: https://google.github.io/styleguide/javaguide.html
.. _Eclipse: https://github.com/google/google-java-format?tab=readme-ov-file#eclipse
.. _IntelliJ: https://github.com/google/google-java-format?tab=readme-ov-file#intellij-android-studio-and-other-jetbrains-ides

Build Caching
=============
2 changes: 1 addition & 1 deletion go/README.md
Original file line number Diff line number Diff line change
@@ -48,7 +48,7 @@ func main() {

DSN option keys are expressed as `k=v`, delimited with `;`.
Some option keys are defined in ADBC; others are defined in the FlightSQL ADBC driver.
- Arrow ADBC [developer doc](https://arrow.apache.org/adbc/main/driver/go/flight_sql.html#client-options)
- Arrow ADBC [developer doc](https://arrow.apache.org/adbc/main/driver/flight_sql.html#client-options)
- ADBC [source code](https://github.com/apache/arrow-adbc/blob/3d12fad1bae21029a8ff25604d6e65760c3f65bd/go/adbc/adbc.go#L149-L158)
- FlightSQL driver option keys [source code](https://github.com/apache/arrow-adbc/blob/3d12fad1bae21029a8ff25604d6e65760c3f65bd/go/adbc/driver/flightsql/flightsql_adbc.go#L70-L81)

File renamed without changes.
File renamed without changes.
33 changes: 23 additions & 10 deletions java/adapter/avro/pom.xml
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
license agreements. See the NOTICE file distributed with this work for additional
information regarding copyright ownership. The ASF licenses this file to
You under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License. You may obtain a copy of
the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
by applicable law or agreed to in writing, software distributed under the
License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License. -->
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

@@ -24,6 +32,11 @@
<description>(Contrib/Experimental) A library for converting Avro data to Arrow data.</description>
<url>http://maven.apache.org</url>

<properties>
<checkstyle.config.location>dev/checkstyle/checkstyle-spotless.xml</checkstyle.config.location>
<spotless.java.excludes>none</spotless.java.excludes>
</properties>

<dependencies>
<dependency>
<groupId>org.apache.arrow</groupId>
Original file line number Diff line number Diff line change
@@ -14,24 +14,20 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.adapter.avro;

import java.io.IOException;

import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.avro.Schema;
import org.apache.avro.io.Decoder;

/**
* Utility class to convert Avro objects to columnar Arrow format objects.
*/
/** Utility class to convert Avro objects to columnar Arrow format objects. */
public class AvroToArrow {

/**
* Fetch the data from {@link Decoder} and convert it to Arrow objects.
* Only for testing purpose.
* Fetch the data from {@link Decoder} and convert it to Arrow objects. Only for testing purpose.
*
* @param schema avro schema.
* @param decoder avro decoder
* @param config configuration of the conversion.
@@ -48,15 +44,14 @@ static VectorSchemaRoot avroToArrow(Schema schema, Decoder decoder, AvroToArrowC

/**
* Fetch the data from {@link Decoder} and iteratively convert it to Arrow objects.
*
* @param schema avro schema
* @param decoder avro decoder
* @param config configuration of the conversion.
* @throws IOException on error
*/
public static AvroToArrowVectorIterator avroToArrowIterator(
Schema schema,
Decoder decoder,
AvroToArrowConfig config) throws IOException {
Schema schema, Decoder decoder, AvroToArrowConfig config) throws IOException {

Preconditions.checkNotNull(schema, "Avro schema object cannot be null");
Preconditions.checkNotNull(decoder, "Avro decoder object cannot be null");
Original file line number Diff line number Diff line change
@@ -14,40 +14,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.arrow.adapter.avro;

import java.util.Set;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.util.Preconditions;
import org.apache.arrow.vector.dictionary.DictionaryProvider;

/**
* This class configures the Avro-to-Arrow conversion process.
*/
/** This class configures the Avro-to-Arrow conversion process. */
public class AvroToArrowConfig {

private final BufferAllocator allocator;
/**
* The maximum rowCount to read each time when partially convert data.
* Default value is 1024 and -1 means read all data into one vector.
* The maximum rowCount to read each time when partially convert data. Default value is 1024 and
* -1 means read all data into one vector.
*/
private final int targetBatchSize;

/**
* The dictionary provider used for enum type.
* If avro schema has enum type, will create dictionary and update this provider.
* The dictionary provider used for enum type. If avro schema has enum type, will create
* dictionary and update this provider.
*/
private final DictionaryProvider.MapDictionaryProvider provider;

/**
* The field names which to skip when reading decoder values.
*/
/** The field names which to skip when reading decoder values. */
private final Set<String> skipFieldNames;

/**
* Instantiate an instance.
*
* @param allocator The memory allocator to construct the Arrow vectors with.
* @param targetBatchSize The maximum rowCount to read each time when partially convert data.
* @param provider The dictionary provider used for enum type, adapter will update this provider.
@@ -59,8 +54,10 @@ public class AvroToArrowConfig {
DictionaryProvider.MapDictionaryProvider provider,
Set<String> skipFieldNames) {

Preconditions.checkArgument(targetBatchSize == AvroToArrowVectorIterator.NO_LIMIT_BATCH_SIZE ||
targetBatchSize > 0, "invalid targetBatchSize: %s", targetBatchSize);
Preconditions.checkArgument(
targetBatchSize == AvroToArrowVectorIterator.NO_LIMIT_BATCH_SIZE || targetBatchSize > 0,
"invalid targetBatchSize: %s",
targetBatchSize);

this.allocator = allocator;
this.targetBatchSize = targetBatchSize;
Loading

0 comments on commit 2ad2717

Please sign in to comment.