Merge pull request #53 from civitaspo/develop

v0.2.0
civitaspo · Jul 16, 2019 · 1a98ee8 · 1a98ee8
2 parents ba6df44 + f0f8b43
commit 1a98ee8
Show file tree

Hide file tree

Showing 35 changed files with 2,285 additions and 1,006 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -11,11 +11,6 @@ references:
       TERM: dumb
 
 jobs:
-  spotless:
-    <<: *environment
-    steps:
-      - checkout
-      - run: ./gradlew spotlessCheck
 
   build:
     <<: *environment
@@ -36,6 +31,4 @@ workflows:
   merge-before:
     jobs:
       - build
-      - spotless
-
 
diff --git a/.scalafmt.conf b/.scalafmt.conf
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,25 @@
+0.2.0 (2019-07-16)
+==================
+
+* [New Feature] Add `athena.apas>` operator.
+* [Enhancement] Use scala 2.13.0
+* [New Feature] `athena.add_partition>` operator
+* [New Feature] `athena.drop_partition>` operator
+* [New Feature] `athena.drop_table>` operator
+* [Enhancement] Suppress aws-java-sdk log
+* [Note] Use aws-java-sdk-glue for catalog only operations.
+* [Breaking Change - `athena.query>`] `preview` option is `false` by default.
+* [Enhancement - `athena.ctas>`] Remove `;` from the query.
+* [Enhancement] Create wrappers for aws-java-sdk for the readability and the separation of responsibilities.
+    * The change of STS has a possibility to break the backward compatibility of `assume role` behavior.
+* [Enhancement] Introduce region variable for `Aws` to resolve region according to `auth_method` option.
+* [New Feature] Add `workgroup` option.
+* [Breaking Change - `athena.query`] Remove the `output` option as the deprecation is notified from before.
+* [Deprecated - `athena.ctas>`] Make `select_query` deprecated.
+* [Note] Introduce `pro.civitaspo.digdag.plugin.athena.aws` package to divide dependencies about aws.
+* [Note] Use the Intellij formatter instead of spotless, so remove spotless from CI.
+
+
 0.1.5 (2018-12-11)
 ==================
 

diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ _export:
     repositories:
       - https://jitpack.io
     dependencies:
-      - pro.civitaspo:digdag-operator-athena:0.1.5
+      - pro.civitaspo:digdag-operator-athena:0.2.0
   athena:
     auth_method: profile
 
@@ -80,21 +80,83 @@ Define the below options on properties (which is indicated by `-c`, `--config`).
 - **region**: The AWS region to use for Athena service. (string, optional)
 - **endpoint**: The Amazon Athena endpoint address to use. (string, optional)
 
+## Configuration for `athena.add_partition>` operator
+
+### Options
+
+- **database**: The name of the database. (string, required)
+- **table**: The name of the partitioned table. (string, required)
+- **location**: The location of the partition. If not specified, this operator generates like hive automatically. (string, default: auto generated like the below)
+    - `${table location}/${partition key1}=${partition value1}/${partition key2}=${partition value2}/...`
+- **partition_kv**: key-value pairs for partitioning (string to string map, required)
+- **save_mode**: The mode to save the partition. (string, default = `"overwrite"`, available values are `"skip_if_exists"`, `"error_if_exists"`, `"overwrite"`)
+- **follow_location**: Skip to add a partition and drop the partition if the location does not exist. (boolean, default: `true`)
+- **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+
+### Output Parameters
+
+Nothing
+
+## Configuration for `athena.drop_partition>` operator
+
+### Options
+
+- **database**: The name of the database. (string, required)
+- **table**: The name of the partitioned table. (string, required)
+- **partition_kv**: key-value pairs for partitioning (string to string map, required)
+- **with_location**: Drop the partition with removing objects on S3 (boolean, default: `false`)
+- **ignore_if_not_exist**: Ignore if the partition does not exist. (boolean, default: `true`)
+- **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+
+### Output Parameters
+
+Nothing
+
+## Configuration for `athena.apas>` operator
+
+`apas` means *Add a partition as select* that creates a partition the query result is stored.
+
+### Options
+
+- **athena.apas>**: The select SQL statements or file location (in local or Amazon S3) to be executed for a new table by [`Create Table As Select`]((https://aws.amazon.com/jp/about-aws/whats-new/2018/10/athena_ctas_support/)). You can use digdag's template engine like `${...}` in the SQL query. (string, required)
+- **database**: The name of the database that has the partitioned table. (string, required)
+- **table**: The name of the partitioned table. (string, required)
+- **workgroup**: The name of the workgroup in which the query is being started. (string, optional)
+- **partition_kv**: key-value pairs for partitioning (string to string map, required)
+- **location**: The location of the partition. If not specified, this operator generates like hive automatically. (string, default: auto generated like the below)
+    - `${table location}/${partition key1}=${partition value1}/${partition key2}=${partition value2}/...`
+- **save_mode**: Specify the expected behavior. Available values are `"skip_if_exists"`, `"error_if_exists"`, `"ignore"`, `"overwrite"`. See the below explanation of the behaviour. (string, default: `"overwrite"`)
+    - `"skip_if_exists"`: Skip processing if the partition or the location exists.
+    - `"error_if_exists"`: Raise error if the partition or the location exists.
+    - `"overwrite"`: Always recreate the partition and the location if exists. This operation is not atomic.
+- **bucketed_by**: An array list of buckets to bucket data. If omitted, Athena does not bucket your data in this query. (array of string, optional)
+- **bucket_count**: The number of buckets for bucketing your data. If omitted, Athena does not bucket your data. (integer, optional)
+- **additional_properties**: Additional properties for CTAS that is used `athena.apas>` internally. These are used for CTAS WITH clause without escaping. (string to string map, optional)
+- **ignore_schema_diff**: Ignore if the schema of the query result is different from tha table. (boolean, default: `false`)
+- **token_prefix**: Prefix for `ClientRequestToken` that a unique case-sensitive string used to ensure the request to create the query is idempotent (executes only once). On this plugin, the token is composed like `${token_prefix}-${session_uuid}-${hash value of query}-${radom string}`. (string, default: `"digdag-athena-apas"`)
+- **timeout**: Specify timeout period. (`DurationParam`, default: `"10m"`)
+- **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+
+### Output Parameters
+
+Nothing
+
 ## Configuration for `athena.query>` operator
 
 ### Options
 
 - **athena.query>**: The SQL query statements or file location (in local or Amazon S3) to be executed. You can use digdag's template engine like `${...}` in the SQL query. (string, required)
 - **token_prefix**: Prefix for `ClientRequestToken` that a unique case-sensitive string used to ensure the request to create the query is idempotent (executes only once). On this plugin, the token is composed like `${token_prefix}-${session_uuid}-${hash value of query}-${random string}`. (string, default: `"digdag-athena"`)
 - **database**: The name of the database. (string, optional)
-- **output**: The location in Amazon S3 where your query results are stored, such as `"s3://path/to/query/"`. For more information, see [Queries and Query Result Files](https://docs.aws.amazon.com/athena/latest/ug/querying.html). (string, default: `"s3://aws-athena-query-results-${AWS_ACCOUNT_ID}-<AWS_REGION>"`)
+- **workgroup**: The name of the workgroup in which the query is being started. (string, optional)
 - **timeout**: Specify timeout period. (`DurationParam`, default: `"10m"`)
 - **preview**: Call `athena.preview>` operator after run `athena.query>`. (boolean, default: `true`)
 
 ### Output Parameters
 
 - **athena.last_query.id**: The unique identifier for each query execution. (string)
 - **athena.last_query.database**: The name of the database. (string)
+- **athena.last_query.workgroup**: The name of the workgroup in which the query is being started. (string)
 - **athena.last_query.query**: The SQL query statements which the query execution ran. (string)
 - **athena.last_query.output**: The location in Amazon S3 where your query results are stored. (string)
 - **athena.last_query.scan_bytes**: The number of bytes in the data that was queried. (long)
@@ -131,9 +193,10 @@ Define the below options on properties (which is indicated by `-c`, `--config`).
 
 ### Options
 
-- **select_query**: The select SQL statements or file location (in local or Amazon S3) to be executed for a new table by [`Create Table As Select`]((https://aws.amazon.com/jp/about-aws/whats-new/2018/10/athena_ctas_support/)). You can use digdag's template engine like `${...}` in the SQL query. (string, required)
+- **athena.ctas>**: The select SQL statements or file location (in local or Amazon S3) to be executed for a new table by [`Create Table As Select`]((https://aws.amazon.com/jp/about-aws/whats-new/2018/10/athena_ctas_support/)). You can use digdag's template engine like `${...}` in the SQL query. (string, required)
 - **database**: The database name for query execution context. (string, optional)
 - **table**: The table name for the new table (string, default: `digdag_athena_ctas_${session_uuid.replaceAll("-", "")}_${random}`)
+- **workgroup**: The name of the workgroup in which the query is being started. (string, optional)
 - **output**: Output location for data created by CTAS (string, default: `"s3://aws-athena-query-results-${AWS_ACCOUNT_ID}-<AWS_REGION>/Unsaved/${YEAR}/${MONTH}/${DAY}/${athena_query_id}/"`)
 - **format**: The data format for the CTAS query results, such as `"orc"`, `"parquet"`, `"avro"`, `"json"`, or `"textfile"`. (string, default: `"parquet"`)
 - **compression**: The compression type to use for `"orc"` or `"parquet"`. (string, default: `"snappy"`)
@@ -144,7 +207,7 @@ Define the below options on properties (which is indicated by `-c`, `--config`).
 - **additional_properties**: Additional properties for CTAS. These are used for CTAS WITH clause without escaping. (string to string map, optional)
 - **table_mode**: Specify the expected behavior of CTAS results. Available values are `"default"`, `"empty"`, `"data_only"`. See the below explanation of the behaviour. (string, default: `"default"`)
   - `"default"`: Do not do any care. This option require the least IAM privileges for digdag, but the behaviour depends on Athena.
-  - `"empty_table"`: Create a new empty table with the same schema as the select query results.
+  - `"empty"`: Create a new empty table with the same schema as the select query results.
   - `"data_only"`: Create a new table with data by CTAS, but drop this after CTAS execution. The table created by CTAS is an external table, so the data is left even if the table is dropped.
 - **save_mode**: Specify the expected behavior of CTAS. Available values are `"none"`, `"error_if_exists"`, `"ignore"`, `"overwrite"`. See the below explanation of the behaviour. (string, default: `"overwrite"`)
   - `"none"`: Do not do any care. This option require the least IAM privileges for digdag, but the behaviour depends on Athena.
@@ -158,6 +221,18 @@ Define the below options on properties (which is indicated by `-c`, `--config`).
 
 Nothing
 
+## Configuration for `athena.drop_table>` operator
+
+- **database**: The name of the database. (string, required)
+- **table**: The name of the partitioned table. (string, required)
+- **with_location**: Drop the partition with removing objects on S3 (boolean, default: `false`)
+- **ignore_if_not_exist**: Ignore if the partition does not exist. (boolean, default: `true`)
+- **catalog_id**: glue data catalog id if you use a catalog different from account/region default catalog. (string, optional)
+
+### Output Parameters
+
+Nothing
+
 # Development
 
 ## Run an Example

diff --git a/build.gradle b/build.gradle
@@ -1,17 +1,16 @@
 plugins {
     id 'scala'
     id 'maven-publish'
-    id 'com.github.johnrengelman.shadow' version '2.0.2'
-    id "com.diffplug.gradle.spotless" version "3.13.0"
+    id 'com.github.johnrengelman.shadow' version '5.1.0'
 }
 
 group = 'pro.civitaspo'
-version = '0.1.5'
+version = '0.2.0'
 
-def digdagVersion = '0.9.27'
-def awsSdkVersion = "1.11.372"
-def scalaSemanticVersion = "2.12.6"
-def depScalaVersion = "2.12"
+def digdagVersion = '0.9.37'
+def awsSdkVersion = "1.11.587"
+def scalaSemanticVersion = "2.13.0"
+def depScalaVersion = "2.13"
 
 repositories {
     mavenCentral()
@@ -33,6 +32,8 @@ dependencies {
     compile group: 'com.amazonaws', name: 'aws-java-sdk-s3', version: awsSdkVersion
     // https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-sts
     compile group: 'com.amazonaws', name: 'aws-java-sdk-sts', version: awsSdkVersion
+    // https://mvnrepository.com/artifact/com.amazonaws/aws-java-sdk-glue
+    compile group: 'com.amazonaws', name: 'aws-java-sdk-glue', version: awsSdkVersion
 }
 
 shadowJar {
@@ -55,12 +56,6 @@ publishing {
     }
 }
 
-spotless {
-    scala {
-        scalafmt('1.5.1').configFile('.scalafmt.conf')
-    }
-}
-
 sourceCompatibility = 1.8
 targetCompatibility = 1.8
 

diff --git a/example/example.dig b/example/example.dig
@@ -4,7 +4,7 @@ _export:
       - file://${repos}
       # - https://jitpack.io
     dependencies:
-      - pro.civitaspo:digdag-operator-athena:0.1.5
+      - pro.civitaspo:digdag-operator-athena:0.2.0
   athena:
     auth_method: profile
     value: 5
@@ -15,10 +15,78 @@ _export:
 +step2:
   echo>: ${athena}
 
-+stap3:
-  athena.ctas>:
-  select_query: template.sql
++step3:
+  athena.preview>: ${athena.last_query.id}
+
++stap4:
+  athena.ctas>: template.sql
+  database: ${database}
+  table: hoge
+  output: ${output}
+
++step5:
+  echo>: ${athena}
+
++step6:
+  athena.drop_table>:
+  database: ${database}
+  table: hoge
+  with_location: true
+
++step7:
+  athena.ctas>: select 1 as a, 2 as b, 3 as c union all select 4 as a, 5 as b, 6 as c
   database: ${database}
   table: hoge
   output: ${output}
+  partitioned_by: [b, c]
 
++step8:
+  athena.drop_partition>:
+  database: ${database}
+  table: hoge
+  partition_kv:
+    b: "5"
+    c: "6"
+
++step9:
+  athena.add_partition>:
+  database: ${database}
+  table: hoge
+  partition_kv:
+    b: "5"
+    c: "6"
+
++step10:
+  athena.add_partition>:
+  database: ${database}
+  table: hoge
+  partition_kv:
+    b: "5"
+    c: "6"
+
++step11:
+  athena.add_partition>:
+  database: ${database}
+  table: hoge
+  partition_kv:
+    b: "5"
+    c: "6"
+  location: ${output}/hoge/fuga/hogo/
+
++step12:
+  athena.drop_partition>:
+  database: ${database}
+  table: hoge
+  with_location: true
+  partition_kv:
+    b: "2"
+    c: "3"
+
++step13:
+  athena.apas>: select 5 as a
+  database: ${database}
+  table: hoge
+  partition_kv:
+    b: "9"
+    c: "10"
+  save_mode: overwrite
diff --git a/example/run.sh b/example/run.sh
@@ -5,6 +5,8 @@ EXAMPLE_ROOT=$ROOT/example
 LOCAL_MAVEN_REPO=$ROOT/build/repo
 DATABASE="$1"
 OUTPUT="$2"
+WORKGROUP="$3"
+PARAM_OPTION=""
 
 if [ -z "$DATABASE" ]; then
     echo "[ERROR] Set database as the first argument."
@@ -14,6 +16,9 @@ if [ -z "$OUTPUT" ]; then
     echo "[ERROR] Set output s3 URI as the second argument."
     exit 1
 fi
+if [ -n "$WORKGROUP" ]; then
+    PARAM_OPTION="-p workgroup=$WORKGROUP"
+fi
 
 (
   cd $EXAMPLE_ROOT
@@ -22,5 +27,5 @@ fi
   rm -rfv .digdag
 
   ## run
-  digdag run example.dig -c allow.properties -p repos=${LOCAL_MAVEN_REPO} -p output=${OUTPUT} -p database=${DATABASE} --no-save
+  digdag run example.dig -c allow.properties -p repos=${LOCAL_MAVEN_REPO} -p output=${OUTPUT} -p database=${DATABASE} $PARAM_OPTION --no-save
 )
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-5.5-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists