diff --git a/.github/workflows/spanner-staging-tests.yml b/.github/workflows/spanner-staging-tests.yml new file mode 100644 index 0000000000..51c22a991e --- /dev/null +++ b/.github/workflows/spanner-staging-tests.yml @@ -0,0 +1,51 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Spanner Staging integration tests + +on: + workflow_dispatch: + +permissions: read-all + +jobs: + spanner_java_integration_tests_templates: + name: Spanner Dataflow Templates Integration Tests + timeout-minutes: 180 + # Run on any runner that matches all the specified runs-on values. 
+ runs-on: [ self-hosted, it ] + steps: + - name: Checkout Code + uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2.7.0 + - name: Setup Environment + id: setup-env + uses: ./.github/actions/setup-env + - name: Run Integration Tests + run: | + ./cicd/run-it-tests \ + --modules-to-build="ALL" \ + --it-region="us-central1" \ + --it-project="cloud-teleport-testing" \ + --it-artifact-bucket="cloud-teleport-testing-it-gitactions" \ + --it-private-connectivity="datastream-private-connect-us-central1" \ + --it-spanner-host="https://staging-wrenchworks.sandbox.googleapis.com/" + - name: Upload Integration Tests Report + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2 + if: always() # always run even if the previous step fails + with: + name: surefire-test-results + path: '**/surefire-reports/TEST-*.xml' + retention-days: 1 + - name: Cleanup Java Environment + uses: ./.github/actions/cleanup-java-env \ No newline at end of file diff --git a/cicd/cmd/run-it-smoke-tests/main.go b/cicd/cmd/run-it-smoke-tests/main.go index e5fa462235..5a2fc17fc1 100644 --- a/cicd/cmd/run-it-smoke-tests/main.go +++ b/cicd/cmd/run-it-smoke-tests/main.go @@ -66,6 +66,7 @@ func main() { flags.ArtifactBucket(), flags.StageBucket(), flags.PrivateConnectivity(), + flags.SpannerHost(), flags.FailureMode(), flags.RetryFailures(), flags.StaticOracleHost(), diff --git a/cicd/cmd/run-it-tests/main.go b/cicd/cmd/run-it-tests/main.go index 24b8145d63..8830edfb9b 100644 --- a/cicd/cmd/run-it-tests/main.go +++ b/cicd/cmd/run-it-tests/main.go @@ -67,6 +67,7 @@ func main() { flags.StageBucket(), flags.HostIp(), flags.PrivateConnectivity(), + flags.SpannerHost(), flags.FailureMode(), flags.RetryFailures(), flags.StaticOracleHost(), diff --git a/cicd/internal/flags/it-flags.go b/cicd/internal/flags/it-flags.go index 1aa55e9a08..6937353983 100644 --- a/cicd/internal/flags/it-flags.go +++ b/cicd/internal/flags/it-flags.go @@ -101,10 +101,10 @@ func 
PrivateConnectivity() string { } func SpannerHost() string { - if dSpannerHost == "" { - return "-DspannerHost=" + "https://staging-wrenchworks.sandbox.googleapis.com/" + if dSpannerHost != "" { + return "-DspannerHost=" + dSpannerHost } - return "-DspannerHost=" + dSpannerHost + return "" } func FailureMode() string { diff --git a/contributor-docs/add-integration-or-load-test.md b/contributor-docs/add-integration-or-load-test.md index ef56147eb5..4c81c39187 100644 --- a/contributor-docs/add-integration-or-load-test.md +++ b/contributor-docs/add-integration-or-load-test.md @@ -408,7 +408,7 @@ vary on whether the pipeline under test is a `Batch` or `Streaming` pipeline and the type of test. ### Structure -First extend the test class from the [LoadTestBase](https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/it/google-cloud-platform/src/main/java/com/google/cloud/teleport/it/gcp/LoadTestBase.java) +First extend the test class from the [LoadTestBase](https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java) class. LoadTestBase contains helper methods which abstract irrelevant information and make it easier to write load tests. It also defines some clients and variables which are useful for writing tests. @@ -552,8 +552,8 @@ public void testSteadyState1hr() { ### Exporting Results -After the pipeline finishes successfully, we can get the performance metrics using [getMetrics](https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/it/google-cloud-platform/src/main/java/com/google/cloud/teleport/it/gcp/LoadTestBase.java#L272) -method and export the results to BigQuery by calling the [exportMetricsToBigQuery](https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/it/google-cloud-platform/src/main/java/com/google/cloud/teleport/it/gcp/LoadTestBase.java#L127) method. 
+After the pipeline finishes successfully, we can get the performance metrics using [getMetrics](https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java#L279) +method and export the results to BigQuery by calling the [exportMetricsToBigQuery](https://github.com/GoogleCloudPlatform/DataflowTemplates/blob/main/it/google-cloud-platform/src/main/java/org/apache/beam/it/gcp/LoadTestBase.java#L139) method. The BigQuery project, dataset, and table to be used to export the data can be specified in the command line using, * `-DexportProject` - BigQuery Project to export metrics (optional, if not provided `-Dproject` is used) diff --git a/metadata/src/main/java/com/google/cloud/teleport/metadata/SpannerStagingTest.java b/metadata/src/main/java/com/google/cloud/teleport/metadata/SpannerStagingTest.java new file mode 100644 index 0000000000..48947f267d --- /dev/null +++ b/metadata/src/main/java/com/google/cloud/teleport/metadata/SpannerStagingTest.java @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.metadata; + +/** Annotation that marks the test as a Spanner staging test. 
*/ +public @interface SpannerStagingTest {} diff --git a/plugins/core-plugin/src/main/java/com/google/cloud/teleport/plugin/model/ImageSpecParameter.java b/plugins/core-plugin/src/main/java/com/google/cloud/teleport/plugin/model/ImageSpecParameter.java index 979496c892..f1125b33ac 100644 --- a/plugins/core-plugin/src/main/java/com/google/cloud/teleport/plugin/model/ImageSpecParameter.java +++ b/plugins/core-plugin/src/main/java/com/google/cloud/teleport/plugin/model/ImageSpecParameter.java @@ -652,7 +652,7 @@ protected void processDescriptions( this.setHelpText(helpText); if (example != null && !example.isEmpty()) { - this.setHelpText(this.getHelpText() + " (Example: " + example + ")"); + this.setHelpText(this.getHelpText() + " For example, `" + example + "`"); } } } diff --git a/plugins/core-plugin/src/main/resources/README-template.md b/plugins/core-plugin/src/main/resources/README-template.md index a3b01c3af5..2c80e28a2d 100644 --- a/plugins/core-plugin/src/main/resources/README-template.md +++ b/plugins/core-plugin/src/main/resources/README-template.md @@ -21,12 +21,12 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -<#list spec.metadata.parameters as parameter><#if !parameter.optional!false>* **${parameter.name}** : ${parameter.helpText?ensure_ends_with(".")} +<#list spec.metadata.parameters as parameter><#if !parameter.optional!false>* **${parameter.name}**: ${parameter.helpText?ensure_ends_with(".")} ### Optional parameters -<#list spec.metadata.parameters as parameter><#if parameter.optional!false>* **${parameter.name}** : ${parameter.helpText?ensure_ends_with(".")} +<#list spec.metadata.parameters as parameter><#if parameter.optional!false>* **${parameter.name}**: ${parameter.helpText?ensure_ends_with(".")} diff --git a/pom.xml b/pom.xml index 297ac1ae3d..04b1d3d6d1 100644 --- a/pom.xml +++ b/pom.xml @@ -106,6 +106,7 @@ com.google.cloud.teleport.metadata.TemplateLoadTest 
com.google.cloud.teleport.metadata.DirectRunnerTest + com.google.cloud.teleport.metadata.SpannerStagingTest JAVA_LICENSE_HEADER @@ -458,6 +459,56 @@ + + spannerStagingIntegrationTests + + false + + + + true + + false + + classesAndMethods + 2 + + + + + org.apache.maven.plugins + maven-surefire-plugin + ${surefire.version} + + + + beamPythonVersion + ${beam-python.version} + + + beamJavaVersion + ${beam.version} + + + beamMavenRepo + ${beam-maven-repo} + + + + **/*.java + + + ${spanner.staging.tests} + + true + ${itParallelismType} + ${itParallelism} + false + + + + + templatesLoadTests diff --git a/python/README_Yaml_Template.md b/python/README_Yaml_Template.md index 650ff8b3c0..4d2b7d07bc 100644 --- a/python/README_Yaml_Template.md +++ b/python/README_Yaml_Template.md @@ -25,9 +25,9 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Optional parameters -* **yaml_pipeline** : A yaml description of the pipeline to run. -* **yaml_pipeline_file** : A file in Cloud Storage containing a yaml description of the pipeline to run. -* **jinja_variables** : A json dict of variables used when invoking the jinja preprocessor on the provided yaml pipeline. +* **yaml_pipeline**: A yaml description of the pipeline to run. +* **yaml_pipeline_file**: A file in Cloud Storage containing a yaml description of the pipeline to run. +* **jinja_variables**: A json dict of variables used when invoking the jinja preprocessor on the provided yaml pipeline. diff --git a/v1/README_Bulk_Compress_GCS_Files.md b/v1/README_Bulk_Compress_GCS_Files.md index 8a7aec4625..d1bdce7bd1 100644 --- a/v1/README_Bulk_Compress_GCS_Files.md +++ b/v1/README_Bulk_Compress_GCS_Files.md @@ -27,14 +27,14 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The Cloud Storage location of the files you'd like to process. (Example: gs://your-bucket/your-files/*.txt). 
-* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. (Example: gs://your-bucket/your-path). -* **outputFailureFile** : The error log output file to use for write failures that occur during compression. The contents will be one line for each file which failed compression. Note that this parameter will allow the pipeline to continue processing in the event of a failure. (Example: gs://your-bucket/compressed/failed.csv). -* **compression** : The compression algorithm used to compress the matched files. Valid algorithms: BZIP2, DEFLATE, GZIP. +* **inputFilePattern**: The Cloud Storage location of the files you'd like to process. For example, `gs://your-bucket/your-files/*.txt`. +* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. For example, `gs://your-bucket/your-path`. +* **outputFailureFile**: The error log output file to use for write failures that occur during compression. The contents will be one line for each file which failed compression. Note that this parameter will allow the pipeline to continue processing in the event of a failure. For example, `gs://your-bucket/compressed/failed.csv`. +* **compression**: The compression algorithm used to compress the matched files. Valid algorithms: BZIP2, DEFLATE, GZIP. ### Optional parameters -* **outputFilenameSuffix** : Output filename suffix of the files to write. Defaults to .bzip2, .deflate or .gz depending on the compression algorithm. +* **outputFilenameSuffix**: Output filename suffix of the files to write. Defaults to .bzip2, .deflate or .gz depending on the compression algorithm. 
@@ -211,9 +211,9 @@ resource "google_dataflow_job" "bulk_compress_gcs_files" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://your-bucket/your-files/*.txt" - outputDirectory = "gs://your-bucket/your-path" - outputFailureFile = "gs://your-bucket/compressed/failed.csv" + inputFilePattern = "" + outputDirectory = "" + outputFailureFile = "" compression = "" # outputFilenameSuffix = "" } diff --git a/v1/README_Bulk_Decompress_GCS_Files.md b/v1/README_Bulk_Decompress_GCS_Files.md index b885462bdf..b645a00324 100644 --- a/v1/README_Bulk_Decompress_GCS_Files.md +++ b/v1/README_Bulk_Decompress_GCS_Files.md @@ -26,9 +26,9 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The Cloud Storage location of the files you'd like to process. (Example: gs://your-bucket/your-files/*.gz). -* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. (Example: gs://your-bucket/decompressed/). -* **outputFailureFile** : The output file to write failures to during the decompression process. If there are no failures, the file will still be created but will be empty. The contents will be one line for each file which failed decompression in CSV format (Filename, Error). Note that this parameter will allow the pipeline to continue processing in the event of a failure. (Example: gs://your-bucket/decompressed/failed.csv). +* **inputFilePattern**: The Cloud Storage location of the files you'd like to process. For example, `gs://your-bucket/your-files/*.gz`. +* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. For example, `gs://your-bucket/decompressed/`. 
+* **outputFailureFile**: The output file to write failures to during the decompression process. If there are no failures, the file will still be created but will be empty. The contents will be one line for each file which failed decompression in CSV format (Filename, Error). Note that this parameter will allow the pipeline to continue processing in the event of a failure. For example, `gs://your-bucket/decompressed/failed.csv`. ### Optional parameters @@ -202,9 +202,9 @@ resource "google_dataflow_job" "bulk_decompress_gcs_files" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://your-bucket/your-files/*.gz" - outputDirectory = "gs://your-bucket/decompressed/" - outputFailureFile = "gs://your-bucket/decompressed/failed.csv" + inputFilePattern = "" + outputDirectory = "" + outputFailureFile = "" } } ``` diff --git a/v1/README_Cassandra_To_Cloud_Bigtable.md b/v1/README_Cassandra_To_Cloud_Bigtable.md index f52429a981..e322eed223 100644 --- a/v1/README_Cassandra_To_Cloud_Bigtable.md +++ b/v1/README_Cassandra_To_Cloud_Bigtable.md @@ -23,21 +23,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **cassandraHosts** : The hosts of the Apache Cassandra nodes in a comma-separated list. -* **cassandraKeyspace** : The Apache Cassandra keyspace where the table is located. -* **cassandraTable** : The Apache Cassandra table to copy. -* **bigtableProjectId** : The Google Cloud project ID associated with the Bigtable instance. -* **bigtableInstanceId** : The ID of the Bigtable instance that the Apache Cassandra table is copied to. -* **bigtableTableId** : The name of the Bigtable table that the Apache Cassandra table is copied to. +* **cassandraHosts**: The hosts of the Apache Cassandra nodes in a comma-separated list. +* **cassandraKeyspace**: The Apache Cassandra keyspace where the table is located. +* **cassandraTable**: The Apache Cassandra table to copy. 
+* **bigtableProjectId**: The Google Cloud project ID associated with the Bigtable instance. +* **bigtableInstanceId**: The ID of the Bigtable instance that the Apache Cassandra table is copied to. +* **bigtableTableId**: The name of the Bigtable table that the Apache Cassandra table is copied to. ### Optional parameters -* **cassandraPort** : The TCP port to use to reach Apache Cassandra on the nodes. The default value is 9042. -* **defaultColumnFamily** : The name of the column family of the Bigtable table. The default value is default. -* **rowKeySeparator** : The separator used to build row-keys. The default value is '#'. -* **splitLargeRows** : The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. . -* **writetimeCassandraColumnSchema** : GCS path to schema to copy Cassandra writetimes to Bigtable. The command to generate this schema is ```cqlsh -e "select json * from system_schema.columns where keyspace_name='$CASSANDRA_KEYSPACE' and table_name='$CASSANDRA_TABLE'`" > column_schema.json```. Set $WRITETIME_CASSANDRA_COLUMN_SCHEMA to a GCS path, e.g. `gs://$BUCKET_NAME/column_schema.json`. Then upload the schema to GCS: `gcloud storage cp column_schema.json $WRITETIME_CASSANDRA_COLUMN_SCHEMA`. Requires Cassandra version 2.2 onwards for JSON support. -* **setZeroTimestamp** : The flag for setting Bigtable cell timestamp to 0 if Cassandra writetime is not present. The default behavior for when this flag is not set is to set the Bigtable cell timestamp as the template replication time, i.e. now. +* **cassandraPort**: The TCP port to use to reach Apache Cassandra on the nodes. The default value is `9042`. +* **defaultColumnFamily**: The name of the column family of the Bigtable table. The default value is `default`. +* **rowKeySeparator**: The separator used to build row-keys. The default value is `#`. 
+* **splitLargeRows**: The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. +* **writetimeCassandraColumnSchema**: GCS path to schema to copy Cassandra writetimes to Bigtable. The command to generate this schema is ```cqlsh -e "select json * from system_schema.columns where keyspace_name='$CASSANDRA_KEYSPACE' and table_name='$CASSANDRA_TABLE'" > column_schema.json```. Set $WRITETIME_CASSANDRA_COLUMN_SCHEMA to a GCS path, e.g. `gs://$BUCKET_NAME/column_schema.json`. Then upload the schema to GCS: `gcloud storage cp column_schema.json $WRITETIME_CASSANDRA_COLUMN_SCHEMA`. Requires Cassandra version 2.2 onwards for JSON support. +* **setZeroTimestamp**: The flag for setting Bigtable cell timestamp to 0 if Cassandra writetime is not present. The default behavior for when this flag is not set is to set the Bigtable cell timestamp as the template replication time, i.e. now. diff --git a/v1/README_Cloud_BigQuery_to_Cloud_Datastore.md b/v1/README_Cloud_BigQuery_to_Cloud_Datastore.md index 93517b8105..f98873f127 100644 --- a/v1/README_Cloud_BigQuery_to_Cloud_Datastore.md +++ b/v1/README_Cloud_BigQuery_to_Cloud_Datastore.md @@ -15,17 +15,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **readQuery** : A BigQuery SQL query that extracts data from the source. For example, select * from dataset1.sample_table. -* **datastoreWriteProjectId** : The ID of the Google Cloud project to write the Datastore entities to. -* **errorWritePath** : The error log output file to use for write failures that occur during processing. (Example: gs://your-bucket/errors/). +* **readQuery**: A BigQuery SQL query that extracts data from the source. For example, `select * from dataset1.sample_table`. +* **datastoreWriteProjectId**: The ID of the Google Cloud project to write the Datastore entities to. 
+* **errorWritePath**: The error log output file to use for write failures that occur during processing. For example, `gs://your-bucket/errors/`. ### Optional parameters -* **readIdColumn** : Name of the BigQuery column storing the unique identifier of the row. -* **invalidOutputPath** : Cloud Storage path where to write BigQuery rows that cannot be converted to target entities. (Example: gs://your-bucket/your-path). -* **datastoreWriteEntityKind** : Datastore kind under which entities will be written in the output Google Cloud project. -* **datastoreWriteNamespace** : Datastore namespace under which entities will be written in the output Google Cloud project. -* **datastoreHintNumWorkers** : Hint for the expected number of workers in the Datastore ramp-up throttling step. Default is `500`. +* **readIdColumn**: Name of the BigQuery column storing the unique identifier of the row. +* **invalidOutputPath**: Cloud Storage path where to write BigQuery rows that cannot be converted to target entities. For example, `gs://your-bucket/your-path`. +* **datastoreWriteEntityKind**: Datastore kind under which entities will be written in the output Google Cloud project. +* **datastoreWriteNamespace**: Datastore namespace under which entities will be written in the output Google Cloud project. +* **datastoreHintNumWorkers**: Hint for the expected number of workers in the Datastore ramp-up throttling step. Defaults to `500`. 
@@ -213,9 +213,9 @@ resource "google_dataflow_job" "cloud_bigquery_to_cloud_datastore" { parameters = { readQuery = "" datastoreWriteProjectId = "" - errorWritePath = "gs://your-bucket/errors/" + errorWritePath = "" # readIdColumn = "" - # invalidOutputPath = "gs://your-bucket/your-path" + # invalidOutputPath = "" # datastoreWriteEntityKind = "" # datastoreWriteNamespace = "" # datastoreHintNumWorkers = "500" diff --git a/v1/README_Cloud_BigQuery_to_GCS_TensorFlow_Records.md b/v1/README_Cloud_BigQuery_to_GCS_TensorFlow_Records.md index 784bcaac9e..c13a6dafe9 100644 --- a/v1/README_Cloud_BigQuery_to_GCS_TensorFlow_Records.md +++ b/v1/README_Cloud_BigQuery_to_GCS_TensorFlow_Records.md @@ -22,17 +22,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **readQuery** : A BigQuery SQL query that extracts data from the source. For example, select * from dataset1.sample_table. -* **outputDirectory** : The top-level Cloud Storage path prefix to use when writing the training, testing, and validation TFRecord files. Subdirectories for resulting training, testing, and validation TFRecord files are automatically generated from `outputDirectory`. For example, `gs://mybucket/output/train` (Example: gs://mybucket/output). +* **readQuery**: A BigQuery SQL query that extracts data from the source. For example, `select * from dataset1.sample_table`. +* **outputDirectory**: The top-level Cloud Storage path prefix to use when writing the training, testing, and validation TFRecord files. Subdirectories for resulting training, testing, and validation TFRecord files are automatically generated from `outputDirectory`. For example, `gs://mybucket/output`. ### Optional parameters -* **readIdColumn** : Name of the BigQuery column storing the unique identifier of the row. -* **invalidOutputPath** : Cloud Storage path where to write BigQuery rows that cannot be converted to target entities. (Example: gs://your-bucket/your-path). 
-* **outputSuffix** : The file suffix for the training, testing, and validation TFRecord files that are written. The default value is `.tfrecord`. -* **trainingPercentage** : The percentage of query data allocated to training TFRecord files. The default value is 1, or 100%. -* **testingPercentage** : The percentage of query data allocated to testing TFRecord files. The default value is 0, or 0%. -* **validationPercentage** : The percentage of query data allocated to validation TFRecord files. The default value is 0, or 0%. +* **readIdColumn**: Name of the BigQuery column storing the unique identifier of the row. +* **invalidOutputPath**: Cloud Storage path where to write BigQuery rows that cannot be converted to target entities. For example, `gs://your-bucket/your-path`. +* **outputSuffix**: The file suffix for the training, testing, and validation TFRecord files that are written. The default value is `.tfrecord`. +* **trainingPercentage**: The percentage of query data allocated to training TFRecord files. The default value is `1`, or `100%`. +* **testingPercentage**: The percentage of query data allocated to testing TFRecord files. The default value is `0`, or `0%`. +* **validationPercentage**: The percentage of query data allocated to validation TFRecord files. The default value is `0`, or `0%`. 
@@ -219,9 +219,9 @@ resource "google_dataflow_job" "cloud_bigquery_to_gcs_tensorflow_records" { temp_gcs_location = "gs://bucket-name-here/temp" parameters = { readQuery = "" - outputDirectory = "gs://mybucket/output" + outputDirectory = "" # readIdColumn = "" - # invalidOutputPath = "gs://your-bucket/your-path" + # invalidOutputPath = "" # outputSuffix = ".tfrecord" # trainingPercentage = "1.0" # testingPercentage = "0.0" diff --git a/v1/README_Cloud_Bigtable_to_GCS_Avro.md b/v1/README_Cloud_Bigtable_to_GCS_Avro.md index 09d93b37b2..05043a9619 100644 --- a/v1/README_Cloud_Bigtable_to_GCS_Avro.md +++ b/v1/README_Cloud_Bigtable_to_GCS_Avro.md @@ -18,15 +18,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProjectId** : The ID of the Google Cloud project that contains the Bigtable instance that you want to read data from. -* **bigtableInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to export. -* **outputDirectory** : The Cloud Storage path where data is written. (Example: gs://mybucket/somefolder). -* **filenamePrefix** : The prefix of the Avro filename. For example, `output-`. Defaults to: part. +* **bigtableProjectId**: The ID of the Google Cloud project that contains the Bigtable instance that you want to read data from. +* **bigtableInstanceId**: The ID of the Bigtable instance that contains the table. +* **bigtableTableId**: The ID of the Bigtable table to export. +* **outputDirectory**: The Cloud Storage path where data is written. For example, `gs://mybucket/somefolder`. +* **filenamePrefix**: The prefix of the Avro filename. For example, `output-`. Defaults to: part. ### Optional parameters -* **bigtableAppProfileId** : The ID of the Bigtable application profile to use for the export. 
If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. +* **bigtableAppProfileId**: The ID of the Bigtable application profile to use for the export. If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. @@ -209,7 +209,7 @@ resource "google_dataflow_job" "cloud_bigtable_to_gcs_avro" { bigtableProjectId = "" bigtableInstanceId = "" bigtableTableId = "" - outputDirectory = "gs://mybucket/somefolder" + outputDirectory = "" filenamePrefix = "part" # bigtableAppProfileId = "default" } diff --git a/v1/README_Cloud_Bigtable_to_GCS_Json.md b/v1/README_Cloud_Bigtable_to_GCS_Json.md index ec793ebf20..0b95ea43ba 100644 --- a/v1/README_Cloud_Bigtable_to_GCS_Json.md +++ b/v1/README_Cloud_Bigtable_to_GCS_Json.md @@ -17,17 +17,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProjectId** : The ID for the Google Cloud project that contains the Bigtable instance that you want to read data from. -* **bigtableInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to read from. -* **outputDirectory** : The Cloud Storage path where the output JSON files are stored. (Example: gs://your-bucket/your-path/). +* **bigtableProjectId**: The ID for the Google Cloud project that contains the Bigtable instance that you want to read data from. +* **bigtableInstanceId**: The ID of the Bigtable instance that contains the table. +* **bigtableTableId**: The ID of the Bigtable table to read from. +* **outputDirectory**: The Cloud Storage path where the output JSON files are stored. For example, `gs://your-bucket/your-path/`. ### Optional parameters -* **filenamePrefix** : The prefix of the JSON file name. For example, "table1-". 
If no value is provided, defaults to `part`. -* **userOption** : Possible values are `FLATTEN` or `NONE`. `FLATTEN` flattens the row to the single level. `NONE` stores the whole row as a JSON string. Defaults to `NONE`. -* **columnsAliases** : A comma-separated list of columns that are required for the Vertex AI Vector Search index. The columns `id` and `embedding` are required for Vertex AI Vector Search. You can use the notation `fromfamily:fromcolumn;to`. For example, if the columns are `rowkey` and `cf:my_embedding`, where `rowkey` has a different name than the embedding column, specify `cf:my_embedding;embedding` and, `rowkey;id`. Only use this option when the value for `userOption` is `FLATTEN`. -* **bigtableAppProfileId** : The ID of the Bigtable application profile to use for the export. If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. +* **filenamePrefix**: The prefix of the JSON file name. For example, `table1-`. If no value is provided, defaults to `part`. +* **userOption**: Possible values are `FLATTEN` or `NONE`. `FLATTEN` flattens the row to a single level. `NONE` stores the whole row as a JSON string. Defaults to `NONE`. +* **columnsAliases**: A comma-separated list of columns that are required for the Vertex AI Vector Search index. The columns `id` and `embedding` are required for Vertex AI Vector Search. You can use the notation `fromfamily:fromcolumn;to`. For example, if the columns are `rowkey` and `cf:my_embedding`, where `rowkey` has a different name than the embedding column, specify `cf:my_embedding;embedding` and `rowkey;id`. Only use this option when the value for `userOption` is `FLATTEN`. +* **bigtableAppProfileId**: The ID of the Bigtable application profile to use for the export. 
If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. @@ -216,7 +216,7 @@ resource "google_dataflow_job" "cloud_bigtable_to_gcs_json" { bigtableProjectId = "" bigtableInstanceId = "" bigtableTableId = "" - outputDirectory = "gs://your-bucket/your-path/" + outputDirectory = "" # filenamePrefix = "part" # userOption = "NONE" # columnsAliases = "" diff --git a/v1/README_Cloud_Bigtable_to_GCS_Parquet.md b/v1/README_Cloud_Bigtable_to_GCS_Parquet.md index cd005cd87d..b813769be2 100644 --- a/v1/README_Cloud_Bigtable_to_GCS_Parquet.md +++ b/v1/README_Cloud_Bigtable_to_GCS_Parquet.md @@ -18,16 +18,16 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProjectId** : The ID of the Google Cloud project that contains the Cloud Bigtable instance that you want to read data from. -* **bigtableInstanceId** : The ID of the Cloud Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Cloud Bigtable table to export. -* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse the directory path for date and time formatters. For example: gs://your-bucket/your-path. -* **filenamePrefix** : The prefix of the Parquet file name. For example, "table1-". Defaults to: part. +* **bigtableProjectId**: The ID of the Google Cloud project that contains the Cloud Bigtable instance that you want to read data from. +* **bigtableInstanceId**: The ID of the Cloud Bigtable instance that contains the table. +* **bigtableTableId**: The ID of the Cloud Bigtable table to export. +* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse the directory path for date and time formatters. For example: `gs://your-bucket/your-path`. 
+* **filenamePrefix**: The prefix of the Parquet file name. For example, `table1-`. Defaults to: `part`. ### Optional parameters -* **numShards** : The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. The default value is decided by Dataflow. -* **bigtableAppProfileId** : The ID of the Bigtable application profile to use for the export. If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. +* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. The default value is decided by Dataflow. +* **bigtableAppProfileId**: The ID of the Bigtable application profile to use for the export. If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. diff --git a/v1/README_Cloud_Bigtable_to_GCS_SequenceFile.md b/v1/README_Cloud_Bigtable_to_GCS_SequenceFile.md index 47c4c7b4ea..86e5072e87 100644 --- a/v1/README_Cloud_Bigtable_to_GCS_SequenceFile.md +++ b/v1/README_Cloud_Bigtable_to_GCS_SequenceFile.md @@ -19,19 +19,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProject** : The ID of the Google Cloud project that contains the Bigtable instance that you want to read data from. -* **bigtableInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to export. -* **destinationPath** : The Cloud Storage path where data is written. 
(Example: gs://your-bucket/your-path/). -* **filenamePrefix** : The prefix of the SequenceFile filename. (Example: output-). +* **bigtableProject**: The ID of the Google Cloud project that contains the Bigtable instance that you want to read data from. +* **bigtableInstanceId**: The ID of the Bigtable instance that contains the table. +* **bigtableTableId**: The ID of the Bigtable table to export. +* **destinationPath**: The Cloud Storage path where data is written. For example, `gs://your-bucket/your-path/`. +* **filenamePrefix**: The prefix of the SequenceFile filename. For example, `output-`. ### Optional parameters -* **bigtableAppProfileId** : The ID of the Bigtable application profile to use for the export. If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. -* **bigtableStartRow** : The row where to start the export from, defaults to the first row. -* **bigtableStopRow** : The row where to stop the export, defaults to the last row. -* **bigtableMaxVersions** : Maximum number of cell versions. Defaults to: 2147483647. -* **bigtableFilter** : Filter string. See: http://hbase.apache.org/book.html#thrift. Defaults to empty. +* **bigtableAppProfileId**: The ID of the Bigtable application profile to use for the export. If you don't specify an app profile, Bigtable uses the instance's default app profile: https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile. +* **bigtableStartRow**: The row where to start the export from, defaults to the first row. +* **bigtableStopRow**: The row where to stop the export, defaults to the last row. +* **bigtableMaxVersions**: Maximum number of cell versions. Defaults to: 2147483647. +* **bigtableFilter**: Filter string. See: http://hbase.apache.org/book.html#thrift. Defaults to empty. 
@@ -226,8 +226,8 @@ resource "google_dataflow_job" "cloud_bigtable_to_gcs_sequencefile" { bigtableProject = "" bigtableInstanceId = "" bigtableTableId = "" - destinationPath = "gs://your-bucket/your-path/" - filenamePrefix = "output-" + destinationPath = "" + filenamePrefix = "" # bigtableAppProfileId = "" # bigtableStartRow = "" # bigtableStopRow = "" diff --git a/v1/README_Cloud_Bigtable_to_Vector_Embeddings.md b/v1/README_Cloud_Bigtable_to_Vector_Embeddings.md index dc31ca96bc..b794016fb7 100644 --- a/v1/README_Cloud_Bigtable_to_Vector_Embeddings.md +++ b/v1/README_Cloud_Bigtable_to_Vector_Embeddings.md @@ -18,24 +18,24 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProjectId** : The ID for the Google Cloud project that contains the Bigtable instance that you want to read data from. -* **bigtableInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to read from. -* **outputDirectory** : The Cloud Storage path where the output JSON files are stored. (Example: gs://your-bucket/your-path/). -* **idColumn** : The fully qualified column name where the ID is stored. In the format cf:col or _key. -* **embeddingColumn** : The fully qualified column name where the embeddings are stored. In the format cf:col or _key. +* **bigtableProjectId**: The ID for the Google Cloud project that contains the Bigtable instance that you want to read data from. +* **bigtableInstanceId**: The ID of the Bigtable instance that contains the table. +* **bigtableTableId**: The ID of the Bigtable table to read from. +* **outputDirectory**: The Cloud Storage path where the output JSON files are stored. For example, `gs://your-bucket/your-path/`. +* **idColumn**: The fully qualified column name where the ID is stored. In the format `cf:col` or `_key`. +* **embeddingColumn**: The fully qualified column name where the embeddings are stored. 
In the format `cf:col` or `_key`. ### Optional parameters -* **filenamePrefix** : The prefix of the JSON filename. For example: "table1-". If no value is provided, defaults to "part". -* **crowdingTagColumn** : The fully qualified column name where the crowding tag is stored. In the format cf:col or _key. -* **embeddingByteSize** : The byte size of each entry in the embeddings array. For float, use the value 4. For double, use the value 8. Defaults to 4. -* **allowRestrictsMappings** : The comma-separated, fully qualified column names for the columns to use as the allow restricts, with their aliases. In the format cf:col->alias. -* **denyRestrictsMappings** : The comma-separated, fully qualified column names for the columns to use as the deny restricts, with their aliases. In the format cf:col->alias. -* **intNumericRestrictsMappings** : The comma-separated, fully qualified column names of the columns to use as integer numeric_restricts, with their aliases. In the format cf:col->alias. -* **floatNumericRestrictsMappings** : The comma-separated, fully qualified column names of the columns to use as float (4 bytes) numeric_restricts, with their aliases. In the format cf:col->alias. -* **doubleNumericRestrictsMappings** : The comma-separated, fully qualified column names of the columns to use as double (8 bytes) numeric_restricts, with their aliases. In the format cf:col->alias. -* **bigtableAppProfileId** : The ID of the Cloud Bigtable app profile to be used for the export. Defaults to: default. +* **filenamePrefix**: The prefix of the JSON filename. For example: `table1-`. If no value is provided, defaults to `part`. +* **crowdingTagColumn**: The fully qualified column name where the crowding tag is stored. In the format `cf:col` or `_key`. +* **embeddingByteSize**: The byte size of each entry in the embeddings array. For float, use the value `4`. For double, use the value `8`. Defaults to `4`. 
+* **allowRestrictsMappings**: The comma-separated, fully qualified column names for the columns to use as the allow restricts, with their aliases. In the format `cf:col->alias`. +* **denyRestrictsMappings**: The comma-separated, fully qualified column names for the columns to use as the deny restricts, with their aliases. In the format `cf:col->alias`. +* **intNumericRestrictsMappings**: The comma-separated, fully qualified column names of the columns to use as integer numeric_restricts, with their aliases. In the format `cf:col->alias`. +* **floatNumericRestrictsMappings**: The comma-separated, fully qualified column names of the columns to use as float (4 bytes) numeric_restricts, with their aliases. In the format `cf:col->alias`. +* **doubleNumericRestrictsMappings**: The comma-separated, fully qualified column names of the columns to use as double (8 bytes) numeric_restricts, with their aliases. In the format `cf:col->alias`. +* **bigtableAppProfileId**: The ID of the Cloud Bigtable app profile to be used for the export. Defaults to: default. @@ -245,7 +245,7 @@ resource "google_dataflow_job" "cloud_bigtable_to_vector_embeddings" { bigtableProjectId = "" bigtableInstanceId = "" bigtableTableId = "" - outputDirectory = "gs://your-bucket/your-path/" + outputDirectory = "" idColumn = "" embeddingColumn = "" # filenamePrefix = "part" diff --git a/v1/README_Cloud_PubSub_to_Avro.md b/v1/README_Cloud_PubSub_to_Avro.md index 1ba243aa6b..feb93d6ef7 100644 --- a/v1/README_Cloud_PubSub_to_Avro.md +++ b/v1/README_Cloud_PubSub_to_Avro.md @@ -18,20 +18,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputTopic** : The Pub/Sub topic to subscribe to for message consumption. The topic name must be in the format projects//topics/. -* **outputDirectory** : The output directory where output Avro files are archived. Must contain / at the end. For example: gs://example-bucket/example-directory/. 
-* **avroTempDirectory** : The directory for temporary Avro files. Must contain / at the end. For example: gs://example-bucket/example-directory/. +* **inputTopic**: The Pub/Sub topic to subscribe to for message consumption. The topic name must be in the format `projects//topics/`. +* **outputDirectory**: The output directory where output Avro files are archived. Must contain `/` at the end. For example: `gs://example-bucket/example-directory/`. +* **avroTempDirectory**: The directory for temporary Avro files. Must contain `/` at the end. For example: `gs://example-bucket/example-directory/`. ### Optional parameters -* **outputFilenamePrefix** : The output filename prefix for the Avro files. Defaults to: output. -* **outputFilenameSuffix** : The output filename suffix for the Avro files. Defaults to empty. -* **outputShardTemplate** : The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. Therefore, all data outputs into a single file per window. The `outputShardTemplate` defaults `to W-P-SS-of-NN`, where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. -* **yearPattern** : Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no difference in the year. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `YYYY`. -* **monthPattern** : Pattern for formatting the month. Must be one or more of the `M` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `MM`. -* **dayPattern** : Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for day of year. 
Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `dd`. -* **hourPattern** : Pattern for formatting the hour. Must be one or more of the `H` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `HH`. -* **minutePattern** : Pattern for formatting the minute. Must be one or more of the `m` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `mm`. +* **outputFilenamePrefix**: The output filename prefix for the Avro files. Defaults to: output. +* **outputFilenameSuffix**: The output filename suffix for the Avro files. Defaults to empty. +* **outputShardTemplate**: The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. Therefore, all data outputs into a single file per window. The `outputShardTemplate` defaults to `W-P-SS-of-NN`, where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. +* **yearPattern**: Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no difference in the year. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `YYYY`. +* **monthPattern**: Pattern for formatting the month. Must be one or more of the `M` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `MM`. +* **dayPattern**: Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for day of year. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `dd`.
+* **hourPattern**: Pattern for formatting the hour. Must be one or more of the `H` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `HH`. +* **minutePattern**: Pattern for formatting the minute. Must be one or more of the `m` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `mm`. diff --git a/v1/README_Cloud_PubSub_to_Cloud_PubSub.md b/v1/README_Cloud_PubSub_to_Cloud_PubSub.md index 6466412598..6158092673 100644 --- a/v1/README_Cloud_PubSub_to_Cloud_PubSub.md +++ b/v1/README_Cloud_PubSub_to_Cloud_PubSub.md @@ -21,13 +21,13 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : The Pub/Sub subscription to read the input from. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **outputTopic** : The Pub/Sub topic to write the output to. (Example: projects/your-project-id/topics/your-topic-name). +* **inputSubscription**: The Pub/Sub subscription to read the input from. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **outputTopic**: The Pub/Sub topic to write the output to. For example, `projects/your-project-id/topics/your-topic-name`. ### Optional parameters -* **filterKey** : The attribute key to use to filter events. No filters are applied if `filterKey` is not specified. -* **filterValue** : The attribute value to use to filter events when a `filterKey` is provided. By default, a null `filterValue` is used. +* **filterKey**: The attribute key to use to filter events. No filters are applied if `filterKey` is not specified. +* **filterValue**: The attribute value to use to filter events when a `filterKey` is provided. By default, a null `filterValue` is used. 
@@ -201,8 +201,8 @@ resource "google_dataflow_job" "cloud_pubsub_to_cloud_pubsub" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - outputTopic = "projects/your-project-id/topics/your-topic-name" + inputSubscription = "" + outputTopic = "" # filterKey = "" # filterValue = "" } diff --git a/v1/README_Cloud_PubSub_to_Datadog.md b/v1/README_Cloud_PubSub_to_Datadog.md index 5fc7099e00..fbfd2fedf3 100644 --- a/v1/README_Cloud_PubSub_to_Datadog.md +++ b/v1/README_Cloud_PubSub_to_Datadog.md @@ -33,22 +33,22 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : The Pub/Sub subscription to read the input from. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **url** : The Datadog Logs API URL. This URL must be routable from the VPC that the pipeline runs in. See Send logs (https://docs.datadoghq.com/api/latest/logs/#send-logs) in the Datadog documentation for more information. (Example: https://http-intake.logs.datadoghq.com). -* **outputDeadletterTopic** : The Pub/Sub topic to forward undeliverable messages to. For example, projects//topics/. +* **inputSubscription**: The Pub/Sub subscription to read the input from. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **url**: The Datadog Logs API URL. This URL must be routable from the VPC that the pipeline runs in. See Send logs (https://docs.datadoghq.com/api/latest/logs/#send-logs) in the Datadog documentation for more information. For example, `https://http-intake.logs.datadoghq.com`. +* **outputDeadletterTopic**: The Pub/Sub topic to forward undeliverable messages to. For example, `projects//topics/`. ### Optional parameters -* **apiKey** : The Datadog API key. You must provide this value if the `apiKeySource` is set to `PLAINTEXT` or `KMS`. 
For more information, see API and Application Keys (https://docs.datadoghq.com/account_management/api-app-keys/) in the Datadog documentation. -* **batchCount** : The batch size for sending multiple events to Datadog. The default is `1` (no batching). -* **parallelism** : The maximum number of parallel requests. The default is `1` (no parallelism). -* **includePubsubMessage** : Whether to include the full Pub/Sub message in the payload. The default is `true` (all elements, including the data element, are included in the payload). -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to use to decrypt the API Key. You must provide this parameter if the `apiKeySource` is set to `KMS`. If the Cloud KMS key is provided, you must pass in an encrypted API Key. (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : The Secret Manager secret ID for the API Key. You must provide this parameter if the `apiKeySource` is set to `SECRET_MANAGER`. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : The source of the API key. The following values are supported: `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. You must provide this parameter if you're using Secret Manager. If `apiKeySource` is set to `KMS`, you must also provide `apiKeyKMSEncryptionKey` and encrypted `API Key`. If `apiKeySource` is set to `SECRET_MANAGER`, you must also provide `apiKeySecretId`. If `apiKeySource` is set to `PLAINTEXT`, you must also provide `apiKey`. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. +* **apiKey**: The Datadog API key. You must provide this value if the `apiKeySource` is set to `PLAINTEXT` or `KMS`. For more information, see API and Application Keys (https://docs.datadoghq.com/account_management/api-app-keys/) in the Datadog documentation. +* **batchCount**: The batch size for sending multiple events to Datadog. The default is `1` (no batching). +* **parallelism**: The maximum number of parallel requests. The default is `1` (no parallelism). +* **includePubsubMessage**: Whether to include the full Pub/Sub message in the payload. The default is `true` (all elements, including the data element, are included in the payload). +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to use to decrypt the API Key. You must provide this parameter if the `apiKeySource` is set to `KMS`. If the Cloud KMS key is provided, you must pass in an encrypted API Key. For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the API Key. You must provide this parameter if the `apiKeySource` is set to `SECRET_MANAGER`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. The following values are supported: `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. You must provide this parameter if you're using Secret Manager. If `apiKeySource` is set to `KMS`, you must also provide `apiKeyKMSEncryptionKey` and encrypted `API Key`. If `apiKeySource` is set to `SECRET_MANAGER`, you must also provide `apiKeySecretId`. If `apiKeySource` is set to `PLAINTEXT`, you must also provide `apiKey`. 
+* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. ## User-Defined functions (UDFs) @@ -259,15 +259,15 @@ resource "google_dataflow_job" "cloud_pubsub_to_datadog" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - url = "https://http-intake.logs.datadoghq.com" + inputSubscription = "" + url = "" outputDeadletterTopic = "" # apiKey = "" # batchCount = "" # parallelism = "" # includePubsubMessage = "true" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" diff --git a/v1/README_Cloud_PubSub_to_GCS_Text.md b/v1/README_Cloud_PubSub_to_GCS_Text.md index 967c31dd2a..dab723395f 100644 --- a/v1/README_Cloud_PubSub_to_GCS_Text.md +++ b/v1/README_Cloud_PubSub_to_GCS_Text.md @@ -19,20 +19,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputDirectory** : The path and filename prefix for writing output files. 
For example, `gs://bucket-name/path/`. This value must end in a slash. -* **outputFilenamePrefix** : The prefix to place on each windowed file. For example, `output-`. Defaults to: output. +* **outputDirectory**: The path and filename prefix for writing output files. For example, `gs://bucket-name/path/`. This value must end in a slash. +* **outputFilenamePrefix**: The prefix to place on each windowed file. For example, `output-`. Defaults to: output. ### Optional parameters -* **inputTopic** : The Pub/Sub topic to read the input from. The topic name should be in the format `projects//topics/`. -* **userTempLocation** : The user provided directory to output temporary files to. Must end with a slash. -* **outputFilenameSuffix** : The suffix to place on each windowed file. Typically a file extension such as `.txt` or `.csv`. Defaults to empty. -* **outputShardTemplate** : The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. Therefore, all data outputs into a single file per window. The `outputShardTemplate` defaults `to W-P-SS-of-NN`, where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. -* **yearPattern** : Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no difference in the year. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `YYYY`. -* **monthPattern** : Pattern for formatting the month. Must be one or more of the `M` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `MM`. -* **dayPattern** : Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for day of year. 
Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `dd`. -* **hourPattern** : Pattern for formatting the hour. Must be one or more of the `H` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `HH`. -* **minutePattern** : Pattern for formatting the minute. Must be one or more of the `m` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory ('/') character. Defaults to `mm`. +* **inputTopic**: The Pub/Sub topic to read the input from. The topic name should be in the format `projects/<PROJECT_ID>/topics/<TOPIC_NAME>`. +* **userTempLocation**: The user provided directory to output temporary files to. Must end with a slash. +* **outputFilenameSuffix**: The suffix to place on each windowed file. Typically a file extension such as `.txt` or `.csv`. Defaults to empty. +* **outputShardTemplate**: The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. Therefore, all data outputs into a single file per window. The `outputShardTemplate` defaults to `W-P-SS-of-NN`, where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. +* **yearPattern**: Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no difference in the year. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `YYYY`. +* **monthPattern**: Pattern for formatting the month. Must be one or more of the `M` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `MM`. +* **dayPattern**: Pattern for formatting the day.
Must be one or more of `d` for day of month or `D` for day of year. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `dd`. +* **hourPattern**: Pattern for formatting the hour. Must be one or more of the `H` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `HH`. +* **minutePattern**: Pattern for formatting the minute. Must be one or more of the `m` character. Optionally, wrap the pattern with characters that aren't alphanumeric or the directory (`/`) character. Defaults to `mm`. diff --git a/v1/README_Cloud_PubSub_to_Splunk.md b/v1/README_Cloud_PubSub_to_Splunk.md index 45567991cf..c3bceeb015 100644 --- a/v1/README_Cloud_PubSub_to_Splunk.md +++ b/v1/README_Cloud_PubSub_to_Splunk.md @@ -33,26 +33,26 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : The Pub/Sub subscription to read the input from. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **url** : The Splunk HEC URL. The URL must be routable from the VPC that the pipeline runs in. (Example: https://splunk-hec-host:8088). -* **outputDeadletterTopic** : The Pub/Sub topic to forward undeliverable messages to. For example, projects//topics/. +* **inputSubscription**: The Pub/Sub subscription to read the input from. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **url**: The Splunk HEC URL. The URL must be routable from the VPC that the pipeline runs in. For example, `https://splunk-hec-host:8088`. +* **outputDeadletterTopic**: The Pub/Sub topic to forward undeliverable messages to. For example, `projects//topics/`. ### Optional parameters -* **token** : The Splunk HEC authentication token. Must be provided if the `tokenSource` parameter is set to `PLAINTEXT` or `KMS`. 
-* **batchCount** : The batch size for sending multiple events to Splunk. Defaults to 1 (no batching). -* **disableCertificateValidation** : Disable SSL certificate validation. Default false (validation enabled). If true, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored. -* **parallelism** : The maximum number of parallel requests. Defaults to 1 (no parallelism). -* **includePubsubMessage** : Include the full Pub/Sub message in the payload. Default false (only the data element is included in the payload). -* **tokenKMSEncryptionKey** : The Cloud KMS key to use to decrypt the HEC token string. This parameter must be provided when tokenSource is set to KMS. If the Cloud KMS key is provided, the HEC token string `must` be passed in encrypted. (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **tokenSecretId** : The Secret Manager secret ID for the token. This parameter must provided when the tokenSource is set to SECRET_MANAGER. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **tokenSource** : The source of the token. The following values are allowed: `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. You must provide this parameter when Secret Manager is used. If `tokenSource` is set to `KMS`, `tokenKMSEncryptionKey`, and encrypted, then `token` must be provided. If `tokenSource` is set to `SECRET_MANAGER`, then `tokenSecretId` must be provided. If `tokenSource` is set to `PLAINTEXT`, then `token` must be provided. -* **rootCaCertificatePath** : The full URL to the root CA certificate in Cloud Storage. The certificate provided in Cloud Storage must be DER-encoded and can be supplied in binary or printable (Base64) encoding. If the certificate is provided in Base64 encoding, it must be bounded at the beginning by -----BEGIN CERTIFICATE-----, and must be bounded at the end by -----END CERTIFICATE-----. 
If this parameter is provided, this private CA certificate file is fetched and added to the Dataflow worker's trust store in order to verify the Splunk HEC endpoint's SSL certificate. If this parameter is not provided, the default trust store is used. (Example: gs://mybucket/mycerts/privateCA.crt). -* **enableBatchLogs** : Specifies whether logs should be enabled for batches written to Splunk. Default: `true`. -* **enableGzipHttpCompression** : Specifies whether HTTP requests sent to Splunk HEC should be compressed (gzip content encoded). Default: `true`. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. +* **token**: The Splunk HEC authentication token. Must be provided if the `tokenSource` parameter is set to `PLAINTEXT` or `KMS`. +* **batchCount**: The batch size for sending multiple events to Splunk. Defaults to `1` (no batching). +* **disableCertificateValidation**: Disable SSL certificate validation. Default `false` (validation enabled). If `true`, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored. +* **parallelism**: The maximum number of parallel requests. Defaults to `1` (no parallelism). +* **includePubsubMessage**: Include the full Pub/Sub message in the payload. Default `false` (only the data element is included in the payload). 
+* **tokenKMSEncryptionKey**: The Cloud KMS key to use to decrypt the HEC token string. This parameter must be provided when tokenSource is set to KMS. If the Cloud KMS key is provided, the HEC token string must be passed in encrypted. For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **tokenSecretId**: The Secret Manager secret ID for the token. This parameter must be provided when the tokenSource is set to `SECRET_MANAGER`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **tokenSource**: The source of the token. The following values are allowed: `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. You must provide this parameter when Secret Manager is used. If `tokenSource` is set to `KMS`, then `tokenKMSEncryptionKey` and the encrypted `token` must be provided. If `tokenSource` is set to `SECRET_MANAGER`, then `tokenSecretId` must be provided. If `tokenSource` is set to `PLAINTEXT`, then `token` must be provided. +* **rootCaCertificatePath**: The full URL to the root CA certificate in Cloud Storage. The certificate provided in Cloud Storage must be DER-encoded and can be supplied in binary or printable (Base64) encoding. If the certificate is provided in Base64 encoding, it must be bounded at the beginning by -----BEGIN CERTIFICATE-----, and must be bounded at the end by -----END CERTIFICATE-----. If this parameter is provided, this private CA certificate file is fetched and added to the Dataflow worker's trust store in order to verify the Splunk HEC endpoint's SSL certificate. If this parameter is not provided, the default trust store is used. For example, `gs://mybucket/mycerts/privateCA.crt`. +* **enableBatchLogs**: Specifies whether logs should be enabled for batches written to Splunk. Default: `true`. +* **enableGzipHttpCompression**: Specifies whether HTTP requests sent to Splunk HEC should be compressed (gzip content encoded). Default: `true`.
+* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. ## User-Defined functions (UDFs) @@ -275,18 +275,18 @@ resource "google_dataflow_job" "cloud_pubsub_to_splunk" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - url = "https://splunk-hec-host:8088" + inputSubscription = "" + url = "" outputDeadletterTopic = "" # token = "" # batchCount = "" # disableCertificateValidation = "" # parallelism = "" # includePubsubMessage = "" - # tokenKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # tokenSecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # tokenKMSEncryptionKey = "" + # tokenSecretId = "" # tokenSource = "" - # rootCaCertificatePath = "gs://mybucket/mycerts/privateCA.crt" + # rootCaCertificatePath = "" # enableBatchLogs = "true" # enableGzipHttpCompression = "true" # javascriptTextTransformGcsPath = "" diff --git a/v1/README_Cloud_Spanner_to_GCS_Avro.md b/v1/README_Cloud_Spanner_to_GCS_Avro.md index eddd41c13e..65995ab6fd 100644 --- a/v1/README_Cloud_Spanner_to_GCS_Avro.md +++ b/v1/README_Cloud_Spanner_to_GCS_Avro.md @@ -33,21 +33,21 @@ on [Metadata 
Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **instanceId** : The instance ID of the Spanner database that you want to export. -* **databaseId** : The database ID of the Spanner database that you want to export. -* **outputDir** : The Cloud Storage path to export Avro files to. The export job creates a new directory under this path that contains the exported files. (Example: gs://your-bucket/your-path). +* **instanceId**: The instance ID of the Spanner database that you want to export. +* **databaseId**: The database ID of the Spanner database that you want to export. +* **outputDir**: The Cloud Storage path to export Avro files to. The export job creates a new directory under this path that contains the exported files. For example, `gs://your-bucket/your-path`. ### Optional parameters -* **avroTempDirectory** : The Cloud Storage path where temporary Avro files are written. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **snapshotTime** : The timestamp that corresponds to the version of the Spanner database that you want to read. The timestamp must be specified by using RFC 3339 UTC `Zulu` format. The timestamp must be in the past, and maximum timestamp staleness applies. (Example: 1990-12-31T23:59:60Z). Defaults to empty. -* **spannerProjectId** : The ID of the Google Cloud project that contains the Spanner database that you want to read data from. -* **shouldExportTimestampAsLogicalType** : If true, timestamps are exported as a `long` type with `timestamp-micros` logical type. By default, this parameter is set to `false` and timestamps are exported as ISO-8601 strings at nanosecond precision. -* **tableNames** : A comma-separated list of tables specifying the subset of the Spanner database to export. 
If you set this parameter, you must either include all of the related tables (parent tables and foreign key referenced tables) or set the `shouldExportRelatedTables` parameter to `true`.If the table is in named schema, please use fully qualified name. For example: `sch1.foo` in which `sch1` is the schema name and `foo` is the table name. Defaults to empty. -* **shouldExportRelatedTables** : Whether to include related tables. This parameter is used in conjunction with the `tableNames` parameter. Defaults to: false. -* **spannerPriority** : The request priority for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `MEDIUM`. -* **dataBoostEnabled** : Set to `true` to use the compute resources of Spanner Data Boost to run the job with near-zero impact on Spanner OLTP workflows. When set to `true`, you also need the `spanner.databases.useDataBoost` IAM permission. For more information, see the Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false. +* **avroTempDirectory**: The Cloud Storage path where temporary Avro files are written. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. +* **snapshotTime**: The timestamp that corresponds to the version of the Spanner database that you want to read. The timestamp must be specified by using RFC 3339 UTC `Zulu` format. The timestamp must be in the past, and maximum timestamp staleness applies. For example, `1990-12-31T23:59:60Z`. Defaults to empty. +* **spannerProjectId**: The ID of the Google Cloud project that contains the Spanner database that you want to read data from. +* **shouldExportTimestampAsLogicalType**: If `true`, timestamps are exported as a `long` type with `timestamp-micros` logical type. 
By default, this parameter is set to `false` and timestamps are exported as ISO-8601 strings at nanosecond precision. +* **tableNames**: A comma-separated list of tables specifying the subset of the Spanner database to export. If you set this parameter, you must either include all of the related tables (parent tables and foreign key referenced tables) or set the `shouldExportRelatedTables` parameter to `true`. If the table is in a named schema, use the fully qualified name. For example: `sch1.foo` in which `sch1` is the schema name and `foo` is the table name. Defaults to empty. +* **shouldExportRelatedTables**: Whether to include related tables. This parameter is used in conjunction with the `tableNames` parameter. Defaults to: false. +* **spannerPriority**: The request priority for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `MEDIUM`. +* **dataBoostEnabled**: Set to `true` to use the compute resources of Spanner Data Boost to run the job with near-zero impact on Spanner OLTP workflows. When set to `true`, you also need the `spanner.databases.useDataBoost` IAM permission. For more information, see the Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false.
@@ -247,10 +247,10 @@ resource "google_dataflow_job" "cloud_spanner_to_gcs_avro" { parameters = { instanceId = "" databaseId = "" - outputDir = "gs://your-bucket/your-path" + outputDir = "" # avroTempDirectory = "" # spannerHost = "https://batch-spanner.googleapis.com" - # snapshotTime = "1990-12-31T23:59:60Z" + # snapshotTime = "" # spannerProjectId = "" # shouldExportTimestampAsLogicalType = "false" # tableNames = "" diff --git a/v1/README_Cloud_Spanner_vectors_to_Cloud_Storage.md b/v1/README_Cloud_Spanner_vectors_to_Cloud_Storage.md index 4a1cb4512a..a19a3106a6 100644 --- a/v1/README_Cloud_Spanner_vectors_to_Cloud_Storage.md +++ b/v1/README_Cloud_Spanner_vectors_to_Cloud_Storage.md @@ -25,20 +25,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **spannerProjectId** : The project ID of the Spanner instance. -* **spannerInstanceId** : The ID of the Spanner instance to export the vector embeddings from. -* **spannerDatabaseId** : The ID of the Spanner database to export the vector embeddings from. -* **spannerTable** : The Spanner table to read from. -* **spannerColumnsToExport** : A comma-separated list of required columns for the Vertex AI Vector Search index. The ID and embedding columns are required by Vector Search. If your column names don't match the Vertex AI Vector Search index input structure, create column mappings by using aliases. If the column names don't match the format expected by Vertex AI, use the notation from:to. For example, if you have columns named id and my_embedding, specify id, my_embedding:embedding. -* **gcsOutputFolder** : The Cloud Storage folder to write output files to. The path must end with a slash. (Example: gs://your-bucket/folder1/). -* **gcsOutputFilePrefix** : The filename prefix for writing output files. (Example: vector-embeddings). +* **spannerProjectId**: The project ID of the Spanner instance. 
+* **spannerInstanceId**: The ID of the Spanner instance to export the vector embeddings from. +* **spannerDatabaseId**: The ID of the Spanner database to export the vector embeddings from. +* **spannerTable**: The Spanner table to read from. +* **spannerColumnsToExport**: A comma-separated list of required columns for the Vertex AI Vector Search index. The ID and embedding columns are required by Vector Search. If your column names don't match the Vertex AI Vector Search index input structure, create column mappings by using aliases. If the column names don't match the format expected by Vertex AI, use the notation from:to. For example, if you have columns named id and my_embedding, specify id, my_embedding:embedding. +* **gcsOutputFolder**: The Cloud Storage folder to write output files to. The path must end with a slash. For example, `gs://your-bucket/folder1/`. +* **gcsOutputFilePrefix**: The filename prefix for writing output files. For example, `vector-embeddings`. ### Optional parameters -* **spannerHost** : The Spanner endpoint to call in the template. The default value is https://batch-spanner.googleapis.com. (Example: https://batch-spanner.googleapis.com). -* **spannerVersionTime** : If set, specifies the time when the database version must be taken. The value is a string in the RFC-3339 date format in Unix epoch time. For example: 1990-12-31T23:59:60Z. The timestamp must be in the past, and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. If not set, a strong bound (https://cloud.google.com/spanner/docs/timestamp-bounds#strong) is used to read the latest data. Defaults to empty. (Example: 1990-12-31T23:59:60Z). -* **spannerDataBoostEnabled** : When set to true, the template uses Spanner on-demand compute. The export job runs on independent compute resources that don't impact current Spanner workloads. Using this option incurs additional charges in Spanner. 
For more information, see Spanner Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false. -* **spannerPriority** : The request priority for Spanner calls. The allowed values are HIGH, MEDIUM, and LOW. The default value is MEDIUM. +* **spannerHost**: The Spanner endpoint to call in the template. The default value is https://batch-spanner.googleapis.com. For example, `https://batch-spanner.googleapis.com`. +* **spannerVersionTime**: If set, specifies the time when the database version must be taken. The value is a string in the RFC-3339 date format in Unix epoch time. For example: `1990-12-31T23:59:60Z`. The timestamp must be in the past, and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. If not set, a strong bound (https://cloud.google.com/spanner/docs/timestamp-bounds#strong) is used to read the latest data. Defaults to `empty`. For example, `1990-12-31T23:59:60Z`. +* **spannerDataBoostEnabled**: When set to `true`, the template uses Spanner on-demand compute. The export job runs on independent compute resources that don't impact current Spanner workloads. Using this option incurs additional charges in Spanner. For more information, see Spanner Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: `false`. +* **spannerPriority**: The request priority for Spanner calls. The allowed values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `MEDIUM`. 
@@ -238,10 +238,10 @@ resource "google_dataflow_job" "cloud_spanner_vectors_to_cloud_storage" { spannerDatabaseId = "" spannerTable = "" spannerColumnsToExport = "" - gcsOutputFolder = "gs://your-bucket/folder1/" - gcsOutputFilePrefix = "vector-embeddings" + gcsOutputFolder = "" + gcsOutputFilePrefix = "" # spannerHost = "https://batch-spanner.googleapis.com" - # spannerVersionTime = "1990-12-31T23:59:60Z" + # spannerVersionTime = "" # spannerDataBoostEnabled = "false" # spannerPriority = "" } diff --git a/v1/README_Datastore_to_Datastore_Delete.md b/v1/README_Datastore_to_Datastore_Delete.md index a57e414abb..5225c01a6d 100644 --- a/v1/README_Datastore_to_Datastore_Delete.md +++ b/v1/README_Datastore_to_Datastore_Delete.md @@ -18,16 +18,16 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **datastoreReadGqlQuery** : A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. -* **datastoreReadProjectId** : The ID of the Google Cloud project that contains the Datastore instance that you want to read data from. -* **datastoreDeleteProjectId** : Google Cloud Project Id of where to delete the datastore entities. +* **datastoreReadGqlQuery**: A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. +* **datastoreReadProjectId**: The ID of the Google Cloud project that contains the Datastore instance that you want to read data from. +* **datastoreDeleteProjectId**: Google Cloud Project Id of where to delete the datastore entities. ### Optional parameters -* **datastoreReadNamespace** : The namespace of the requested entities. To use the default namespace, leave this parameter blank. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. 
For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **datastoreHintNumWorkers** : Hint for the expected number of workers in the Datastore ramp-up throttling step. Defaults to: 500. +* **datastoreReadNamespace**: The namespace of the requested entities. To use the default namespace, leave this parameter blank. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **datastoreHintNumWorkers**: Hint for the expected number of workers in the Datastore ramp-up throttling step. Defaults to: 500. ## User-Defined functions (UDFs) diff --git a/v1/README_Datastore_to_GCS_Text.md b/v1/README_Datastore_to_GCS_Text.md index 4a513cf37a..cd7eebdf74 100644 --- a/v1/README_Datastore_to_GCS_Text.md +++ b/v1/README_Datastore_to_GCS_Text.md @@ -19,15 +19,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **datastoreReadGqlQuery** : A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. 
-* **datastoreReadProjectId** : The ID of the Google Cloud project that contains the Datastore instance that you want to read data from. -* **textWritePrefix** : The Cloud Storage path prefix that specifies where the data is written. (Example: gs://mybucket/somefolder/). +* **datastoreReadGqlQuery**: A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. +* **datastoreReadProjectId**: The ID of the Google Cloud project that contains the Datastore instance that you want to read data from. +* **textWritePrefix**: The Cloud Storage path prefix that specifies where the data is written. For example, `gs://mybucket/somefolder/`. ### Optional parameters -* **datastoreReadNamespace** : The namespace of the requested entities. To use the default namespace, leave this parameter blank. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **datastoreReadNamespace**: The namespace of the requested entities. To use the default namespace, leave this parameter blank. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. 
For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). ## User-Defined functions (UDFs) @@ -219,7 +219,7 @@ resource "google_dataflow_job" "datastore_to_gcs_text" { parameters = { datastoreReadGqlQuery = "" datastoreReadProjectId = "" - textWritePrefix = "gs://mybucket/somefolder/" + textWritePrefix = "" # datastoreReadNamespace = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" diff --git a/v1/README_Firestore_to_Firestore_Delete.md b/v1/README_Firestore_to_Firestore_Delete.md index eb15ffd886..5801238a6b 100644 --- a/v1/README_Firestore_to_Firestore_Delete.md +++ b/v1/README_Firestore_to_Firestore_Delete.md @@ -18,16 +18,16 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **firestoreReadGqlQuery** : A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. -* **firestoreReadProjectId** : The ID of the Google Cloud project that contains the Firestore instance that you want to read data from. -* **firestoreDeleteProjectId** : Google Cloud Project Id of where to delete the firestore entities. +* **firestoreReadGqlQuery**: A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. +* **firestoreReadProjectId**: The ID of the Google Cloud project that contains the Firestore instance that you want to read data from. +* **firestoreDeleteProjectId**: Google Cloud Project Id of where to delete the firestore entities. ### Optional parameters -* **firestoreReadNamespace** : The namespace of the requested entities. To use the default namespace, leave this parameter blank. 
-* **firestoreHintNumWorkers** : Hint for the expected number of workers in the Firestore ramp-up throttling step. Defaults to: 500. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **firestoreReadNamespace**: The namespace of the requested entities. To use the default namespace, leave this parameter blank. +* **firestoreHintNumWorkers**: Hint for the expected number of workers in the Firestore ramp-up throttling step. Defaults to: 500. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). 
## User-Defined functions (UDFs) diff --git a/v1/README_Firestore_to_GCS_Text.md b/v1/README_Firestore_to_GCS_Text.md index 569bbec1bf..3110a1a190 100644 --- a/v1/README_Firestore_to_GCS_Text.md +++ b/v1/README_Firestore_to_GCS_Text.md @@ -19,15 +19,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **firestoreReadGqlQuery** : A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. -* **firestoreReadProjectId** : The ID of the Google Cloud project that contains the Firestore instance that you want to read data from. -* **textWritePrefix** : The Cloud Storage path prefix that specifies where the data is written. (Example: gs://mybucket/somefolder/). +* **firestoreReadGqlQuery**: A GQL (https://cloud.google.com/datastore/docs/reference/gql_reference) query that specifies which entities to grab. For example, `SELECT * FROM MyKind`. +* **firestoreReadProjectId**: The ID of the Google Cloud project that contains the Firestore instance that you want to read data from. +* **textWritePrefix**: The Cloud Storage path prefix that specifies where the data is written. For example, `gs://mybucket/somefolder/`. ### Optional parameters -* **firestoreReadNamespace** : The namespace of the requested entities. To use the default namespace, leave this parameter blank. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). 
+* **firestoreReadNamespace**: The namespace of the requested entities. To use the default namespace, leave this parameter blank. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). ## User-Defined functions (UDFs) @@ -219,7 +219,7 @@ resource "google_dataflow_job" "firestore_to_gcs_text" { parameters = { firestoreReadGqlQuery = "" firestoreReadProjectId = "" - textWritePrefix = "gs://mybucket/somefolder/" + textWritePrefix = "" # firestoreReadNamespace = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" diff --git a/v1/README_GCS_Avro_to_Cloud_Bigtable.md b/v1/README_GCS_Avro_to_Cloud_Bigtable.md index 3999f51f8d..e50f50f4b9 100644 --- a/v1/README_GCS_Avro_to_Cloud_Bigtable.md +++ b/v1/README_GCS_Avro_to_Cloud_Bigtable.md @@ -18,14 +18,14 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProjectId** : The ID of the Google Cloud project that contains the Bigtable instance that you want to write data to. -* **bigtableInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to import. -* **inputFilePattern** : The Cloud Storage path pattern where data is located. (Example: gs:////*). +* **bigtableProjectId**: The ID of the Google Cloud project that contains the Bigtable instance that you want to write data to. +* **bigtableInstanceId**: The ID of the Bigtable instance that contains the table. 
+* **bigtableTableId**: The ID of the Bigtable table to import. +* **inputFilePattern**: The Cloud Storage path pattern where data is located. For example, `gs://BUCKET_NAME/FOLDER/PREFIX*`. ### Optional parameters -* **splitLargeRows** : The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. . +* **splitLargeRows**: The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. @@ -205,7 +205,7 @@ resource "google_dataflow_job" "gcs_avro_to_cloud_bigtable" { bigtableProjectId = "" bigtableInstanceId = "" bigtableTableId = "" - inputFilePattern = "gs:////*" + inputFilePattern = "" # splitLargeRows = "" } } diff --git a/v1/README_GCS_Avro_to_Cloud_Spanner.md b/v1/README_GCS_Avro_to_Cloud_Spanner.md index 6b57828c39..ee4cf0ccaf 100644 --- a/v1/README_GCS_Avro_to_Cloud_Spanner.md +++ b/v1/README_GCS_Avro_to_Cloud_Spanner.md @@ -18,21 +18,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **instanceId** : The instance ID of the Spanner database. -* **databaseId** : The database ID of the Spanner database. -* **inputDir** : The Cloud Storage path where the Avro files are imported from. +* **instanceId**: The instance ID of the Spanner database. +* **databaseId**: The database ID of the Spanner database. +* **inputDir**: The Cloud Storage path where the Avro files are imported from. ### Optional parameters -* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **waitForIndexes** : If `true`, the pipeline waits for indexes to be created. If `false`, the job might complete while indexes are still being created in the background.
The default value is `false`. -* **waitForForeignKeys** : If `true`, the pipeline waits for foreign keys to be created. If `false`, the job might complete while foreign keys are still being created in the background. The default value is `false`. -* **waitForChangeStreams** : If `true`, the pipeline waits for change streams to be created. If `false`, the job might complete while change streams are still being created in the background. The default value is `true`. -* **waitForSequences** : By default, the import pipeline is blocked on sequence creation. If `false`, the import pipeline might complete with sequences still being created in the background. -* **earlyIndexCreateFlag** : Specifies whether early index creation is enabled. If the template runs a large number of DDL statements, it's more efficient to create indexes before loading data. Therefore, the default behavior is to create the indexes first when the number of DDL statements exceeds a threshold. To disable this feature, set `earlyIndexCreateFlag` to `false`. The default value is `true`. -* **spannerProjectId** : The ID of the Google Cloud project that contains the Spanner database. If not set, the default Google Cloud project is used. -* **ddlCreationTimeoutInMinutes** : The timeout in minutes for DDL statements performed by the template. The default value is 30 minutes. -* **spannerPriority** : The request priority for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `MEDIUM`. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. +* **waitForIndexes**: If `true`, the pipeline waits for indexes to be created. If `false`, the job might complete while indexes are still being created in the background. The default value is `false`. +* **waitForForeignKeys**: If `true`, the pipeline waits for foreign keys to be created. 
If `false`, the job might complete while foreign keys are still being created in the background. The default value is `false`. +* **waitForChangeStreams**: If `true`, the pipeline waits for change streams to be created. If `false`, the job might complete while change streams are still being created in the background. The default value is `true`. +* **waitForSequences**: By default, the import pipeline is blocked on sequence creation. If `false`, the import pipeline might complete with sequences still being created in the background. +* **earlyIndexCreateFlag**: Specifies whether early index creation is enabled. If the template runs a large number of DDL statements, it's more efficient to create indexes before loading data. Therefore, the default behavior is to create the indexes first when the number of DDL statements exceeds a threshold. To disable this feature, set `earlyIndexCreateFlag` to `false`. The default value is `true`. +* **spannerProjectId**: The ID of the Google Cloud project that contains the Spanner database. If not set, the default Google Cloud project is used. +* **ddlCreationTimeoutInMinutes**: The timeout in minutes for DDL statements performed by the template. The default value is 30 minutes. +* **spannerPriority**: The request priority for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `MEDIUM`. diff --git a/v1/README_GCS_CSV_to_BigQuery.md b/v1/README_GCS_CSV_to_BigQuery.md index 2d742cff3d..392774e6a6 100644 --- a/v1/README_GCS_CSV_to_BigQuery.md +++ b/v1/README_GCS_CSV_to_BigQuery.md @@ -16,18 +16,18 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The Cloud Storage path to the CSV file that contains the text to process. (Example: gs://your-bucket/path/*.csv). -* **schemaJSONPath** : The Cloud Storage path to the JSON file that defines your BigQuery schema. 
-* **outputTable** : The name of the BigQuery table that stores your processed data. If you reuse an existing BigQuery table, the data is appended to the destination table. -* **bigQueryLoadingTemporaryDirectory** : The temporary directory to use during the BigQuery loading process. (Example: gs://your-bucket/your-files/temp_dir). -* **badRecordsOutputTable** : The name of the BigQuery table to use to store the rejected data when processing the CSV files. If you reuse an existing BigQuery table, the data is appended to the destination table. The schema of this table must match the error table schema (https://cloud.google.com/dataflow/docs/guides/templates/provided/cloud-storage-csv-to-bigquery#GcsCSVToBigQueryBadRecordsSchema). -* **delimiter** : The column delimiter that the CSV file uses. (Example: ,). -* **csvFormat** : The CSV format according to Apache Commons CSV format. Defaults to: Default. +* **inputFilePattern**: The Cloud Storage path to the CSV file that contains the text to process. For example, `gs://your-bucket/path/*.csv`. +* **schemaJSONPath**: The Cloud Storage path to the JSON file that defines your BigQuery schema. +* **outputTable**: The name of the BigQuery table that stores your processed data. If you reuse an existing BigQuery table, the data is appended to the destination table. +* **bigQueryLoadingTemporaryDirectory**: The temporary directory to use during the BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. +* **badRecordsOutputTable**: The name of the BigQuery table to use to store the rejected data when processing the CSV files. If you reuse an existing BigQuery table, the data is appended to the destination table. The schema of this table must match the error table schema (https://cloud.google.com/dataflow/docs/guides/templates/provided/cloud-storage-csv-to-bigquery#GcsCSVToBigQueryBadRecordsSchema). +* **delimiter**: The column delimiter that the CSV file uses. For example, `,`. 
+* **csvFormat**: The CSV format according to Apache Commons CSV format. Defaults to: `Default`. ### Optional parameters -* **containsHeaders** : Whether headers are included in the CSV file. Defaults to: false. -* **csvFileEncoding** : The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16. Defaults to: UTF-8. +* **containsHeaders**: Whether headers are included in the CSV file. Defaults to: `false`. +* **csvFileEncoding**: The CSV file character encoding format. Allowed Values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`. Defaults to: UTF-8. @@ -216,12 +216,12 @@ resource "google_dataflow_job" "gcs_csv_to_bigquery" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://your-bucket/path/*.csv" + inputFilePattern = "" schemaJSONPath = "" outputTable = "" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" + bigQueryLoadingTemporaryDirectory = "" badRecordsOutputTable = "" - delimiter = "," + delimiter = "" csvFormat = "" # containsHeaders = "false" # csvFileEncoding = "UTF-8" diff --git a/v1/README_GCS_Parquet_to_Cloud_Bigtable.md b/v1/README_GCS_Parquet_to_Cloud_Bigtable.md index 8ea1677b77..012aefa96d 100644 --- a/v1/README_GCS_Parquet_to_Cloud_Bigtable.md +++ b/v1/README_GCS_Parquet_to_Cloud_Bigtable.md @@ -18,14 +18,14 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProjectId** : The Google Cloud project ID associated with the Bigtable instance. -* **bigtableInstanceId** : The ID of the Cloud Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to import. -* **inputFilePattern** : The Cloud Storage path with the files that contain the data. (Example: gs://your-bucket/your-files/*.parquet). +* **bigtableProjectId**: The Google Cloud project ID associated with the Bigtable instance. 
+* **bigtableInstanceId**: The ID of the Cloud Bigtable instance that contains the table. +* **bigtableTableId**: The ID of the Bigtable table to import. +* **inputFilePattern**: The Cloud Storage path with the files that contain the data. For example, `gs://your-bucket/your-files/*.parquet`. ### Optional parameters -* **splitLargeRows** : The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. . +* **splitLargeRows**: The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. @@ -205,7 +205,7 @@ resource "google_dataflow_job" "gcs_parquet_to_cloud_bigtable" { bigtableProjectId = "" bigtableInstanceId = "" bigtableTableId = "" - inputFilePattern = "gs://your-bucket/your-files/*.parquet" + inputFilePattern = "" # splitLargeRows = "" } } diff --git a/v1/README_GCS_SequenceFile_to_Cloud_Bigtable.md b/v1/README_GCS_SequenceFile_to_Cloud_Bigtable.md index 85e1464716..883d48356d 100644 --- a/v1/README_GCS_SequenceFile_to_Cloud_Bigtable.md +++ b/v1/README_GCS_SequenceFile_to_Cloud_Bigtable.md @@ -18,15 +18,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigtableProject** : The ID of the Google Cloud project that contains the Bigtable instance that you want to write data to. -* **bigtableInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableTableId** : The ID of the Bigtable table to import. -* **sourcePattern** : The Cloud Storage path pattern to the location of the data. (Example: gs://your-bucket/your-path/prefix*). +* **bigtableProject**: The ID of the Google Cloud project that contains the Bigtable instance that you want to write data to. +* **bigtableInstanceId**: The ID of the Bigtable instance that contains the table. 
+* **bigtableTableId**: The ID of the Bigtable table to import. +* **sourcePattern**: The Cloud Storage path pattern to the location of the data. For example, `gs://your-bucket/your-path/prefix*`. ### Optional parameters -* **bigtableAppProfileId** : The ID of the Bigtable application profile to use for the import. If you don't specify an application profile, Bigtable uses the instance's default application profile (https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile). -* **mutationThrottleLatencyMs** : Optional Set mutation latency throttling (enables the feature). Value in milliseconds. Defaults to: 0. +* **bigtableAppProfileId**: The ID of the Bigtable application profile to use for the import. If you don't specify an application profile, Bigtable uses the instance's default application profile (https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile). +* **mutationThrottleLatencyMs**: Optional Set mutation latency throttling (enables the feature). Value in milliseconds. Defaults to: 0. @@ -209,7 +209,7 @@ resource "google_dataflow_job" "gcs_sequencefile_to_cloud_bigtable" { bigtableProject = "" bigtableInstanceId = "" bigtableTableId = "" - sourcePattern = "gs://your-bucket/your-path/prefix*" + sourcePattern = "" # bigtableAppProfileId = "" # mutationThrottleLatencyMs = "0" } diff --git a/v1/README_GCS_Text_to_BigQuery.md b/v1/README_GCS_Text_to_BigQuery.md index 11feb39ca2..49815457aa 100644 --- a/v1/README_GCS_Text_to_BigQuery.md +++ b/v1/README_GCS_Text_to_BigQuery.md @@ -19,8 +19,8 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : Path of the file pattern glob to read from. (Example: gs://your-bucket/path/*.csv). -* **JSONPath** : JSON file with BigQuery Schema description. JSON Example: { +* **inputFilePattern**: Path of the file pattern glob to read from. For example, `gs://your-bucket/path/*.csv`. 
+* **JSONPath**: JSON file with BigQuery Schema description. JSON Example: { "BigQuery Schema": [ { "name": "location", @@ -44,13 +44,13 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat } ] }. -* **outputTable** : BigQuery table location to write the output to. The table's schema must match the input objects. -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process (Example: gs://your-bucket/your-files/temp_dir). +* **outputTable**: BigQuery table location to write the output to. The table's schema must match the input objects. +* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples).
## User-Defined functions (UDFs) @@ -240,10 +240,10 @@ resource "google_dataflow_job" "gcs_text_to_bigquery" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://your-bucket/path/*.csv" + inputFilePattern = "" JSONPath = "" outputTable = "" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" + bigQueryLoadingTemporaryDirectory = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" } diff --git a/v1/README_GCS_Text_to_Cloud_PubSub.md b/v1/README_GCS_Text_to_Cloud_PubSub.md index 76626c5237..a284548e9d 100644 --- a/v1/README_GCS_Text_to_Cloud_PubSub.md +++ b/v1/README_GCS_Text_to_Cloud_PubSub.md @@ -24,8 +24,8 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The input file pattern to read from. (Example: gs://bucket-name/files/*.json). -* **outputTopic** : The Pub/Sub input topic to write to. The name must be in the format `projects//topics/`. (Example: projects/your-project-id/topics/your-topic-name). +* **inputFilePattern**: The input file pattern to read from. For example, `gs://bucket-name/files/*.json`. +* **outputTopic**: The Pub/Sub input topic to write to. The name must be in the format `projects//topics/`. For example, `projects/your-project-id/topics/your-topic-name`. 
### Optional parameters @@ -196,8 +196,8 @@ resource "google_dataflow_job" "gcs_text_to_cloud_pubsub" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://bucket-name/files/*.json" - outputTopic = "projects/your-project-id/topics/your-topic-name" + inputFilePattern = "" + outputTopic = "" } } ``` diff --git a/v1/README_GCS_Text_to_Cloud_Spanner.md b/v1/README_GCS_Text_to_Cloud_Spanner.md index de286dae99..01ce3a7800 100644 --- a/v1/README_GCS_Text_to_Cloud_Spanner.md +++ b/v1/README_GCS_Text_to_Cloud_Spanner.md @@ -17,24 +17,24 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **instanceId** : The instance ID of the Spanner database. -* **databaseId** : The database ID of the Spanner database. -* **importManifest** : The path in Cloud Storage to use when importing manifest files. (Example: gs://your-bucket/your-folder/your-manifest.json). +* **instanceId**: The instance ID of the Spanner database. +* **databaseId**: The database ID of the Spanner database. +* **importManifest**: The path in Cloud Storage to use when importing manifest files. For example, `gs://your-bucket/your-folder/your-manifest.json`. ### Optional parameters -* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **columnDelimiter** : The column delimiter that the source file uses. The default value is ','. (Example: ,). -* **fieldQualifier** : The character that must surround any value in the source file that contains the columnDelimiter. The default value is ". -* **trailingDelimiter** : Specifies whether the lines in the source files have trailing delimiters, that is, whether the `columnDelimiter` character appears at the end of each line, after the last column value). The default value is `true`. 
-* **escape** : The escape character the source file uses. By default, this parameter is not set and the template does not use the escape character. -* **nullString** : The string that represents a `NULL` value. By default, this parameter is not set and the template does not use the null string. -* **dateFormat** : The format used to parse date columns. By default, the pipeline tries to parse the date columns as `yyyy-M-d[' 00:00:00']`, for example, as 2019-01-31 or 2019-1-1 00:00:00. If your date format is different, specify the format using the java.time.format.DateTimeFormatter (https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/time/format/DateTimeFormatter.html) patterns. -* **timestampFormat** : The format used to parse timestamp columns. If the timestamp is a long integer, then it is parsed as Unix epoch time. Otherwise, it is parsed as a string using the java.time.format.DateTimeFormatter.ISO_INSTANT (https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_INSTANT) format. For other cases, specify your own pattern string, for example, using `MMM dd yyyy HH:mm:ss.SSSVV` for timestamps in the form of `"Jan 21 1998 01:02:03.456+08:00"`. -* **spannerProjectId** : The ID of the Google Cloud project that contains the Spanner database. If not set, the project ID of the default Google Cloud project is used. -* **spannerPriority** : The request priority for Spanner calls. Possible values are HIGH, MEDIUM, and LOW. The default value is MEDIUM. -* **handleNewLine** : If `true`, the input data can contain newline characters. Otherwise, newline characters cause an error. The default value is `false`. Enabling newline handling can reduce performance. -* **invalidOutputPath** : The Cloud Storage path to use when writing rows that cannot be imported. (Example: gs://your-bucket/your-path). Defaults to empty. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. 
For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. +* **columnDelimiter**: The column delimiter that the source file uses. The default value is `,`. For example, `,`. +* **fieldQualifier**: The character that must surround any value in the source file that contains the columnDelimiter. The default value is double quotes. +* **trailingDelimiter**: Specifies whether the lines in the source files have trailing delimiters, that is, whether the `columnDelimiter` character appears at the end of each line, after the last column value. The default value is `true`. +* **escape**: The escape character the source file uses. By default, this parameter is not set and the template does not use the escape character. +* **nullString**: The string that represents a `NULL` value. By default, this parameter is not set and the template does not use the null string. +* **dateFormat**: The format used to parse date columns. By default, the pipeline tries to parse the date columns as `yyyy-M-d[' 00:00:00']`, for example, as `2019-01-31` or `2019-1-1 00:00:00`. If your date format is different, specify the format using the java.time.format.DateTimeFormatter (https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/time/format/DateTimeFormatter.html) patterns. +* **timestampFormat**: The format used to parse timestamp columns. If the timestamp is a long integer, then it is parsed as Unix epoch time. Otherwise, it is parsed as a string using the java.time.format.DateTimeFormatter.ISO_INSTANT (https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_INSTANT) format. For other cases, specify your own pattern string, for example, using `MMM dd yyyy HH:mm:ss.SSSVV` for timestamps in the form of `Jan 21 1998 01:02:03.456+08:00`. +* **spannerProjectId**: The ID of the Google Cloud project that contains the Spanner database. 
If not set, the project ID of the default Google Cloud project is used. +* **spannerPriority**: The request priority for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `MEDIUM`. +* **handleNewLine**: If `true`, the input data can contain newline characters. Otherwise, newline characters cause an error. The default value is `false`. Enabling newline handling can reduce performance. +* **invalidOutputPath**: The Cloud Storage path to use when writing rows that cannot be imported. For example, `gs://your-bucket/your-path`. Defaults to empty. @@ -243,7 +243,7 @@ resource "google_dataflow_job" "gcs_text_to_cloud_spanner" { parameters = { instanceId = "" databaseId = "" - importManifest = "gs://your-bucket/your-folder/your-manifest.json" + importManifest = "" # spannerHost = "https://batch-spanner.googleapis.com" # columnDelimiter = "," # fieldQualifier = """ @@ -255,7 +255,7 @@ resource "google_dataflow_job" "gcs_text_to_cloud_spanner" { # spannerProjectId = "" # spannerPriority = "" # handleNewLine = "false" - # invalidOutputPath = "gs://your-bucket/your-path" + # invalidOutputPath = "" } } ``` diff --git a/v1/README_GCS_Text_to_Datastore.md b/v1/README_GCS_Text_to_Datastore.md index a7ac8e2d9f..179d9c9570 100644 --- a/v1/README_GCS_Text_to_Datastore.md +++ b/v1/README_GCS_Text_to_Datastore.md @@ -20,15 +20,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **textReadPattern** : A Cloud Storage path pattern that specifies the location of your text data files. For example, `gs://mybucket/somepath/*.json`. -* **datastoreWriteProjectId** : The ID of the Google Cloud project to write the Datastore entities to. -* **errorWritePath** : The error log output file to use for write failures that occur during processing. (Example: gs://your-bucket/errors/). +* **textReadPattern**: A Cloud Storage path pattern that specifies the location of your text data files. 
For example, `gs://mybucket/somepath/*.json`. +* **datastoreWriteProjectId**: The ID of the Google Cloud project to write the Datastore entities to. +* **errorWritePath**: The error log output file to use for write failures that occur during processing. For example, `gs://your-bucket/errors/`. ### Optional parameters -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **datastoreHintNumWorkers** : Hint for the expected number of workers in the Datastore ramp-up throttling step. Default is `500`. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **datastoreHintNumWorkers**: Hint for the expected number of workers in the Datastore ramp-up throttling step. Defaults to `500`. 
## User-Defined functions (UDFs) @@ -220,7 +220,7 @@ resource "google_dataflow_job" "gcs_text_to_datastore" { parameters = { textReadPattern = "" datastoreWriteProjectId = "" - errorWritePath = "gs://your-bucket/errors/" + errorWritePath = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # datastoreHintNumWorkers = "500" diff --git a/v1/README_GCS_Text_to_Firestore.md b/v1/README_GCS_Text_to_Firestore.md index f0e1f4a4bd..522ffd0498 100644 --- a/v1/README_GCS_Text_to_Firestore.md +++ b/v1/README_GCS_Text_to_Firestore.md @@ -20,15 +20,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **textReadPattern** : A Cloud Storage path pattern that specifies the location of your text data files. For example, `gs://mybucket/somepath/*.json`. -* **firestoreWriteProjectId** : The ID of the Google Cloud project to write the Firestore entities to. -* **errorWritePath** : The error log output file to use for write failures that occur during processing. (Example: gs://your-bucket/errors/). +* **textReadPattern**: A Cloud Storage path pattern that specifies the location of your text data files. For example, `gs://mybucket/somepath/*.json`. +* **firestoreWriteProjectId**: The ID of the Google Cloud project to write the Firestore entities to. +* **errorWritePath**: The error log output file to use for write failures that occur during processing. For example, `gs://your-bucket/errors/`. ### Optional parameters -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **firestoreHintNumWorkers** : Hint for the expected number of workers in the Firestore ramp-up throttling step. Default is 500. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **firestoreHintNumWorkers**: Hint for the expected number of workers in the Firestore ramp-up throttling step. The default value is `500`. ## User-Defined functions (UDFs) @@ -220,7 +220,7 @@ resource "google_dataflow_job" "gcs_text_to_firestore" { parameters = { textReadPattern = "" firestoreWriteProjectId = "" - errorWritePath = "gs://your-bucket/errors/" + errorWritePath = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # firestoreHintNumWorkers = "" diff --git a/v1/README_Jdbc_to_BigQuery.md b/v1/README_Jdbc_to_BigQuery.md index 435929e589..6ebdb7b27c 100644 --- a/v1/README_Jdbc_to_BigQuery.md +++ b/v1/README_Jdbc_to_BigQuery.md @@ -25,22 +25,22 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverJars** : Comma separate Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionURL** : Url connection string to connect to the JDBC source. (Example: jdbc:mysql://some-host:3306/sampledb). 
-* **query** : Query to be executed on the source to extract the data. If a Cloud Storage path is given (gs://...), the query will be fetched from that file. (Example: select * from sampledb.sample_table). -* **outputTable** : BigQuery table location to write the output to. The table's schema must match the input objects. -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process (Example: gs://your-bucket/your-files/temp_dir). +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionURL**: URL connection string to connect to the JDBC source. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **query**: Query to be executed on the source to extract the data. If a Cloud Storage path is given (gs://...), the query will be fetched from that file. For example, `select * from sampledb.sample_table`. +* **outputTable**: BigQuery table location to write the output to. The table's schema must match the input objects. +* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **username** : User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted.
Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **useColumnAlias** : If enabled (set to true) the pipeline will consider column alias ("AS") instead of the column name to map the rows to BigQuery. Defaults to false. -* **disabledAlgorithms** : Comma-separated list of algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma-separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`.
+* **useColumnAlias**: If enabled (set to true) the pipeline will consider column alias ("AS") instead of the column name to map the rows to BigQuery. Defaults to false. +* **disabledAlgorithms**: Comma-separated list of algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma-separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. @@ -241,19 +241,19 @@ resource "google_dataflow_job" "jdbc_to_bigquery" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - driverClassName = "com.mysql.jdbc.Driver" - connectionURL = "jdbc:mysql://some-host:3306/sampledb" - query = "select * from sampledb.sample_table" + driverJars = "" + driverClassName = "" + connectionURL = "" + query = "" outputTable = "" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + bigQueryLoadingTemporaryDirectory = "" + # connectionProperties = "" # username = "" # password = "" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # KMSEncryptionKey = "" # useColumnAlias = "false" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v1/README_PubSub_Subscription_to_BigQuery.md b/v1/README_PubSub_Subscription_to_BigQuery.md index 4b3607e31b..abbe209540 100644 --- a/v1/README_PubSub_Subscription_to_BigQuery.md +++ 
b/v1/README_PubSub_Subscription_to_BigQuery.md @@ -20,15 +20,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : The BigQuery output table location, in the format `:.`. -* **inputSubscription** : The Pub/Sub input subscription to read from, in the format `projects//subscriptions/`. +* **outputTableSpec**: The BigQuery output table location, in the format `:.`. +* **inputSubscription**: The Pub/Sub input subscription to read from, in the format `projects//subscriptions/`. ### Optional parameters -* **outputDeadletterTable** : The BigQuery table to use for messages that fail to reach the output table, in the format of `:.`. If the table doesn't exist, it is created during pipeline execution. If not specified, `OUTPUT_TABLE_SPEC_error_records` is used. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. +* **outputDeadletterTable**: The BigQuery table to use for messages that fail to reach the output table, in the format of `:.`. If the table doesn't exist, it is created during pipeline execution. If not specified, `OUTPUT_TABLE_SPEC_error_records` is used. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. 
For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. ## User-Defined functions (UDFs) diff --git a/v1/README_PubSub_to_BigQuery.md b/v1/README_PubSub_to_BigQuery.md index fdab41b3af..bf88206103 100644 --- a/v1/README_PubSub_to_BigQuery.md +++ b/v1/README_PubSub_to_BigQuery.md @@ -20,15 +20,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : The BigQuery output table location, in the format `:.`. -* **inputTopic** : The Pub/Sub topic to read the input from. +* **outputTableSpec**: The BigQuery output table location, in the format `:.`. +* **inputTopic**: The Pub/Sub topic to read the input from. ### Optional parameters -* **outputDeadletterTable** : The BigQuery table to use for messages that fail to reach the output table, in the format of `:.`. If the table doesn't exist, it is created during pipeline execution. If not specified, `OUTPUT_TABLE_SPEC_error_records` is used. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. +* **outputDeadletterTable**: The BigQuery table to use for messages that fail to reach the output table, in the format of `:.`. If the table doesn't exist, it is created during pipeline execution. If not specified, `OUTPUT_TABLE_SPEC_error_records` is used. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. ## User-Defined functions (UDFs) diff --git a/v1/README_Spanner_to_GCS_Text.md b/v1/README_Spanner_to_GCS_Text.md index 65e637306d..93ab53cfe8 100644 --- a/v1/README_Spanner_to_GCS_Text.md +++ b/v1/README_Spanner_to_GCS_Text.md @@ -18,19 +18,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **spannerTable** : The Spanner table to read the data from. -* **spannerProjectId** : The ID of the Google Cloud project that contains the Spanner database to read data from. -* **spannerInstanceId** : The instance ID of the requested table. -* **spannerDatabaseId** : The database ID of the requested table. 
-* **textWritePrefix** : The Cloud Storage path prefix that specifies where the data is written. (Example: gs://mybucket/somefolder/). +* **spannerTable**: The Spanner table to read the data from. +* **spannerProjectId**: The ID of the Google Cloud project that contains the Spanner database to read data from. +* **spannerInstanceId**: The instance ID of the requested table. +* **spannerDatabaseId**: The database ID of the requested table. +* **textWritePrefix**: The Cloud Storage path prefix that specifies where the data is written. For example, `gs://mybucket/somefolder/`. ### Optional parameters -* **csvTempDirectory** : The Cloud Storage path where temporary CSV files are written. (Example: gs://your-bucket/your-path). -* **spannerPriority** : The request priority (https://cloud.google.com/spanner/docs/reference/rest/v1/RequestOptions) for Spanner calls. Possible values are `HIGH`, `MEDIUM`, `LOW`. The default value is `MEDIUM`. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **spannerSnapshotTime** : The timestamp that corresponds to the version of the Spanner database that you want to read from. The timestamp must be specified in the RFC 3339 (https://tools.ietf.org/html/rfc3339) UTC "Zulu" format. The timestamp must be in the past and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. (Example: 1990-12-31T23:59:60Z). Defaults to empty. -* **dataBoostEnabled** : Set to `true` to use the compute resources of Spanner Data Boost to run the job with near-zero impact on Spanner OLTP workflows. When true, requires the `spanner.databases.useDataBoost` Identity and Access Management (IAM) permission. For more information, see Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false. 
+* **csvTempDirectory**: The Cloud Storage path where temporary CSV files are written. For example, `gs://your-bucket/your-path`. +* **spannerPriority**: The request priority (https://cloud.google.com/spanner/docs/reference/rest/v1/RequestOptions) for Spanner calls. Possible values are `HIGH`, `MEDIUM`, `LOW`. The default value is `MEDIUM`. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. +* **spannerSnapshotTime**: The timestamp that corresponds to the version of the Spanner database that you want to read from. The timestamp must be specified in the RFC 3339 (https://tools.ietf.org/html/rfc3339) UTC Zulu Time format. The timestamp must be in the past and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. For example, `1990-12-31T23:59:60Z`. Defaults to empty. +* **dataBoostEnabled**: Set to `true` to use the compute resources of Spanner Data Boost to run the job with near-zero impact on Spanner OLTP workflows. When true, requires the `spanner.databases.useDataBoost` Identity and Access Management (IAM) permission. For more information, see Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false. 
@@ -226,11 +226,11 @@ resource "google_dataflow_job" "spanner_to_gcs_text" { spannerProjectId = "" spannerInstanceId = "" spannerDatabaseId = "" - textWritePrefix = "gs://mybucket/somefolder/" - # csvTempDirectory = "gs://your-bucket/your-path" + textWritePrefix = "" + # csvTempDirectory = "" # spannerPriority = "" # spannerHost = "https://batch-spanner.googleapis.com" - # spannerSnapshotTime = "1990-12-31T23:59:60Z" + # spannerSnapshotTime = "" # dataBoostEnabled = "false" } } diff --git a/v1/README_Stream_DLP_GCS_Text_to_BigQuery.md b/v1/README_Stream_DLP_GCS_Text_to_BigQuery.md index 7f8079200a..41f81a6f12 100644 --- a/v1/README_Stream_DLP_GCS_Text_to_BigQuery.md +++ b/v1/README_Stream_DLP_GCS_Text_to_BigQuery.md @@ -34,15 +34,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The CSV files to read input data records from. Wildcards are also accepted. (Example: gs://mybucket/my_csv_filename.csv or gs://mybucket/file-*.csv). -* **deidentifyTemplateName** : The Sensitive Data Protection de-identification template to use for API requests, specified with the pattern projects//deidentifyTemplates/. (Example: projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id). -* **datasetName** : The BigQuery dataset to use when sending tokenized results. The dataset must exist prior to execution. -* **dlpProjectId** : The ID for the Google Cloud project that owns the DLP API resource. This project can be the same project that owns the Sensitive Data Protection templates, or it can be a separate project. +* **inputFilePattern**: The CSV files to read input data records from. Wildcards are also accepted. For example, `gs://mybucket/my_csv_filename.csv` or `gs://mybucket/file-*.csv`. +* **deidentifyTemplateName**: The Sensitive Data Protection de-identification template to use for API requests, specified with the pattern `projects//deidentifyTemplates/`. 
For example, `projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id`. +* **datasetName**: The BigQuery dataset to use when sending tokenized results. The dataset must exist prior to execution. +* **dlpProjectId**: The ID for the Google Cloud project that owns the DLP API resource. This project can be the same project that owns the Sensitive Data Protection templates, or it can be a separate project. ### Optional parameters -* **inspectTemplateName** : The Sensitive Data Protection inspection template to use for API requests, specified with the pattern projects//identifyTemplates/. (Example: projects/your-project-id/locations/global/inspectTemplates/generated_template_id). -* **batchSize** : The chunking or batch size to use for sending data to inspect and detokenize. For a CSV file, the value of `batchSize` is the number of rows in a batch. Determine the batch size based on the size of the records and the sizing of the file. The DLP API has a payload size limit of 524 KB per API call. +* **inspectTemplateName**: The Sensitive Data Protection inspection template to use for API requests, specified with the pattern `projects//inspectTemplates/`. For example, `projects/your-project-id/locations/global/inspectTemplates/generated_template_id`. +* **batchSize**: The chunking or batch size to use for sending data to inspect and detokenize. For a CSV file, the value of `batchSize` is the number of rows in a batch. Determine the batch size based on the size of the records and the sizing of the file. The DLP API has a payload size limit of 524 KB per API call. 
@@ -222,11 +222,11 @@ resource "google_dataflow_job" "stream_dlp_gcs_text_to_bigquery" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://mybucket/my_csv_filename.csv or gs://mybucket/file-*.csv" - deidentifyTemplateName = "projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id" + inputFilePattern = "" + deidentifyTemplateName = "" datasetName = "" dlpProjectId = "" - # inspectTemplateName = "projects/your-project-id/locations/global/inspectTemplates/generated_template_id" + # inspectTemplateName = "" # batchSize = "" } } diff --git a/v1/README_Stream_GCS_Text_to_BigQuery.md b/v1/README_Stream_GCS_Text_to_BigQuery.md index edaa98aea2..5abd7dd324 100644 --- a/v1/README_Stream_GCS_Text_to_BigQuery.md +++ b/v1/README_Stream_GCS_Text_to_BigQuery.md @@ -30,8 +30,8 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : Path of the file pattern glob to read from. (Example: gs://your-bucket/path/*.csv). -* **JSONPath** : JSON file with BigQuery Schema description. JSON Example: { +* **inputFilePattern**: Path of the file pattern glob to read from. For example, `gs://your-bucket/path/*.csv`. +* **JSONPath**: JSON file with BigQuery Schema description. JSON Example: { "BigQuery Schema": [ { "name": "location", @@ -55,15 +55,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat } ] }. -* **outputTable** : BigQuery table location to write the output to. The table's schema must match the input objects. -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process (Example: gs://your-bucket/your-files/temp_dir). +* **outputTable**: BigQuery table location to write the output to. The table's schema must match the input objects. 
+* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **outputDeadletterTable** : BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. (Example: your-project-id:your-dataset.your-table-name). -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. +* **outputDeadletterTable**: BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. For example, `your-project-id:your-dataset.your-table-name`. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. 
For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Define the interval that workers may check for JavaScript UDF changes to reload the files. Defaults to: 0. ## User-Defined functions (UDFs) @@ -259,11 +259,11 @@ resource "google_dataflow_job" "stream_gcs_text_to_bigquery" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://your-bucket/path/*.csv" + inputFilePattern = "" JSONPath = "" outputTable = "" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" + bigQueryLoadingTemporaryDirectory = "" + # outputDeadletterTable = "" # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" diff --git a/v1/README_Stream_GCS_Text_to_Cloud_PubSub.md b/v1/README_Stream_GCS_Text_to_Cloud_PubSub.md index bb93150e7b..0786cd1296 100644 --- a/v1/README_Stream_GCS_Text_to_Cloud_PubSub.md +++ b/v1/README_Stream_GCS_Text_to_Cloud_PubSub.md @@ -32,8 +32,8 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The input file pattern to read from. (Example: gs://bucket-name/files/*.json). -* **outputTopic** : The Pub/Sub input topic to write to. The name must be in the format `projects//topics/`. (Example: projects/your-project-id/topics/your-topic-name). +* **inputFilePattern**: The input file pattern to read from. For example, `gs://bucket-name/files/*.json`. +* **outputTopic**: The Pub/Sub input topic to write to. The name must be in the format `projects//topics/`. 
For example, `projects/your-project-id/topics/your-topic-name`. ### Optional parameters @@ -204,8 +204,8 @@ resource "google_dataflow_job" "stream_gcs_text_to_cloud_pubsub" { region = var.region temp_gcs_location = "gs://bucket-name-here/temp" parameters = { - inputFilePattern = "gs://bucket-name/files/*.json" - outputTopic = "projects/your-project-id/topics/your-topic-name" + inputFilePattern = "" + outputTopic = "" } } ``` diff --git a/v1/README_Word_Count.md b/v1/README_Word_Count.md index 0763b723f1..2e7ec5e8c1 100644 --- a/v1/README_Word_Count.md +++ b/v1/README_Word_Count.md @@ -14,8 +14,8 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFile** : The input file pattern Dataflow reads from. Use the example file (gs://dataflow-samples/shakespeare/kinglear.txt) or enter the path to your own using the same format: gs://your-bucket/your-file.txt. -* **output** : Path and filename prefix for writing output files. Ex: gs://your-bucket/counts. +* **inputFile**: The input file pattern Dataflow reads from. Use the example file (gs://dataflow-samples/shakespeare/kinglear.txt) or enter the path to your own using the same format: gs://your-bucket/your-file.txt. +* **output**: Path and filename prefix for writing output files. Ex: gs://your-bucket/counts. 
### Optional parameters diff --git a/v1/src/main/java/com/google/cloud/teleport/bigtable/AvroToBigtable.java b/v1/src/main/java/com/google/cloud/teleport/bigtable/AvroToBigtable.java index 77aa88891e..0d89795f3b 100644 --- a/v1/src/main/java/com/google/cloud/teleport/bigtable/AvroToBigtable.java +++ b/v1/src/main/java/com/google/cloud/teleport/bigtable/AvroToBigtable.java @@ -114,7 +114,7 @@ public interface Options extends PipelineOptions { groupName = "Source", description = "Input Cloud Storage File(s)", helpText = "The Cloud Storage path pattern where data is located.", - example = "gs:////*") + example = "gs:///FOLDER/PREFIX*") ValueProvider getInputFilePattern(); @SuppressWarnings("unused") @@ -125,7 +125,7 @@ public interface Options extends PipelineOptions { optional = true, description = "If true, large rows will be split into multiple MutateRows requests", helpText = - "The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. ") + "The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic.") ValueProvider getSplitLargeRows(); void setSplitLargeRows(ValueProvider splitLargeRows); diff --git a/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToJson.java b/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToJson.java index 56e6ecb163..e3fb5ffcdb 100644 --- a/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToJson.java +++ b/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToJson.java @@ -128,7 +128,7 @@ public interface Options extends PipelineOptions { optional = true, description = "JSON file prefix", helpText = - "The prefix of the JSON file name. For example, \"table1-\". If no value is provided, defaults to `part`.") + "The prefix of the JSON file name. 
For example, `table1-`. If no value is provided, defaults to `part`.") @Default.String("part") ValueProvider getFilenamePrefix(); diff --git a/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToParquet.java b/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToParquet.java index 4e3c5e7798..cc0855032e 100644 --- a/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToParquet.java +++ b/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToParquet.java @@ -111,7 +111,7 @@ public interface Options extends PipelineOptions { groupName = "Target", description = "Output file directory in Cloud Storage", helpText = - "The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse the directory path for date and time formatters. For example: gs://your-bucket/your-path.") + "The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse the directory path for date and time formatters. For example: `gs://your-bucket/your-path`.") ValueProvider getOutputDirectory(); @SuppressWarnings("unused") @@ -122,7 +122,7 @@ public interface Options extends PipelineOptions { groupName = "Target", description = "Parquet file prefix", helpText = - "The prefix of the Parquet file name. For example, \"table1-\". Defaults to: part.") + "The prefix of the Parquet file name. For example, `table1-`. 
Defaults to: `part`.") @Default.String("part") ValueProvider getFilenamePrefix(); diff --git a/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToVectorEmbeddings.java b/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToVectorEmbeddings.java index 953d62f341..4f51154910 100644 --- a/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToVectorEmbeddings.java +++ b/v1/src/main/java/com/google/cloud/teleport/bigtable/BigtableToVectorEmbeddings.java @@ -134,7 +134,7 @@ public interface Options extends PipelineOptions { optional = true, description = "JSON file prefix", helpText = - "The prefix of the JSON filename. For example: \"table1-\". If no value is provided, defaults to \"part\".") + "The prefix of the JSON filename. For example: `table1-`. If no value is provided, defaults to `part`.") @Default.String("part") ValueProvider getFilenamePrefix(); @@ -145,7 +145,7 @@ public interface Options extends PipelineOptions { order = 6, description = "ID column", helpText = - "The fully qualified column name where the ID is stored. In the format cf:col or _key.") + "The fully qualified column name where the ID is stored. In the format `cf:col` or `_key`.") ValueProvider getIdColumn(); @SuppressWarnings("unused") @@ -155,7 +155,7 @@ public interface Options extends PipelineOptions { order = 7, description = "Embedding column", helpText = - "The fully qualified column name where the embeddings are stored. In the format cf:col or _key.") + "The fully qualified column name where the embeddings are stored. In the format `cf:col` or `_key`.") ValueProvider getEmbeddingColumn(); @SuppressWarnings("unused") @@ -166,7 +166,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Crowding tag column", helpText = - "The fully qualified column name where the crowding tag is stored. In the format cf:col or _key.") + "The fully qualified column name where the crowding tag is stored. 
In the format `cf:col` or `_key`.") ValueProvider getCrowdingTagColumn(); @SuppressWarnings("unused") @@ -177,7 +177,7 @@ public interface Options extends PipelineOptions { optional = true, description = "The byte size of the embeddings array. Can be 4 or 8.", helpText = - "The byte size of each entry in the embeddings array. For float, use the value 4. For double, use the value 8. Defaults to 4.") + "The byte size of each entry in the embeddings array. For float, use the value `4`. For double, use the value `8`. Defaults to `4`.") @Default.Integer(4) ValueProvider getEmbeddingByteSize(); @@ -189,7 +189,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Allow restricts mappings", helpText = - "The comma-separated, fully qualified column names for the columns to use as the allow restricts, with their aliases. In the format cf:col->alias.") + "The comma-separated, fully qualified column names for the columns to use as the allow restricts, with their aliases. In the format `cf:col->alias`.") ValueProvider getAllowRestrictsMappings(); @SuppressWarnings("unused") @@ -200,7 +200,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Deny restricts mappings", helpText = - "The comma-separated, fully qualified column names for the columns to use as the deny restricts, with their aliases. In the format cf:col->alias.") + "The comma-separated, fully qualified column names for the columns to use as the deny restricts, with their aliases. In the format `cf:col->alias`.") ValueProvider getDenyRestrictsMappings(); @SuppressWarnings("unused") @@ -211,7 +211,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Integer numeric restricts mappings", helpText = - "The comma-separated, fully qualified column names of the columns to use as integer numeric_restricts, with their aliases. 
In the format cf:col->alias.") + "The comma-separated, fully qualified column names of the columns to use as integer numeric_restricts, with their aliases. In the format `cf:col->alias`.") ValueProvider getIntNumericRestrictsMappings(); @SuppressWarnings("unused") @@ -222,7 +222,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Float numeric restricts mappings", helpText = - "The comma-separated, fully qualified column names of the columns to use as float (4 bytes) numeric_restricts, with their aliases. In the format cf:col->alias.") + "The comma-separated, fully qualified column names of the columns to use as float (4 bytes) numeric_restricts, with their aliases. In the format `cf:col->alias`.") ValueProvider getFloatNumericRestrictsMappings(); @SuppressWarnings("unused") @@ -233,7 +233,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Double numeric restricts mappings", helpText = - "The comma-separated, fully qualified column names of the columns to use as double (8 bytes) numeric_restricts, with their aliases. In the format cf:col->alias.") + "The comma-separated, fully qualified column names of the columns to use as double (8 bytes) numeric_restricts, with their aliases. In the format `cf:col->alias`.") ValueProvider getDoubleNumericRestrictsMappings(); @SuppressWarnings("unused") diff --git a/v1/src/main/java/com/google/cloud/teleport/bigtable/CassandraToBigtable.java b/v1/src/main/java/com/google/cloud/teleport/bigtable/CassandraToBigtable.java index 71d99346d6..b60ca3cf90 100644 --- a/v1/src/main/java/com/google/cloud/teleport/bigtable/CassandraToBigtable.java +++ b/v1/src/main/java/com/google/cloud/teleport/bigtable/CassandraToBigtable.java @@ -87,7 +87,7 @@ public interface Options extends PipelineOptions { optional = true, description = "Cassandra Port", helpText = - "The TCP port to use to reach Apache Cassandra on the nodes. 
The default value is 9042.") + "The TCP port to use to reach Apache Cassandra on the nodes. The default value is `9042`.") @Default.Integer(9042) ValueProvider getCassandraPort(); @@ -155,7 +155,7 @@ public interface Options extends PipelineOptions { regexes = {"[-_.a-zA-Z0-9]+"}, description = "The Default Bigtable Column Family", helpText = - "The name of the column family of the Bigtable table. The default value is default.") + "The name of the column family of the Bigtable table. The default value is `default`.") @Default.String("default") ValueProvider getDefaultColumnFamily(); @@ -167,7 +167,7 @@ public interface Options extends PipelineOptions { groupName = "Target", optional = true, description = "The Row Key Separator", - helpText = "The separator used to build row-keys. The default value is '#'.") + helpText = "The separator used to build row-keys. The default value is `#`.") @Default.String("#") ValueProvider getRowKeySeparator(); diff --git a/v1/src/main/java/com/google/cloud/teleport/bigtable/ParquetToBigtable.java b/v1/src/main/java/com/google/cloud/teleport/bigtable/ParquetToBigtable.java index ddd71186ac..90408e38fb 100644 --- a/v1/src/main/java/com/google/cloud/teleport/bigtable/ParquetToBigtable.java +++ b/v1/src/main/java/com/google/cloud/teleport/bigtable/ParquetToBigtable.java @@ -129,7 +129,7 @@ public interface Options extends PipelineOptions { optional = true, description = "If true, large rows will be split into multiple MutateRows requests", helpText = - "The flag for enabling splitting of large rows into multiple MutateRows requests. Note that when a large row is split between multiple API calls, the updates to the row are not atomic. ") + "The flag for enabling splitting of large rows into multiple MutateRows requests. 
Note that when a large row is split between multiple API calls, the updates to the row are not atomic.") ValueProvider getSplitLargeRows(); void setSplitLargeRows(ValueProvider splitLargeRows); diff --git a/v1/src/main/java/com/google/cloud/teleport/options/WindowedFilenamePolicyOptions.java b/v1/src/main/java/com/google/cloud/teleport/options/WindowedFilenamePolicyOptions.java index 73354f402c..b548399cca 100644 --- a/v1/src/main/java/com/google/cloud/teleport/options/WindowedFilenamePolicyOptions.java +++ b/v1/src/main/java/com/google/cloud/teleport/options/WindowedFilenamePolicyOptions.java @@ -72,7 +72,7 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { helpText = "Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no" + " difference in the year. Optionally, wrap the pattern with characters that" - + " aren't alphanumeric or the directory ('/') character. Defaults to `YYYY`.") + + " aren't alphanumeric or the directory (`/`) character. Defaults to `YYYY`.") ValueProvider getYearPattern(); void setYearPattern(ValueProvider yearPattern); @@ -85,7 +85,7 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { helpText = "Pattern for formatting the month. Must be one or more of the `M` character. " + "Optionally, wrap the pattern with characters that aren't alphanumeric or the " - + "directory ('/') character. Defaults to `MM`.") + + "directory (`/`) character. Defaults to `MM`.") ValueProvider getMonthPattern(); void setMonthPattern(ValueProvider monthPattern); @@ -98,7 +98,7 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { helpText = "Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for" + " day of year. Optionally," - + " wrap the pattern with characters that aren't alphanumeric or the directory ('/')" + + " wrap the pattern with characters that aren't alphanumeric or the directory (`/`)" + " character. 
Defaults to `dd`.") ValueProvider getDayPattern(); @@ -112,7 +112,7 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { helpText = "Pattern for formatting the hour. Must be one or more of the `H` character. Optionally," + " wrap the pattern with characters that aren't alphanumeric or the directory" - + " ('/') character. Defaults to `HH`.") + + " (`/`) character. Defaults to `HH`.") ValueProvider getHourPattern(); void setHourPattern(ValueProvider hourPattern); @@ -125,7 +125,7 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { helpText = "Pattern for formatting the minute. Must be one or more of the `m` character. Optionally," + " wrap the pattern with characters that aren't alphanumeric or the directory" - + " ('/') character. Defaults to `mm`.") + + " (`/`) character. Defaults to `mm`.") ValueProvider getMinutePattern(); void setMinutePattern(ValueProvider minutePattern); diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverter.java b/v1/src/main/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverter.java index 4047d86f90..39c6097a58 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverter.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverter.java @@ -18,6 +18,7 @@ import static com.google.cloud.teleport.spanner.AvroUtil.DEFAULT_EXPRESSION; import static com.google.cloud.teleport.spanner.AvroUtil.GENERATION_EXPRESSION; import static com.google.cloud.teleport.spanner.AvroUtil.HIDDEN; +import static com.google.cloud.teleport.spanner.AvroUtil.IDENTITY_COLUMN; import static com.google.cloud.teleport.spanner.AvroUtil.INPUT; import static com.google.cloud.teleport.spanner.AvroUtil.NOT_NULL; import static com.google.cloud.teleport.spanner.AvroUtil.OUTPUT; @@ -103,9 +104,6 @@ public Ddl toDdl(Collection avroSchemas) { builder.addChangeStream(toChangeStream(null, schema)); } else if 
(schema.getProp(SPANNER_SEQUENCE_OPTION + "0") != null || schema.getProp(SPANNER_SEQUENCE_KIND) != null) { - // Cloud Sequence always requires at least one option, - // `sequence_kind='bit_reversed_positive`, so `sequenceOption_0` must - // always be valid. builder.addSequence(toSequence(null, schema)); } else if (SPANNER_NAMED_SCHEMA.equals(schema.getProp(SPANNER_ENTITY))) { builder.addSchema(toSchema(null, schema)); @@ -454,7 +452,8 @@ public Sequence toSequence(String sequenceName, Schema schema) { LOG.debug("Converting to Ddl sequenceName {}", sequenceName); Sequence.Builder builder = Sequence.builder(dialect).name(sequenceName); - if (schema.getProp(SPANNER_SEQUENCE_KIND) != null) { + if (schema.getProp(SPANNER_SEQUENCE_KIND) != null + && schema.getProp(SPANNER_SEQUENCE_KIND).equals("bit_reversed_positive")) { builder.sequenceKind(schema.getProp(SPANNER_SEQUENCE_KIND)); } if (schema.getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN) != null @@ -469,7 +468,12 @@ public Sequence toSequence(String sequenceName, Schema schema) { ImmutableList.Builder sequenceOptions = ImmutableList.builder(); for (int i = 0; schema.getProp(SPANNER_SEQUENCE_OPTION + i) != null; i++) { - sequenceOptions.add(schema.getProp(SPANNER_SEQUENCE_OPTION + i)); + String prop = schema.getProp(SPANNER_SEQUENCE_OPTION + i); + if (prop.equals("sequence_kind=default")) { + // Specify no sequence kind by using the default_sequence_kind database option. 
+ continue; + } + sequenceOptions.add(prop); } builder.options(sequenceOptions.build()); @@ -509,6 +513,22 @@ public Table toTable(String tableName, Schema schema) { Column.Builder column = table.column(f.name()); String sqlType = f.getProp(SQL_TYPE); String expression = f.getProp(GENERATION_EXPRESSION); + String identityColumn = f.getProp(IDENTITY_COLUMN); + if (identityColumn != null && Boolean.parseBoolean(identityColumn)) { + column.isIdentityColumn(true); + if (f.getProp(SPANNER_SEQUENCE_KIND) != null) { + column.sequenceKind(f.getProp(SPANNER_SEQUENCE_KIND)); + } + if (f.getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN) != null + && f.getProp(SPANNER_SEQUENCE_SKIP_RANGE_MAX) != null) { + column + .skipRangeMin(Long.valueOf(f.getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN))) + .skipRangeMax(Long.valueOf(f.getProp(SPANNER_SEQUENCE_SKIP_RANGE_MAX))); + } + if (f.getProp(SPANNER_SEQUENCE_COUNTER_START) != null) { + column.counterStartValue(Long.valueOf(f.getProp(SPANNER_SEQUENCE_COUNTER_START))); + } + } if (expression != null) { // This is a generated column. 
if (Strings.isNullOrEmpty(sqlType)) { diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/AvroUtil.java b/v1/src/main/java/com/google/cloud/teleport/spanner/AvroUtil.java index 544b3debf4..8e3782f862 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/AvroUtil.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/AvroUtil.java @@ -27,6 +27,7 @@ private AvroUtil() {} public static final String GENERATION_EXPRESSION = "generationExpression"; public static final String GOOGLE_FORMAT_VERSION = "googleFormatVersion"; public static final String GOOGLE_STORAGE = "googleStorage"; + public static final String IDENTITY_COLUMN = "identityColumn"; public static final String INPUT = "Input"; public static final String NOT_NULL = "notNull"; public static final String OUTPUT = "Output"; diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverter.java b/v1/src/main/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverter.java index c322c470fa..b0d1954a8a 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverter.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverter.java @@ -20,6 +20,7 @@ import static com.google.cloud.teleport.spanner.AvroUtil.GOOGLE_FORMAT_VERSION; import static com.google.cloud.teleport.spanner.AvroUtil.GOOGLE_STORAGE; import static com.google.cloud.teleport.spanner.AvroUtil.HIDDEN; +import static com.google.cloud.teleport.spanner.AvroUtil.IDENTITY_COLUMN; import static com.google.cloud.teleport.spanner.AvroUtil.INPUT; import static com.google.cloud.teleport.spanner.AvroUtil.NOT_NULL; import static com.google.cloud.teleport.spanner.AvroUtil.OUTPUT; @@ -175,7 +176,16 @@ public Collection convert(Ddl ddl) { // which are semantically logical entities. 
fieldBuilder.type(SchemaBuilder.builder().nullType()).withDefault(null); } else { - if (cm.defaultExpression() != null) { + if (cm.isIdentityColumn()) { + fieldBuilder.prop(IDENTITY_COLUMN, Boolean.toString(cm.isIdentityColumn())); + if (cm.sequenceKind() != null) { + fieldBuilder.prop(SPANNER_SEQUENCE_KIND, cm.sequenceKind()); + } + fieldBuilder.prop( + SPANNER_SEQUENCE_COUNTER_START, String.valueOf(cm.counterStartValue())); + fieldBuilder.prop(SPANNER_SEQUENCE_SKIP_RANGE_MIN, String.valueOf(cm.skipRangeMin())); + fieldBuilder.prop(SPANNER_SEQUENCE_SKIP_RANGE_MAX, String.valueOf(cm.skipRangeMax())); + } else if (cm.defaultExpression() != null) { fieldBuilder.prop(DEFAULT_EXPRESSION, cm.defaultExpression()); } Schema avroType = avroType(cm.type(), table.name() + "_" + columnOrdinal++); diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/ExportPipeline.java b/v1/src/main/java/com/google/cloud/teleport/spanner/ExportPipeline.java index 215664eb5d..393d0673d6 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/ExportPipeline.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/ExportPipeline.java @@ -172,7 +172,7 @@ public interface ExportPipelineOptions extends PipelineOptions { optional = true, description = "Export Timestamps as Timestamp-micros type", helpText = - "If true, timestamps are exported as a `long` type with `timestamp-micros` logical type. By default, this parameter is set to `false` and timestamps are exported as ISO-8601 strings at nanosecond precision.") + "If `true`, timestamps are exported as a `long` type with `timestamp-micros` logical type. 
By default, this parameter is set to `false` and timestamps are exported as ISO-8601 strings at nanosecond precision.") @Default.Boolean(false) ValueProvider getShouldExportTimestampAsLogicalType(); diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/TextImportPipeline.java b/v1/src/main/java/com/google/cloud/teleport/spanner/TextImportPipeline.java index b5c9618160..42bb5f6952 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/TextImportPipeline.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/TextImportPipeline.java @@ -152,7 +152,7 @@ public interface Options extends PipelineOptions { groupName = "Source", optional = true, description = "Column delimiter of the data files", - helpText = "The column delimiter that the source file uses. The default value is ','.", + helpText = "The column delimiter that the source file uses. The default value is `,`.", example = ",") @Default.Character(',') ValueProvider getColumnDelimiter(); @@ -166,7 +166,7 @@ public interface Options extends PipelineOptions { description = "Field qualifier used by the source file", helpText = "The character that must surround any value in the source file that " - + "contains the columnDelimiter. The default value is \".") + + "contains the columnDelimiter. The default value is double quotes.") @Default.Character('"') ValueProvider getFieldQualifier(); @@ -179,7 +179,7 @@ public interface Options extends PipelineOptions { description = "If true, the lines has trailing delimiters", helpText = "Specifies whether the lines in the source files have trailing delimiters, that is, whether the " - + "`columnDelimiter` character appears at the end of each line, after the last column value). " + + "`columnDelimiter` character appears at the end of each line, after the last column value. 
" + "The default value is `true`.") @Default.Boolean(true) ValueProvider getTrailingDelimiter(); @@ -218,7 +218,7 @@ public interface Options extends PipelineOptions { description = "Date format", helpText = "The format used to parse date columns. By default, the pipeline tries to parse the date columns " - + "as `yyyy-M-d[' 00:00:00']`, for example, as 2019-01-31 or 2019-1-1 00:00:00. If your date format " + + "as `yyyy-M-d[' 00:00:00']`, for example, as `2019-01-31` or `2019-1-1 00:00:00`. If your date format " + "is different, specify the format using the java.time.format.DateTimeFormatter " + "(https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/time/format/DateTimeFormatter.html) patterns.") ValueProvider getDateFormat(); @@ -232,11 +232,10 @@ public interface Options extends PipelineOptions { description = "Timestamp format", helpText = "The format used to parse timestamp columns. If the timestamp is a long integer, then it is parsed " - + "as Unix epoch time. Otherwise, it is parsed as a string using the " - + "java.time.format.DateTimeFormatter.ISO_INSTANT " + + "as Unix epoch time. Otherwise, it is parsed as a string using the java.time.format.DateTimeFormatter.ISO_INSTANT " + "(https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_INSTANT) format. " + "For other cases, specify your own pattern string, for example, using `MMM dd yyyy HH:mm:ss.SSSVV` " - + "for timestamps in the form of `\"Jan 21 1998 01:02:03.456+08:00\"`.") + + "for timestamps in the form of `Jan 21 1998 01:02:03.456+08:00`.") ValueProvider getTimestampFormat(); void setTimestampFormat(ValueProvider value); @@ -271,7 +270,7 @@ public interface Options extends PipelineOptions { description = "Priority for Spanner RPC invocations", helpText = "The request priority for Spanner calls. Possible values " - + "are HIGH, MEDIUM, and LOW. The default value is MEDIUM.") + + "are `HIGH`, `MEDIUM`, and `LOW`. 
The default value is `MEDIUM`.") ValueProvider getSpannerPriority(); void setSpannerPriority(ValueProvider value); diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Column.java b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Column.java index 6ef754f01a..4d3f43614a 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Column.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Column.java @@ -24,6 +24,8 @@ import com.google.common.collect.ImmutableList; import java.io.IOException; import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; import javax.annotation.Nullable; /** Cloud Spanner column. */ @@ -53,6 +55,20 @@ public abstract class Column implements Serializable { public abstract boolean isStored(); + public abstract boolean isIdentityColumn(); + + @Nullable + public abstract String sequenceKind(); + + @Nullable + public abstract Long counterStartValue(); + + @Nullable + public abstract Long skipRangeMin(); + + @Nullable + public abstract Long skipRangeMax(); + public abstract boolean isPlacementKey(); public abstract Dialect dialect(); @@ -68,6 +84,7 @@ public static Builder builder(Dialect dialect) { .columnOptions(ImmutableList.of()) .notNull(false) .isGenerated(false) + .isIdentityColumn(false) .isHidden(false) .generationExpression("") .isStored(false) @@ -96,6 +113,25 @@ public void prettyPrint(Appendable appendable) throws IOException { appendable.append(" (").append(defaultExpression()).append(")"); } } + if (isIdentityColumn()) { + appendable.append(" GENERATED BY DEFAULT AS IDENTITY"); + List options = new ArrayList<>(3); + if (sequenceKind() != null && sequenceKind().equalsIgnoreCase("bit_reversed_positive")) { + options.add("BIT_REVERSED_POSITIVE"); + } + if (skipRangeMin() != null && skipRangeMax() != null) { + options.add( + String.format( + "SKIP RANGE %d%s %d", + skipRangeMin(), dialect() == Dialect.POSTGRESQL ? 
"" : ",", skipRangeMax())); + } + if (counterStartValue() != null) { + options.add(String.format("START COUNTER WITH %d", counterStartValue())); + } + if (options.size() > 0) { + appendable.append(" (").append(String.join(" ", options)).append(")"); + } + } if (isGenerated()) { if (dialect() == Dialect.POSTGRESQL) { appendable.append(" GENERATED ALWAYS"); @@ -197,6 +233,16 @@ public Builder generatedAs(String expression) { public abstract Builder isStored(boolean generated); + public abstract Builder isIdentityColumn(boolean identityColumn); + + public abstract Builder sequenceKind(String sequenceKind); + + public abstract Builder counterStartValue(Long value); + + public abstract Builder skipRangeMin(Long value); + + public abstract Builder skipRangeMax(Long value); + public Builder stored() { return isStored(true); } diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/DatabaseOptionAllowlist.java b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/DatabaseOptionAllowlist.java index f0575e3e50..ed1679449b 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/DatabaseOptionAllowlist.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/DatabaseOptionAllowlist.java @@ -24,8 +24,9 @@ public class DatabaseOptionAllowlist { // allow list. private DatabaseOptionAllowlist() {} - // Only those databse options whose name are included in the allowlist will be processed in + // Only those database options whose name are included in the allowlist will be processed in // export/import pipelines. 
public static final ImmutableList DATABASE_OPTION_ALLOWLIST = - ImmutableList.of("version_retention_period", "opt_in_dataplacement_preview"); + ImmutableList.of( + "version_retention_period", "opt_in_dataplacement_preview", "default_sequence_kind"); } diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScanner.java b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScanner.java index 91dd9c2e39..3d79d217b5 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScanner.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScanner.java @@ -301,6 +301,31 @@ private void listTables(Ddl.Builder builder) { } } + private Long updateCounterForIdentityColumn(Long initialCounter, String qualifiedColumnName) { + Statement sequenceCounterStatement; + switch (dialect) { + case GOOGLE_STANDARD_SQL: + sequenceCounterStatement = + Statement.of("SELECT GET_TABLE_COLUMN_IDENTITY_STATE('" + qualifiedColumnName + "')"); + break; + case POSTGRESQL: + sequenceCounterStatement = + Statement.of( + "SELECT spanner.GET_TABLE_COLUMN_IDENTITY_STATE('" + qualifiedColumnName + "')"); + break; + default: + throw new IllegalArgumentException("Unrecognized dialect: " + dialect); + } + ResultSet resultSetForCounter = context.executeQuery(sequenceCounterStatement); + if (resultSetForCounter.next() && !resultSetForCounter.isNull(0)) { + // Add a buffer to accommodate writes that may happen after import + // is run. Note that this is not 100% failproof, since more writes may + // happen and they will make the sequence advances past the buffer. + return resultSetForCounter.getLong(0) + Sequence.SEQUENCE_COUNTER_BUFFER; + } + return initialCounter; + } + private void listColumns(Ddl.Builder builder) { Statement statement = listColumnsSQL(); @@ -320,11 +345,27 @@ private void listColumns(Ddl.Builder builder) { String generationExpression = resultSet.isNull(7) ? 
"" : resultSet.getString(7); boolean isStored = !resultSet.isNull(8) && resultSet.getString(8).equalsIgnoreCase("YES"); String defaultExpression = resultSet.isNull(9) ? null : resultSet.getString(9); - boolean isHidden = dialect == Dialect.GOOGLE_STANDARD_SQL ? resultSet.getBoolean(10) : false; + boolean isIdentity = resultSet.getString(10).equalsIgnoreCase("YES"); + String identityKind = resultSet.isNull(11) ? null : resultSet.getString(11); + // The start_with_counter value is the initial value and cannot represent the actual state of + // the counter. We need to apply the current counter to the DDL builder, instead of the one + // retrieved from Information Schema. + Long identityStartWithCounter = + resultSet.isNull(12) ? null : Long.valueOf(resultSet.getString(12)); + if (isIdentity) { + identityStartWithCounter = + updateCounterForIdentityColumn( + identityStartWithCounter, tableSchema + "." + columnName); + } + Long identitySkipRangeMin = + resultSet.isNull(13) ? null : Long.valueOf(resultSet.getString(13)); + Long identitySkipRangeMax = + resultSet.isNull(14) ? null : Long.valueOf(resultSet.getString(14)); + boolean isHidden = dialect == Dialect.GOOGLE_STANDARD_SQL ? resultSet.getBoolean(15) : false; boolean isPlacementKey = dialect == Dialect.GOOGLE_STANDARD_SQL - ? resultSet.getBoolean(11) - : resultSet.getBoolean(10); + ? 
resultSet.getBoolean(16) + : resultSet.getBoolean(15); builder .createTable(tableName) @@ -336,6 +377,11 @@ private void listColumns(Ddl.Builder builder) { .generationExpression(generationExpression) .isStored(isStored) .defaultExpression(defaultExpression) + .isIdentityColumn(isIdentity) + .sequenceKind(identityKind) + .counterStartValue(identityStartWithCounter) + .skipRangeMin(identitySkipRangeMin) + .skipRangeMax(identitySkipRangeMax) .isPlacementKey(isPlacementKey) .endColumn() .endTable(); @@ -357,7 +403,8 @@ Statement listColumnsSQL() { "SELECT c.table_schema, c.table_name, c.column_name," + " c.ordinal_position, c.spanner_type, c.is_nullable," + " c.is_generated, c.generation_expression, c.is_stored," - + " c.column_default, c.is_hidden," + + " c.column_default, c.is_identity, c.identity_kind, c.identity_start_with_counter," + + " c.identity_skip_range_min, c.identity_skip_range_max, c.is_hidden," + " pkc.constraint_name IS NOT NULL AS is_placement_key" + " FROM information_schema.columns as c" + " LEFT JOIN placementkeycolumns AS pkc" @@ -372,6 +419,8 @@ Statement listColumnsSQL() { "SELECT c.table_schema, c.table_name, c.column_name," + " c.ordinal_position, c.spanner_type, c.is_nullable," + " c.is_generated, c.generation_expression, c.is_stored, c.column_default," + + " c.is_identity, c.identity_kind, c.identity_start_with_counter," + + " c.identity_skip_range_min, c.identity_skip_range_max," + " pkc.constraint_name IS NOT NULL AS is_placement_key" + " FROM information_schema.columns as c" + " LEFT JOIN placementkeycolumns AS pkc" @@ -1637,6 +1686,15 @@ private void listSequenceOptionsGoogleSQL( options.add(optionName + "=" + optionValue); } } + // If the sequence kind is not specified, assign it to 'default'. 
+ for (var entry : allOptions.entrySet()) { + if (!entry.getValue().toString().contains(Sequence.SEQUENCE_KIND)) { + entry + .getValue() + .add( + Sequence.SEQUENCE_KIND + "=" + GSQL_LITERAL_QUOTE + "default" + GSQL_LITERAL_QUOTE); + } + } // Inject the current counter value to sequences that are in use. for (Map.Entry entry : currentCounters.entrySet()) { @@ -1684,8 +1742,7 @@ private void listSequenceOptionsPostgreSQL( Long skipRangeMax = resultSet.isNull(5) ? null : resultSet.getLong(5); if (sequenceKind == null) { - throw new IllegalArgumentException( - "Sequence kind for sequence " + sequenceName + " cannot be null"); + sequenceKind = "default"; } if (currentCounters.containsKey(sequenceName)) { // The sequence is in use, we need to apply the current counter to diff --git a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Sequence.java b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Sequence.java index fedf940d18..e407522565 100644 --- a/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Sequence.java +++ b/v1/src/main/java/com/google/cloud/teleport/spanner/ddl/Sequence.java @@ -31,6 +31,7 @@ public abstract class Sequence implements Serializable { private static final long serialVersionUID = 1L; public static final long SEQUENCE_COUNTER_BUFFER = 1000L; public static final String SEQUENCE_START_WITH_COUNTER = "start_with_counter"; + public static final String SEQUENCE_KIND = "sequence_kind"; public abstract String name(); @@ -75,11 +76,14 @@ public void prettyPrint(Appendable appendable) throws IOException { } if (dialect() == Dialect.POSTGRESQL) { - if (!sequenceKind().equalsIgnoreCase("bit_reversed_positive")) { - throw new IllegalArgumentException( - String.format("Unrecognized sequence kind: %s.", sequenceKind())); + if (sequenceKind() != null && !sequenceKind().equalsIgnoreCase("default")) { + if (sequenceKind().equalsIgnoreCase("bit_reversed_positive")) { + appendable.append(" BIT_REVERSED_POSITIVE"); + } else { + throw new 
IllegalArgumentException( + String.format("Unrecognized sequence kind: %s.", sequenceKind())); + } } - appendable.append(" BIT_REVERSED_POSITIVE"); if (skipRangeMin() != null && skipRangeMax() != null) { appendable .append(" SKIP RANGE ") diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/BigQueryToTFRecord.java b/v1/src/main/java/com/google/cloud/teleport/templates/BigQueryToTFRecord.java index fd6cde8cbe..0d6d4a3317 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/BigQueryToTFRecord.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/BigQueryToTFRecord.java @@ -333,7 +333,7 @@ public interface Options extends BigQueryReadOptions { groupName = "Target", description = "Output Cloud Storage directory.", helpText = - "The top-level Cloud Storage path prefix to use when writing the training, testing, and validation TFRecord files. Subdirectories for resulting training, testing, and validation TFRecord files are automatically generated from `outputDirectory`. For example, `gs://mybucket/output/train`", + "The top-level Cloud Storage path prefix to use when writing the training, testing, and validation TFRecord files. Subdirectories for resulting training, testing, and validation TFRecord files are automatically generated from `outputDirectory`.", example = "gs://mybucket/output") ValueProvider getOutputDirectory(); @@ -357,7 +357,7 @@ public interface Options extends BigQueryReadOptions { optional = true, description = "Percentage of data to be in the training set ", helpText = - "The percentage of query data allocated to training TFRecord files. The default value is 1, or 100%.") + "The percentage of query data allocated to training TFRecord files. 
The default value is `1`, or `100%`.") @Default.Float(1) ValueProvider getTrainingPercentage(); @@ -368,7 +368,7 @@ public interface Options extends BigQueryReadOptions { optional = true, description = "Percentage of data to be in the testing set ", helpText = - "The percentage of query data allocated to testing TFRecord files. The default value is 0, or 0%.") + "The percentage of query data allocated to testing TFRecord files. The default value is `0`, or `0%`.") @Default.Float(0) ValueProvider getTestingPercentage(); @@ -379,7 +379,7 @@ public interface Options extends BigQueryReadOptions { optional = true, description = "Percentage of data to be in the validation set ", helpText = - "The percentage of query data allocated to validation TFRecord files. The default value is 0, or 0%.") + "The percentage of query data allocated to validation TFRecord files. The default value is `0`, or `0%`.") @Default.Float(0) ValueProvider getValidationPercentage(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/DLPTextToBigQueryStreaming.java b/v1/src/main/java/com/google/cloud/teleport/templates/DLPTextToBigQueryStreaming.java index a9e6d61c05..f1190b991f 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/DLPTextToBigQueryStreaming.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/DLPTextToBigQueryStreaming.java @@ -304,7 +304,7 @@ public interface TokenizePipelineOptions extends DataflowPipelineOptions { }, description = "Cloud DLP deidentify template name", helpText = - "The Sensitive Data Protection de-identification template to use for API requests, specified with the pattern projects//deidentifyTemplates/.", + "The Sensitive Data Protection de-identification template to use for API requests, specified with the pattern `projects//deidentifyTemplates/`.", example = "projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id") @Required @@ -322,7 +322,7 @@ public interface TokenizePipelineOptions extends 
DataflowPipelineOptions { description = "Cloud DLP inspect template name", helpText = "The Sensitive Data Protection inspection template to use for API requests, specified" - + " with the pattern projects//identifyTemplates/.", + + " with the pattern `projects//identifyTemplates/`.", example = "projects/your-project-id/locations/global/inspectTemplates/generated_template_id") ValueProvider getInspectTemplateName(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/PubsubToAvro.java b/v1/src/main/java/com/google/cloud/teleport/templates/PubsubToAvro.java index 2e710bad8d..b1e755707f 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/PubsubToAvro.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/PubsubToAvro.java @@ -97,10 +97,8 @@ public interface Options order = 1, groupName = "Source", description = "Pub/Sub input subscription", - helpText = - "Pub/Sub subscription to read the input from, in the format of" - + " 'projects/your-project-id/subscriptions/your-subscription-name'", - example = "projects/your-project-id/subscriptions/your-subscription-name") + helpText = "The Pub/Sub subscription to read the input from.", + example = "projects//subscriptions/") ValueProvider getInputSubscription(); void setInputSubscription(ValueProvider value); @@ -110,7 +108,7 @@ public interface Options groupName = "Source", description = "Pub/Sub input topic", helpText = - "The Pub/Sub topic to subscribe to for message consumption. The topic name must be in the format projects//topics/.") + "The Pub/Sub topic to subscribe to for message consumption. The topic name must be in the format `projects//topics/`.") ValueProvider getInputTopic(); void setInputTopic(ValueProvider value); @@ -128,7 +126,7 @@ public interface Options groupName = "Target", description = "Output file directory in Cloud Storage", helpText = - "The output directory where output Avro files are archived. Must contain / at the end. 
For example: gs://example-bucket/example-directory/") + "The output directory where output Avro files are archived. Must contain `/` at the end. For example: `gs://example-bucket/example-directory/`") @Required ValueProvider getOutputDirectory(); @@ -161,7 +159,7 @@ public interface Options order = 7, description = "Temporary Avro write directory", helpText = - "The directory for temporary Avro files. Must contain / at the end. For example: gs://example-bucket/example-directory/.") + "The directory for temporary Avro files. Must contain `/` at the end. For example: `gs://example-bucket/example-directory/`.") @Required ValueProvider getAvroTempDirectory(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/SpannerVectorEmbeddingExport.java b/v1/src/main/java/com/google/cloud/teleport/templates/SpannerVectorEmbeddingExport.java index 1c6fed5028..be84cf59f5 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/SpannerVectorEmbeddingExport.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/SpannerVectorEmbeddingExport.java @@ -177,7 +177,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions { }, description = "Timestamp to read stale data from a version in the past.", helpText = - "If set, specifies the time when the database version must be taken. The value is a string in the RFC-3339 date format in Unix epoch time. For example: 1990-12-31T23:59:60Z. The timestamp must be in the past, and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. If not set, a strong bound (https://cloud.google.com/spanner/docs/timestamp-bounds#strong) is used to read the latest data. Defaults to empty.", + "If set, specifies the time when the database version must be taken. The value is a string in the RFC-3339 date format in Unix epoch time. For example: `1990-12-31T23:59:60Z`. 
The timestamp must be in the past, and maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies. If not set, a strong bound (https://cloud.google.com/spanner/docs/timestamp-bounds#strong) is used to read the latest data. Defaults to `empty`.", example = "1990-12-31T23:59:60Z") @Default.String(value = "") ValueProvider getSpannerVersionTime(); @@ -190,7 +190,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions { optional = true, description = "Use independent compute resource (Spanner DataBoost).", helpText = - "When set to true, the template uses Spanner on-demand compute. The export job runs on independent compute resources that don't impact current Spanner workloads. Using this option incurs additional charges in Spanner. For more information, see Spanner Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: false.") + "When set to `true`, the template uses Spanner on-demand compute. The export job runs on independent compute resources that don't impact current Spanner workloads. Using this option incurs additional charges in Spanner. For more information, see Spanner Data Boost overview (https://cloud.google.com/spanner/docs/databoost/databoost-overview). Defaults to: `false`.") @Default.Boolean(false) ValueProvider getSpannerDataBoostEnabled(); @@ -207,7 +207,7 @@ public interface SpannerToVectorEmbeddingJsonOptions extends PipelineOptions { optional = true, description = "Priority for Spanner RPC invocations", helpText = - "The request priority for Spanner calls. The allowed values are HIGH, MEDIUM, and LOW. The default value is MEDIUM.") + "The request priority for Spanner calls. The allowed values are `HIGH`, `MEDIUM`, and `LOW`. 
The default value is `MEDIUM`.") ValueProvider getSpannerPriority(); void setSpannerPriority(ValueProvider value); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/BigQueryConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/BigQueryConverters.java index 6ed1017412..ed0ebbf91b 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/BigQueryConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/BigQueryConverters.java @@ -85,7 +85,7 @@ public interface BigQueryReadOptions extends PipelineOptions { order = 1, description = "Input SQL query", helpText = - "A BigQuery SQL query that extracts data from the source. For example, select * from dataset1.sample_table.") + "A BigQuery SQL query that extracts data from the source. For example, `select * from dataset1.sample_table`.") ValueProvider getReadQuery(); void setReadQuery(ValueProvider value); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/CsvConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/CsvConverters.java index 04739b3f60..cdc001d154 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/CsvConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/CsvConverters.java @@ -75,7 +75,7 @@ public interface CsvPipelineOptions extends PipelineOptions { order = 1, optional = true, description = "Whether input CSV files contain a header record.", - helpText = "Whether headers are included in the CSV file. Defaults to: false.") + helpText = "Whether headers are included in the CSV file. Defaults to: `false`.") @Default.Boolean(false) ValueProvider getContainsHeaders(); @@ -93,7 +93,7 @@ public interface CsvPipelineOptions extends PipelineOptions { @TemplateParameter.Text( order = 3, description = "CSV Format to use for parsing records.", - helpText = "The CSV format according to Apache Commons CSV format. 
Defaults to: Default.") + helpText = "The CSV format according to Apache Commons CSV format. Defaults to: `Default`.") ValueProvider getCsvFormat(); void setCsvFormat(ValueProvider csvFormat); @@ -104,7 +104,7 @@ public interface CsvPipelineOptions extends PipelineOptions { regexes = {"^(US-ASCII|ISO-8859-1|UTF-8|UTF-16)$"}, description = "CSV file encoding", helpText = - "The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16.") + "The CSV file character encoding format. Allowed Values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`.") @Default.String("UTF-8") ValueProvider getCsvFileEncoding(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/DatastoreConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/DatastoreConverters.java index 1b7777fe63..9ac8ac7104 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/DatastoreConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/DatastoreConverters.java @@ -221,7 +221,7 @@ public interface DatastoreWriteOptions extends PipelineOptions { optional = true, description = "Expected number of workers", helpText = - "Hint for the expected number of workers in the Datastore ramp-up throttling step. Default is `500`.") + "Hint for the expected number of workers in the Datastore ramp-up throttling step. Defaults to `500`.") @Default.Integer(500) @Hidden @Deprecated @@ -268,7 +268,7 @@ public interface DatastoreWriteOptions extends PipelineOptions { description = "Expected number of workers", helpText = "Hint for the expected number of workers in the Firestore ramp-up throttling step." - + " Default is 500.") + + " The default value is `500`.") // @Default can not be used here as it will make it use Firestore on a Datastore template. 
ValueProvider getFirestoreHintNumWorkers(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/PubsubConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/PubsubConverters.java index a41ca848b8..92ae2610ad 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/PubsubConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/PubsubConverters.java @@ -58,7 +58,7 @@ public interface PubsubWriteDeadletterTopicOptions extends PipelineOptions { order = 1, description = "Output deadletter Pub/Sub topic", helpText = - "The Pub/Sub topic to forward undeliverable messages to. For example, projects//topics/.") + "The Pub/Sub topic to forward undeliverable messages to. For example, `projects//topics/`.") @Validation.Required ValueProvider getOutputDeadletterTopic(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/SpannerConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/SpannerConverters.java index e219b4d0cf..a353b323b3 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/SpannerConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/SpannerConverters.java @@ -152,7 +152,7 @@ public interface SpannerReadOptions extends PipelineOptions { description = "Snapshot time", helpText = "The timestamp that corresponds to the version of the Spanner database that you want to read from." - + " The timestamp must be specified in the RFC 3339 (https://tools.ietf.org/html/rfc3339) UTC \"Zulu\" format." + + " The timestamp must be specified in the RFC 3339 (https://tools.ietf.org/html/rfc3339) UTC Zulu Time format." 
+ " The timestamp must be in the past and" + " maximum timestamp staleness (https://cloud.google.com/spanner/docs/timestamp-bounds#maximum_timestamp_staleness) applies.", example = "1990-12-31T23:59:60Z") diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/SplunkConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/SplunkConverters.java index 46e1a2eb27..1f3e4bcc13 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/SplunkConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/SplunkConverters.java @@ -117,7 +117,7 @@ public interface SplunkOptions extends PipelineOptions { optional = true, description = "Batch size for sending multiple events to Splunk HEC.", helpText = - "The batch size for sending multiple events to Splunk. Defaults to 1 (no batching).") + "The batch size for sending multiple events to Splunk. Defaults to `1` (no batching).") ValueProvider getBatchCount(); void setBatchCount(ValueProvider batchCount); @@ -127,7 +127,7 @@ public interface SplunkOptions extends PipelineOptions { optional = true, description = "Disable SSL certificate validation.", helpText = - "Disable SSL certificate validation. Default false (validation enabled). If true, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored.") + "Disable SSL certificate validation. Default `false` (validation enabled). If `true`, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored.") ValueProvider getDisableCertificateValidation(); void setDisableCertificateValidation(ValueProvider disableCertificateValidation); @@ -136,7 +136,7 @@ public interface SplunkOptions extends PipelineOptions { order = 5, optional = true, description = "Maximum number of parallel requests.", - helpText = "The maximum number of parallel requests. 
Defaults to 1 (no parallelism).") + helpText = "The maximum number of parallel requests. Defaults to `1` (no parallelism).") ValueProvider getParallelism(); void setParallelism(ValueProvider parallelism); @@ -146,7 +146,7 @@ public interface SplunkOptions extends PipelineOptions { optional = true, description = "Include full Pub/Sub message in the payload.", helpText = - "Include the full Pub/Sub message in the payload. Default false (only the data element is included in the payload).") + "Include the full Pub/Sub message in the payload. Default `false` (only the data element is included in the payload).") ValueProvider getIncludePubsubMessage(); void setIncludePubsubMessage(ValueProvider includePubsubMessage); @@ -156,7 +156,7 @@ public interface SplunkOptions extends PipelineOptions { optional = true, description = "Google Cloud KMS encryption key for the token", helpText = - "The Cloud KMS key to use to decrypt the HEC token string. This parameter must be provided when tokenSource is set to KMS. If the Cloud KMS key is provided, the HEC token string `must` be passed in encrypted.", + "The Cloud KMS key to use to decrypt the HEC token string. This parameter must be provided when tokenSource is set to KMS. If the Cloud KMS key is provided, the HEC token string must be passed in encrypted.", example = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name") ValueProvider getTokenKMSEncryptionKey(); @@ -171,7 +171,7 @@ public interface SplunkOptions extends PipelineOptions { }, description = "Google Cloud Secret Manager ID.", helpText = - "The Secret Manager secret ID for the token. This parameter must provided when the tokenSource is set to SECRET_MANAGER.", + "The Secret Manager secret ID for the token. 
This parameter must be provided when the tokenSource is set to `SECRET_MANAGER`.", example = "projects/your-project-id/secrets/your-secret/versions/your-secret-version") ValueProvider getTokenSecretId(); diff --git a/v1/src/main/java/com/google/cloud/teleport/templates/common/TextConverters.java b/v1/src/main/java/com/google/cloud/teleport/templates/common/TextConverters.java index 16ece3faf6..7c7de0188c 100644 --- a/v1/src/main/java/com/google/cloud/teleport/templates/common/TextConverters.java +++ b/v1/src/main/java/com/google/cloud/teleport/templates/common/TextConverters.java @@ -61,7 +61,7 @@ public interface FilesystemWindowedWriteOptions extends PipelineOptions { description = "Output file directory in Cloud Storage", helpText = "The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters.", - example = "gs://your-bucket/your-path") + example = "gs://your-bucket/your-path/") @Validation.Required ValueProvider getOutputDirectory(); diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverterTest.java b/v1/src/test/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverterTest.java index 4be7a46a48..0100384039 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverterTest.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/AvroSchemaToDdlConverterTest.java @@ -83,6 +83,23 @@ public void simple() { + " \"generationExpression\" : \"CONCAT(first_name, ' ', last_name)\"," + " \"stored\" : \"true\"" + " }, {" + + " \"name\" : \"identity_column\"," + + " \"type\" : [ \"null\", \"long\" ]," + + " \"sqlType\" : \"INT64\"," + + " \"identityColumn\" : \"true\"," + + " \"sequenceKind\" : \"bit_reversed_positive\"," + + " \"skipRangeMin\" : \"2000\"," + + " \"skipRangeMax\" : \"3000\"," + + " \"counterStartValue\" : \"1000\"" + + " }, {" + + " \"name\" : \"identity_column_no_kind\"," + + " \"type\" : [ 
\"null\", \"long\" ]," + + " \"sqlType\" : \"INT64\"," + + " \"identityColumn\" : \"true\"," + + " \"skipRangeMin\" : \"2000\"," + + " \"skipRangeMax\" : \"3000\"," + + " \"counterStartValue\" : \"1000\"" + + " }, {" + " \"name\" : \"numeric\"," + " \"type\" : [\"null\", {\"type\":\"bytes\",\"logicalType\":\"decimal\"}]," + " \"sqlType\" : \"NUMERIC\"" @@ -231,6 +248,10 @@ public void simple() { + " `first_name` STRING(10) DEFAULT ('John')," + " `last_name` STRING(MAX)," + " `full_name` STRING(MAX) AS (CONCAT(first_name, ' ', last_name)) STORED," + + " `identity_column` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + "BIT_REVERSED_POSITIVE SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + + " `identity_column_no_kind` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + "SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + " `numeric` NUMERIC," + " `numeric2` NUMERIC," + " `notNumeric` BYTES(MAX)," @@ -310,6 +331,23 @@ public void pgSimple() { + " \"generationExpression\" : \"CONCAT(first_name, ' ', last_name)\"," + " \"stored\" : \"true\"" + " }, {" + + " \"name\" : \"identity_column\"," + + " \"type\" : [ \"null\", \"long\" ]," + + " \"sqlType\" : \"bigint\"," + + " \"identityColumn\" : \"true\"," + + " \"sequenceKind\" : \"bit_reversed_positive\"," + + " \"skipRangeMin\" : \"2000\"," + + " \"skipRangeMax\" : \"3000\"," + + " \"counterStartValue\" : \"1000\"" + + " }, {" + + " \"name\" : \"identity_column_no_kind\"," + + " \"type\" : [ \"null\", \"long\" ]," + + " \"sqlType\" : \"bigint\"," + + " \"identityColumn\" : \"true\"," + + " \"skipRangeMin\" : \"2000\"," + + " \"skipRangeMax\" : \"3000\"," + + " \"counterStartValue\" : \"1000\"" + + " }, {" + " \"name\" : \"numeric\"," + " \"type\" : [\"null\", {\"type\":\"bytes\",\"logicalType\":\"decimal\"}]," + " \"sqlType\" : \"numeric\"" @@ -422,6 +460,10 @@ public void pgSimple() { + " \"last_name\" character varying," + " \"full_name\" character varying GENERATED ALWAYS AS" + " (CONCAT(first_name, ' ', last_name)) STORED," 
+ + " \"identity_column\" bigint GENERATED BY DEFAULT AS IDENTITY (" + + "BIT_REVERSED_POSITIVE SKIP RANGE 2000 3000 START COUNTER WITH 1000)," + + " \"identity_column_no_kind\" bigint GENERATED BY DEFAULT AS IDENTITY (" + + "SKIP RANGE 2000 3000 START COUNTER WITH 1000)," + " \"numeric\" numeric," + " \"numeric2\" numeric," + " \"notNumeric\" bytea," @@ -938,15 +980,26 @@ public void sequences() { + " \"googleFormatVersion\" : \"booleans\"," + " \"sequenceOption_0\" : \"sequence_kind=\\\"bit_reversed_positive\\\"\"" + "}"; + String avroString4 = + "{" + + " \"type\" : \"record\"," + + " \"name\" : \"Sequence4\"," + + " \"fields\" : []," + + " \"namespace\" : \"spannertest\"," + + " \"googleStorage\" : \"CloudSpanner\"," + + " \"googleFormatVersion\" : \"booleans\"," + + " \"sequenceOption_0\" : \"sequence_kind=default\"" + + "}"; Collection schemas = new ArrayList<>(); Schema.Parser parser = new Schema.Parser(); schemas.add(parser.parse(avroString1)); schemas.add(parser.parse(avroString2)); schemas.add(parser.parse(avroString3)); + schemas.add(parser.parse(avroString4)); AvroSchemaToDdlConverter converter = new AvroSchemaToDdlConverter(); Ddl ddl = converter.toDdl(schemas); - assertThat(ddl.sequences(), hasSize(3)); + assertThat(ddl.sequences(), hasSize(4)); assertThat( ddl.prettyPrint(), equalToCompressingWhiteSpace( @@ -957,7 +1010,8 @@ public void sequences() { + "OPTIONS (sequence_kind=\"bit_reversed_positive\", " + "start_with_counter=9999)\n" + "CREATE SEQUENCE `Sequence3`\n\t" - + "OPTIONS (sequence_kind=\"bit_reversed_positive\")")); + + "OPTIONS (sequence_kind=\"bit_reversed_positive\")\n" + + "CREATE SEQUENCE `Sequence4`")); } @Test @@ -996,16 +1050,27 @@ public void pgSequences() { + " \"googleFormatVersion\" : \"booleans\"," + " \"sequenceKind\" : \"bit_reversed_positive\"" + "}"; + String avroString4 = + "{" + + " \"type\" : \"record\"," + + " \"name\" : \"Sequence4\"," + + " \"fields\" : []," + + " \"namespace\" : \"spannertest\"," + + " 
\"googleStorage\" : \"CloudSpanner\"," + + " \"googleFormatVersion\" : \"booleans\"," + + " \"sequenceKind\" : \"default\"" + + "}"; Collection schemas = new ArrayList<>(); Schema.Parser parser = new Schema.Parser(); schemas.add(parser.parse(avroString1)); schemas.add(parser.parse(avroString2)); schemas.add(parser.parse(avroString3)); + schemas.add(parser.parse(avroString4)); AvroSchemaToDdlConverter converter = new AvroSchemaToDdlConverter(Dialect.POSTGRESQL); Ddl ddl = converter.toDdl(schemas); assertEquals(ddl.dialect(), Dialect.POSTGRESQL); - assertThat(ddl.sequences(), hasSize(3)); + assertThat(ddl.sequences(), hasSize(4)); assertThat( ddl.prettyPrint(), equalToCompressingWhiteSpace( @@ -1013,7 +1078,8 @@ public void pgSequences() { + "SKIP RANGE 1 1000 START COUNTER WITH 50" + "\nCREATE SEQUENCE \"Sequence2\" BIT_REVERSED_POSITIVE " + "START COUNTER WITH 9999" - + "\nCREATE SEQUENCE \"Sequence3\" BIT_REVERSED_POSITIVE")); + + "\nCREATE SEQUENCE \"Sequence3\" BIT_REVERSED_POSITIVE" + + "\nCREATE SEQUENCE \"Sequence4\"")); } @Test diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/CopyDbTest.java b/v1/src/test/java/com/google/cloud/teleport/spanner/CopyDbTest.java index 4f054b6ca0..2d97d6a6ab 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/CopyDbTest.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/CopyDbTest.java @@ -905,10 +905,112 @@ public void pgChangeStreams() throws Exception { runTest(Dialect.POSTGRESQL); } + @Test + public void identityColumn() throws Exception { + // spotless:off + Ddl.Builder ddlBuilder = Ddl.builder(); + List dbOptionList = new ArrayList<>(); + dbOptionList.add( + Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue("\"bit_reversed_positive\"") + .build()); + ddlBuilder.mergeDatabaseOptions(dbOptionList); + Ddl ddl = ddlBuilder + .createTable("IdentityTable") + .column("id") + .int64() + .isIdentityColumn(true) + 
.sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("non_key_column") + .int64() + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("no_sequence_kind_column") + .int64() + .isIdentityColumn(true) + .sequenceKind("default") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("value").int64().endColumn() + .primaryKey().asc("id").end() + .endTable() + .build(); + // spotless:on + + createAndPopulate(ddl, 10); + runTest(); + } + + @Test + public void pgIdentityColumn() throws Exception { + // spotless:off + Ddl.Builder ddlBuilder = Ddl.builder(Dialect.POSTGRESQL); + List dbOptionList = new ArrayList<>(); + dbOptionList.add( + Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue("\"bit_reversed_positive\"") + .build()); + ddlBuilder.mergeDatabaseOptions(dbOptionList); + Ddl ddl = ddlBuilder + .createTable("IdentityTable") + .column("id") + .int64() + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("non_key_column") + .int64() + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("no_sequence_kind_column") + .int64() + .isIdentityColumn(true) + .sequenceKind("default") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("value").int64().endColumn() + .primaryKey().asc("id").end() + .endTable() + .build(); + // spotless:on + + createAndPopulate(ddl, 10); + runTest(Dialect.POSTGRESQL); + } + @Test public void sequences() throws Exception { + Ddl.Builder ddlBuilder = Ddl.builder(); + List dbOptionList = new 
ArrayList<>(); + dbOptionList.add( + Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue("\"bit_reversed_positive\"") + .build()); + ddlBuilder.mergeDatabaseOptions(dbOptionList); Ddl ddl = - Ddl.builder() + ddlBuilder .createSequence("Sequence1") .options( ImmutableList.of( @@ -925,6 +1027,14 @@ public void sequences() throws Exception { .createSequence("Sequence3") .options(ImmutableList.of("sequence_kind=\"bit_reversed_positive\"")) .endSequence() + .createSequence("Sequence4") + .options( + ImmutableList.of( + "sequence_kind=\"default\"", + "skip_range_min=0", + "skip_range_max=1000", + "start_with_counter=50")) + .endSequence() .createTable("UsersWithSequenceId") .column("id") .int64() @@ -946,8 +1056,16 @@ public void sequences() throws Exception { @Test public void pgSequences() throws Exception { + Ddl.Builder ddlBuilder = Ddl.builder(Dialect.POSTGRESQL); + List dbOptionList = new ArrayList<>(); + dbOptionList.add( + Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue("\"bit_reversed_positive\"") + .build()); + ddlBuilder.mergeDatabaseOptions(dbOptionList); Ddl ddl = - Ddl.builder(Dialect.POSTGRESQL) + ddlBuilder .createSequence("PGSequence1") .sequenceKind("bit_reversed_positive") .counterStartValue(Long.valueOf(50)) @@ -961,6 +1079,12 @@ public void pgSequences() throws Exception { .createSequence("PGSequence3") .sequenceKind("bit_reversed_positive") .endSequence() + .createSequence("PGSequence4") + .sequenceKind("default") + .counterStartValue(Long.valueOf(50)) + .skipRangeMin(Long.valueOf(0)) + .skipRangeMax(Long.valueOf(1000)) + .endSequence() .createTable("PGUsersWithSequenceId") .column("id") .pgInt8() diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverterTest.java b/v1/src/test/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverterTest.java index acefcf177a..7a333f1d6f 100644 --- 
a/v1/src/test/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverterTest.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/DdlToAvroSchemaConverterTest.java @@ -20,6 +20,7 @@ import static com.google.cloud.teleport.spanner.AvroUtil.GOOGLE_FORMAT_VERSION; import static com.google.cloud.teleport.spanner.AvroUtil.GOOGLE_STORAGE; import static com.google.cloud.teleport.spanner.AvroUtil.HIDDEN; +import static com.google.cloud.teleport.spanner.AvroUtil.IDENTITY_COLUMN; import static com.google.cloud.teleport.spanner.AvroUtil.INPUT; import static com.google.cloud.teleport.spanner.AvroUtil.NOT_NULL; import static com.google.cloud.teleport.spanner.AvroUtil.OUTPUT; @@ -143,6 +144,21 @@ public void simple() { .max() .isHidden(true) .endColumn() + .column("identity_column") + .type(Type.int64()) + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("identity_column_no_kind") + .type(Type.int64()) + .isIdentityColumn(true) + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() .primaryKey() .asc("id") .asc("gen_id") @@ -183,7 +199,7 @@ public void simple() { List fields = avroSchema.getFields(); - assertThat(fields, hasSize(8)); + assertThat(fields, hasSize(10)); assertThat(fields.get(0).name(), equalTo("id")); // Not null @@ -252,6 +268,26 @@ public void simple() { assertThat(fields.get(7).getProp(STORED), equalTo(null)); assertThat(fields.get(7).getProp(HIDDEN), equalTo("true")); + assertThat(fields.get(8).name(), equalTo("identity_column")); + assertThat(fields.get(8).schema(), equalTo(nullableUnion(Schema.Type.LONG))); + assertThat(fields.get(8).getProp(SQL_TYPE), equalTo("INT64")); + assertThat(fields.get(8).getProp(NOT_NULL), equalTo(null)); + assertThat(fields.get(8).getProp(IDENTITY_COLUMN), equalTo("true")); + assertThat(fields.get(8).getProp(SPANNER_SEQUENCE_KIND), equalTo("bit_reversed_positive")); + 
assertThat(fields.get(8).getProp(SPANNER_SEQUENCE_COUNTER_START), equalTo("1000")); + assertThat(fields.get(8).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN), equalTo("2000")); + assertThat(fields.get(8).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MAX), equalTo("3000")); + + assertThat(fields.get(9).name(), equalTo("identity_column_no_kind")); + assertThat(fields.get(9).schema(), equalTo(nullableUnion(Schema.Type.LONG))); + assertThat(fields.get(9).getProp(SQL_TYPE), equalTo("INT64")); + assertThat(fields.get(9).getProp(NOT_NULL), equalTo(null)); + assertThat(fields.get(9).getProp(IDENTITY_COLUMN), equalTo("true")); + assertThat(fields.get(9).getProp(SPANNER_SEQUENCE_KIND), equalTo(null)); + assertThat(fields.get(9).getProp(SPANNER_SEQUENCE_COUNTER_START), equalTo("1000")); + assertThat(fields.get(9).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN), equalTo("2000")); + assertThat(fields.get(9).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MAX), equalTo("3000")); + // spanner pk assertThat(avroSchema.getProp(SPANNER_PRIMARY_KEY + "_0"), equalTo("`id` ASC")); assertThat(avroSchema.getProp(SPANNER_PRIMARY_KEY + "_1"), equalTo("`gen_id` ASC")); @@ -326,6 +362,21 @@ public void pgSimple() { .generatedAs("MOD(id+1, 64)") .stored() .endColumn() + .column("identity_column") + .type(Type.int64()) + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("identity_column_no_kind") + .type(Type.int64()) + .isIdentityColumn(true) + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() .primaryKey() .asc("id") .asc("gen_id") @@ -356,7 +407,7 @@ public void pgSimple() { List fields = avroSchema.getFields(); - assertThat(fields, hasSize(5)); + assertThat(fields, hasSize(7)); assertThat(fields.get(0).name(), equalTo("id")); // Not null @@ -401,6 +452,26 @@ public void pgSimple() { assertThat(fields.get(4).getProp(STORED), equalTo("true")); 
assertThat(fields.get(4).getProp(DEFAULT_EXPRESSION), equalTo(null)); + assertThat(fields.get(5).name(), equalTo("identity_column")); + assertThat(fields.get(5).schema(), equalTo(nullableUnion(Schema.Type.LONG))); + assertThat(fields.get(5).getProp(SQL_TYPE), equalTo("INT64")); + assertThat(fields.get(5).getProp(NOT_NULL), equalTo(null)); + assertThat(fields.get(5).getProp(IDENTITY_COLUMN), equalTo("true")); + assertThat(fields.get(5).getProp(SPANNER_SEQUENCE_KIND), equalTo("bit_reversed_positive")); + assertThat(fields.get(5).getProp(SPANNER_SEQUENCE_COUNTER_START), equalTo("1000")); + assertThat(fields.get(5).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN), equalTo("2000")); + assertThat(fields.get(5).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MAX), equalTo("3000")); + + assertThat(fields.get(6).name(), equalTo("identity_column_no_kind")); + assertThat(fields.get(6).schema(), equalTo(nullableUnion(Schema.Type.LONG))); + assertThat(fields.get(6).getProp(SQL_TYPE), equalTo("INT64")); + assertThat(fields.get(6).getProp(NOT_NULL), equalTo(null)); + assertThat(fields.get(6).getProp(IDENTITY_COLUMN), equalTo("true")); + assertThat(fields.get(6).getProp(SPANNER_SEQUENCE_KIND), equalTo(null)); + assertThat(fields.get(6).getProp(SPANNER_SEQUENCE_COUNTER_START), equalTo("1000")); + assertThat(fields.get(6).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MIN), equalTo("2000")); + assertThat(fields.get(6).getProp(SPANNER_SEQUENCE_SKIP_RANGE_MAX), equalTo("3000")); + // spanner pk assertThat(avroSchema.getProp(SPANNER_PRIMARY_KEY + "_0"), equalTo("\"id\" ASC")); assertThat(avroSchema.getProp(SPANNER_PRIMARY_KEY + "_1"), equalTo("\"gen_id\" ASC")); @@ -1557,10 +1628,13 @@ public void sequences() { .createSequence("Sequence3") .options(ImmutableList.of("sequence_kind=\"bit_reversed_positive\"")) .endSequence() + .createSequence("Sequence4") + .options(ImmutableList.of("sequence_kind=\"default\"")) + .endSequence() .build(); Collection result = converter.convert(ddl); - assertThat(result, hasSize(3)); + 
assertThat(result, hasSize(4)); for (Schema s : result) { assertThat(s.getNamespace(), equalTo("spannertest")); assertThat(s.getProp("googleFormatVersion"), equalTo("booleans")); @@ -1590,6 +1664,10 @@ public void sequences() { assertThat( avroSchema3.getProp("sequenceOption_0"), equalTo("sequence_kind=\"bit_reversed_positive\"")); + + Schema avroSchema4 = it.next(); + assertThat(avroSchema4.getName(), equalTo("Sequence4")); + assertThat(avroSchema4.getProp("sequenceOption_0"), equalTo("sequence_kind=\"default\"")); } @Test @@ -1611,10 +1689,13 @@ public void pgSequences() { .createSequence("PGSequence3") .sequenceKind("bit_reversed_positive") .endSequence() + .createSequence("PGSequence4") + .sequenceKind("default") + .endSequence() .build(); Collection result = converter.convert(ddl); - assertThat(result, hasSize(3)); + assertThat(result, hasSize(4)); for (Schema s : result) { assertThat(s.getNamespace(), equalTo("spannertest")); assertThat(s.getProp("googleFormatVersion"), equalTo("booleans")); @@ -1638,6 +1719,10 @@ public void pgSequences() { Schema avroSchema3 = it.next(); assertThat(avroSchema3.getName(), equalTo("PGSequence3")); assertThat(avroSchema3.getProp(SPANNER_SEQUENCE_KIND), equalTo("bit_reversed_positive")); + + Schema avroSchema4 = it.next(); + assertThat(avroSchema4.getName(), equalTo("PGSequence4")); + assertThat(avroSchema4.getProp(SPANNER_SEQUENCE_KIND), equalTo("default")); } @Test diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ExportPipelineIT.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ExportPipelineIT.java index bdeb29a5fd..3ae9bbdb83 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ExportPipelineIT.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ExportPipelineIT.java @@ -23,6 +23,7 @@ import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Mutation; +import com.google.cloud.teleport.metadata.SpannerStagingTest; import 
com.google.cloud.teleport.metadata.TemplateIntegrationTest; import java.io.IOException; import java.util.ArrayList; @@ -47,7 +48,7 @@ import org.junit.runners.Parameterized; /** Integration test for {@link ExportPipeline Spanner to GCS Avro} template. */ -@Category(TemplateIntegrationTest.class) +@Category({TemplateIntegrationTest.class, SpannerStagingTest.class}) @TemplateIntegrationTest(ExportPipeline.class) @RunWith(Parameterized.class) public class ExportPipelineIT extends SpannerTemplateITBase { diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ImportFromAvroTest.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ImportFromAvroTest.java index 1064dd846c..adcadd320a 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ImportFromAvroTest.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ImportFromAvroTest.java @@ -1040,6 +1040,61 @@ public void pgGeneratedColumns() throws Exception { Dialect.POSTGRESQL); } + @Test + public void identityColumns() throws Exception { + SchemaBuilder.RecordBuilder record = SchemaBuilder.record("identityColumns"); + SchemaBuilder.FieldAssembler fieldAssembler = record.fields(); + + fieldAssembler + // Primary key. + .requiredLong("id") + // Integer columns. 
+ .optionalLong("optional_identity") + .optionalLong("value"); + Schema schema = fieldAssembler.endRecord(); + String spannerSchema = + "CREATE TABLE `AvroTable` (" + + "`id` INT64 NOT NULL GENERATED BY DEFAULT AS IDENTITY (" + + " SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + + "`optional_identity` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + " BIT_REVERSED_POSITIVE SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + + "`value` INT64," + + ") PRIMARY KEY (`id`)"; + + runTest( + schema, + spannerSchema, + Arrays.asList(new GenericRecordBuilder(schema).set("value", 1L).build())); + } + + @Test + public void pgIdentityColumns() throws Exception { + SchemaBuilder.RecordBuilder record = SchemaBuilder.record("identityColumns"); + SchemaBuilder.FieldAssembler fieldAssembler = record.fields(); + + fieldAssembler + // Primary key. + .requiredLong("id") + // Integer columns. + .optionalLong("optional_identity") + .optionalLong("value"); + Schema schema = fieldAssembler.endRecord(); + String spannerSchema = + "CREATE TABLE \"AvroTable\" (" + + "\"id\" bigint NOT NULL GENERATED BY DEFAULT AS IDENTITY (" + + " SKIP RANGE 2000 3000 START COUNTER WITH 1000)," + + "\"optional_identity\" bigint GENERATED BY DEFAULT AS IDENTITY (" + + " BIT_REVERSED_POSITIVE SKIP RANGE 2000 3000 START COUNTER WITH 1000)," + + "\"value\" bigint," + + "PRIMARY KEY (\"id\")" + + ")"; + + runTest( + schema, + spannerSchema, + Arrays.asList(new GenericRecordBuilder(schema).set("value", 1L).build())); + } + @Test public void defaultColumns() throws Exception { SchemaBuilder.RecordBuilder record = SchemaBuilder.record("defaultColumns"); @@ -1648,6 +1703,13 @@ public void sequences() throws Exception { "CREATE SEQUENCE `Sequence2`" + " OPTIONS (sequence_kind=\"bit_reversed_positive\", " + " skip_range_min=0, skip_range_max=1000, start_with_counter=50)"; + String sequence2Def = + "CREATE SEQUENCE `Sequence3`" + + " OPTIONS (skip_range_min=0, skip_range_max=1000, start_with_counter=50)"; + String 
sequence3Def = "CREATE SEQUENCE `Sequence4` SKIP RANGE 0, 1000 START COUNTER WITH 50"; + String sequence4Def = + "CREATE SEQUENCE `Sequence5` BIT_REVERSED_POSITIVE " + + "SKIP RANGE 0, 1000 START COUNTER WITH 50"; String tableDef = "CREATE TABLE `T` (" + "`id` INT64 NOT NULL DEFAULT (GET_NEXT_SEQUENCE_VALUE(SEQUENCE Sequence2))," @@ -1655,7 +1717,8 @@ public void sequences() throws Exception { + "`c2` INT64," + ") PRIMARY KEY (`id`)"; - SPANNER_SERVER.createDatabase(dbName, Arrays.asList(sequenceDef, tableDef)); + SPANNER_SERVER.createDatabase( + dbName, Arrays.asList(sequenceDef, sequence2Def, sequence3Def, sequence4Def, tableDef)); // Run the import pipeline. importPipeline.apply( @@ -1689,6 +1752,13 @@ public void sequences() throws Exception { + " skip_range_max=1000," + " skip_range_min=0," + " start_with_counter=50)" + + "\nCREATE SEQUENCE `Sequence3`\n\tOPTIONS " + + "(skip_range_max=1000," + + " skip_range_min=0," + + " start_with_counter=50)" + + "\nCREATE SEQUENCE `Sequence4` SKIP RANGE 0, 1000 START COUNTER WITH 50" + + "\nCREATE SEQUENCE `Sequence5` BIT_REVERSED_POSITIVE" + + " SKIP RANGE 0, 1000 START COUNTER WITH 50" + "CREATE TABLE `T` (\n\t" + "`id` INT64 NOT NULL " + "DEFAULT (GET_NEXT_SEQUENCE_VALUE(SEQUENCE Sequence2)),\n\t" @@ -1740,13 +1810,15 @@ public void pgSequences() throws Exception { String sequenceDef = "CREATE SEQUENCE \"PGSequence2\" BIT_REVERSED_POSITIVE" + " SKIP RANGE 0 1000 START COUNTER WITH 50"; + String sequence2Def = + "CREATE SEQUENCE \"PGSequence3\"" + " SKIP RANGE 0 1000 START COUNTER WITH 50"; String tableDef = "CREATE TABLE \"T\" (" + "\"id\" bigint NOT NULL DEFAULT nextval('\"PGSequence2\"')," + "\"c\" bigint," + "PRIMARY KEY (\"id\"))"; - SPANNER_SERVER.createPgDatabase(dbName, Arrays.asList(sequenceDef, tableDef)); + SPANNER_SERVER.createPgDatabase(dbName, Arrays.asList(sequenceDef, sequence2Def, tableDef)); // Run the import pipeline. 
importPipeline.apply( @@ -1774,6 +1846,8 @@ public void pgSequences() throws Exception { + " SKIP RANGE 10 10000 START COUNTER WITH 99" + "\nCREATE SEQUENCE \"PGSequence2\" BIT_REVERSED_POSITIVE" + " SKIP RANGE 0 1000 START COUNTER WITH 50" + + "\nCREATE SEQUENCE \"PGSequence3\"" + + " SKIP RANGE 0 1000 START COUNTER WITH 50" + "CREATE TABLE \"T\" (" + "\n\t\"id\" bigint NOT NULL" + " DEFAULT nextval('\"PGSequence2\"'::text),\n\t" @@ -2111,6 +2185,14 @@ private void runTest( .setOptionName("version_retention_period") .setOptionValue(dialect == Dialect.GOOGLE_STANDARD_SQL ? "\"4d\"" : "'4d'") .build()) + .addDatabaseOptions( + ExportProtos.Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue( + dialect == Dialect.GOOGLE_STANDARD_SQL + ? "\"bit_reversed_positive\"" + : "'bit_reversed_positive'") + .build()) .setDialect(ProtoDialect.valueOf(dialect.name())) .build(); JsonFormat.printer().print(exportProto); diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ImportPipelineIT.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ImportPipelineIT.java index 87d45a2dcb..8a0a75778d 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ImportPipelineIT.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ImportPipelineIT.java @@ -22,6 +22,7 @@ import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Struct; +import com.google.cloud.teleport.metadata.SpannerStagingTest; import com.google.cloud.teleport.metadata.TemplateIntegrationTest; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -44,7 +45,7 @@ import org.junit.runners.Parameterized; /** Integration test for {@link ImportPipeline} classic template. 
*/ -@Category(TemplateIntegrationTest.class) +@Category({TemplateIntegrationTest.class, SpannerStagingTest.class}) @TemplateIntegrationTest(ImportPipeline.class) @RunWith(Parameterized.class) public class ImportPipelineIT extends SpannerTemplateITBase { diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/DdlTest.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/DdlTest.java index 78de2f8b2d..fc46dd17c3 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/DdlTest.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/DdlTest.java @@ -105,6 +105,21 @@ public void simple() { .max() .isHidden(true) .endColumn() + .column("identity_column") + .type(Type.int64()) + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("identity_column_no_kind") + .type(Type.int64()) + .isIdentityColumn(true) + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() .primaryKey() .asc("id") .asc("gen_id") @@ -133,6 +148,12 @@ public void simple() { .setOptionValue("4d") .setOptionType("STRING") .build()) + .addDatabaseOptions( + Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue("bit_reversed_positive") + .setOptionType("STRING") + .build()) .build(); builder.mergeDatabaseOptions(export.getDatabaseOptionsList()); Ddl ddl = builder.build(); @@ -140,6 +161,7 @@ public void simple() { ddl.prettyPrint(), equalToCompressingWhiteSpace( "ALTER DATABASE `%db_name%` SET OPTIONS ( version_retention_period = \"4d\" )" + + " ALTER DATABASE `%db_name%` SET OPTIONS ( default_sequence_kind = \"bit_reversed_positive\" )" + " CREATE TABLE `Users` (" + " `id` INT64 NOT NULL," + " `gen_id` INT64 NOT NULL AS (MOD(id+1, 64)) STORED," @@ -147,6 +169,10 @@ public void simple() { + " `last_name` STRING(MAX)," + " `full_name` STRING(MAX) AS (CONCAT(first_name, ' ', last_name)) 
STORED," + " `HiddenColumn` STRING(MAX) HIDDEN," + + " `identity_column` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + "BIT_REVERSED_POSITIVE SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + + " `identity_column_no_kind` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + "SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + " CONSTRAINT `ck` CHECK (`first_name` != `last_name`)," + " ) PRIMARY KEY (`id` ASC, `gen_id` ASC)" + " CREATE INDEX `UsersByFirstName` ON `Users` (`first_name`)" @@ -162,7 +188,7 @@ public void simple() { + " FOREIGN KEY (`last_name`) REFERENCES " + "`AllowedNames` (`last_name`) ENFORCED")); List statements = ddl.statements(); - assertEquals(8, statements.size()); + assertEquals(9, statements.size()); assertThat( statements.get(0), equalToCompressingWhiteSpace( @@ -173,6 +199,10 @@ public void simple() { + " `last_name` STRING(MAX)," + " `full_name` STRING(MAX) AS (CONCAT(first_name, ' ', last_name)) STORED," + " `HiddenColumn` STRING(MAX) HIDDEN," + + " `identity_column` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + "BIT_REVERSED_POSITIVE SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + + " `identity_column_no_kind` INT64 GENERATED BY DEFAULT AS IDENTITY (" + + "SKIP RANGE 2000, 3000 START COUNTER WITH 1000)," + " CONSTRAINT `ck` CHECK (`first_name` != `last_name`)," + " ) PRIMARY KEY (`id` ASC, `gen_id` ASC)")); assertThat( @@ -209,6 +239,10 @@ public void simple() { statements.get(7), equalToCompressingWhiteSpace( "ALTER DATABASE `%db_name%` SET OPTIONS ( version_retention_period = \"4d\" )")); + assertThat( + statements.get(8), + equalToCompressingWhiteSpace( + "ALTER DATABASE `%db_name%` SET OPTIONS ( default_sequence_kind = \"bit_reversed_positive\" )")); assertNotNull(ddl.hashCode()); } @@ -243,6 +277,21 @@ public void pgSimple() { .generatedAs("CONCAT(first_name, ' ', last_name)") .stored() .endColumn() + .column("identity_column") + .pgInt8() + .isIdentityColumn(true) + .sequenceKind("bit_reversed_positive") + .counterStartValue(1000L) + 
.skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() + .column("identity_column_no_kind") + .pgInt8() + .isIdentityColumn(true) + .counterStartValue(1000L) + .skipRangeMin(2000L) + .skipRangeMax(3000L) + .endColumn() .column("update_time") .pgSpannerCommitTimestamp() .notNull() @@ -270,6 +319,12 @@ public void pgSimple() { .setOptionValue("4d") .setOptionType("character varying") .build()) + .addDatabaseOptions( + Export.DatabaseOption.newBuilder() + .setOptionName("default_sequence_kind") + .setOptionValue("bit_reversed_positive") + .setOptionType("character varying") + .build()) .build(); builder.mergeDatabaseOptions(export.getDatabaseOptionsList()); Ddl ddl = builder.build(); @@ -277,6 +332,7 @@ public void pgSimple() { ddl.prettyPrint(), equalToCompressingWhiteSpace( "ALTER DATABASE \"%db_name%\" SET spanner.version_retention_period = '4d'" + + " ALTER DATABASE \"%db_name%\" SET spanner.default_sequence_kind = 'bit_reversed_positive'" + " CREATE TABLE \"Users\" (" + " \"id\" bigint NOT NULL," + " \"gen_id\" bigint NOT NULL GENERATED ALWAYS AS (MOD(id+1, 64)) STORED," @@ -284,6 +340,10 @@ public void pgSimple() { + " \"last_name\" character varying DEFAULT Lennon," + " \"full_name\" character varying GENERATED ALWAYS AS" + " (CONCAT(first_name, ' ', last_name)) STORED," + + " \"identity_column\" bigint GENERATED BY DEFAULT AS IDENTITY (" + + "BIT_REVERSED_POSITIVE SKIP RANGE 2000 3000 START COUNTER WITH 1000)," + + " \"identity_column_no_kind\" bigint GENERATED BY DEFAULT AS IDENTITY (" + + "SKIP RANGE 2000 3000 START COUNTER WITH 1000)," + " \"update_time\" spanner.commit_timestamp NOT NULL," + " CONSTRAINT \"ck\" CHECK (\"first_name\" != \"last_name\")," + " PRIMARY KEY (\"id\", \"gen_id\")" @@ -1047,22 +1107,34 @@ public void sequences() { "skip_range_max=1000", "start_with_counter=50")) .endSequence() + .createSequence("MySequence2") + .options( + ImmutableList.of( + "skip_range_min=0", "skip_range_max=1000", "start_with_counter=50")) + .endSequence() 
.build(); assertThat( ddl.prettyPrint(), equalToCompressingWhiteSpace( "CREATE SEQUENCE `MySequence`" + " OPTIONS (sequence_kind=\"bit_reversed_positive\", " - + " skip_range_min=0, skip_range_max=1000, start_with_counter=50)")); + + " skip_range_min=0, skip_range_max=1000, start_with_counter=50)" + + " CREATE SEQUENCE `MySequence2`" + + " OPTIONS (skip_range_min=0, skip_range_max=1000, start_with_counter=50)")); List statements = ddl.statements(); - assertEquals(1, statements.size()); + assertEquals(2, statements.size()); assertThat( statements.get(0), equalToCompressingWhiteSpace( "CREATE SEQUENCE `MySequence`" + " OPTIONS (sequence_kind=\"bit_reversed_positive\", " + " skip_range_min=0, skip_range_max=1000, start_with_counter=50)")); + assertThat( + statements.get(1), + equalToCompressingWhiteSpace( + "CREATE SEQUENCE `MySequence2`" + + " OPTIONS (skip_range_min=0, skip_range_max=1000, start_with_counter=50)")); assertNotNull(ddl.hashCode()); } @@ -1079,16 +1151,22 @@ public void pgSequences() { .createSequence("MyPGSequence2") .sequenceKind("bit_reversed_positive") .endSequence() + .createSequence("MyPGSequence3") + .counterStartValue(Long.valueOf(30)) + .skipRangeMin(Long.valueOf(1)) + .skipRangeMax(Long.valueOf(1000)) + .endSequence() .build(); assertThat( ddl.prettyPrint(), equalToCompressingWhiteSpace( "\nCREATE SEQUENCE \"MyPGSequence\" BIT_REVERSED_POSITIVE" + " SKIP RANGE 1 1000 START COUNTER WITH 30 " - + "\nCREATE SEQUENCE \"MyPGSequence2\" BIT_REVERSED_POSITIVE")); + + "\nCREATE SEQUENCE \"MyPGSequence2\" BIT_REVERSED_POSITIVE" + + "\nCREATE SEQUENCE \"MyPGSequence3\" SKIP RANGE 1 1000 START COUNTER WITH 30")); List statements = ddl.statements(); - assertEquals(2, statements.size()); + assertEquals(3, statements.size()); assertThat( statements.get(0), equalToCompressingWhiteSpace( @@ -1097,6 +1175,10 @@ public void pgSequences() { assertThat( statements.get(1), equalToCompressingWhiteSpace("CREATE SEQUENCE \"MyPGSequence2\" BIT_REVERSED_POSITIVE")); + 
assertThat( + statements.get(2), + equalToCompressingWhiteSpace( + "CREATE SEQUENCE \"MyPGSequence3\" SKIP RANGE 1 1000 START COUNTER WITH 30")); assertNotNull(ddl.hashCode()); } diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerIT.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerIT.java index 380e5abf37..03e4aaa894 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerIT.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerIT.java @@ -972,6 +972,40 @@ public void pgDefaultColumns() throws Exception { assertThat(ddl.prettyPrint(), equalToCompressingWhiteSpace(statement)); } + @Test + public void identityColumns() throws Exception { + List statements = + Arrays.asList( + "ALTER DATABASE `" + + dbId + + "` SET OPTIONS ( default_sequence_kind = \"bit_reversed_positive\" )", + "CREATE TABLE `T` (" + + " `id` INT64 NOT NULL GENERATED BY DEFAULT AS IDENTITY," + + " `non_key_col` INT64 NOT NULL GENERATED BY DEFAULT AS IDENTITY (BIT_REVERSED_POSITIVE)," + + " ) PRIMARY KEY (`id` ASC)"); + + SPANNER_SERVER.createDatabase(dbId, statements); + Ddl ddl = getDatabaseDdl(); + assertThat(ddl.prettyPrint(), equalToCompressingWhiteSpace(String.join("", statements))); + } + + @Test + public void pgIdentityColumns() throws Exception { + List statements = + Arrays.asList( + "ALTER DATABASE \"" + + dbId + + "\" SET spanner.default_sequence_kind = 'bit_reversed_positive'", + "CREATE TABLE \"T\" (" + + " \"id\" bigint NOT NULL GENERATED BY DEFAULT AS IDENTITY," + + " \"non_key_col\" bigint NOT NULL GENERATED BY DEFAULT AS IDENTITY (BIT_REVERSED_POSITIVE)," + + " PRIMARY KEY (\"id\") )"); + + SPANNER_SERVER.createPgDatabase(dbId, statements); + Ddl ddl = getPgDatabaseDdl(); + assertThat(ddl.prettyPrint(), equalToCompressingWhiteSpace(String.join("", statements))); + } + @Test public void databaseOptions() throws Exception { List 
statements = @@ -1080,12 +1114,21 @@ public void pgChangeStreams() throws Exception { public void sequences() throws Exception { List statements = Arrays.asList( + "ALTER DATABASE `" + + dbId + + "` SET OPTIONS ( default_sequence_kind = \"bit_reversed_positive\" )", "CREATE SEQUENCE `MySequence` OPTIONS (" + "sequence_kind = \"bit_reversed_positive\")", "CREATE SEQUENCE `MySequence2` OPTIONS (" + "sequence_kind = \"bit_reversed_positive\"," + "skip_range_min = 1," + "skip_range_max = 1000," + "start_with_counter = 100)", + "CREATE SEQUENCE `MySequence3` OPTIONS (" + + "skip_range_min = 1," + + "skip_range_max = 1000," + + "start_with_counter = 100)", + "CREATE SEQUENCE `MySequence4`", + "CREATE SEQUENCE `MySequence5` BIT_REVERSED_POSITIVE SKIP RANGE 1, 1000 START COUNTER WITH 100", "CREATE TABLE `Account` (" + " `id` INT64 DEFAULT (GET_NEXT_SEQUENCE_VALUE(SEQUENCE MySequence))," + " `balanceId` INT64 NOT NULL," @@ -1094,13 +1137,22 @@ public void sequences() throws Exception { SPANNER_SERVER.createDatabase(dbId, statements); Ddl ddl = getDatabaseDdl(); String expectedDdl = - "\nCREATE SEQUENCE `MySequence`\n\tOPTIONS " + "ALTER DATABASE `" + + dbId + + "` SET OPTIONS ( default_sequence_kind = \"bit_reversed_positive\" )" + + "\nCREATE SEQUENCE `MySequence`\n\tOPTIONS " + "(sequence_kind=\"bit_reversed_positive\")\n" - + "CREATE SEQUENCE `MySequence2`\n\tOPTIONS " + + "\nCREATE SEQUENCE `MySequence2`\n\tOPTIONS " + "(sequence_kind=\"bit_reversed_positive\"," + " skip_range_max=1000," + " skip_range_min=1," + " start_with_counter=100)" + + "\nCREATE SEQUENCE `MySequence3`\n\tOPTIONS " + + "(skip_range_max=1000," + + " skip_range_min=1," + + " start_with_counter=100)" + + "\nCREATE SEQUENCE `MySequence4`" + + "\nCREATE SEQUENCE `MySequence5` BIT_REVERSED_POSITIVE SKIP RANGE 1, 1000 START COUNTER WITH 100" + "CREATE TABLE `Account` (" + "\n\t`id` INT64 DEFAULT" + " (GET_NEXT_SEQUENCE_VALUE(SEQUENCE MySequence))," @@ -1113,9 +1165,13 @@ public void sequences() throws 
Exception { public void pgSequences() throws Exception { List statements = Arrays.asList( + "ALTER DATABASE \"" + + dbId + + "\" SET spanner.default_sequence_kind = 'bit_reversed_positive'", "CREATE SEQUENCE \"MyPGSequence\" BIT_REVERSED_POSITIVE", "CREATE SEQUENCE \"MyPGSequence2\" BIT_REVERSED_POSITIVE" + " SKIP RANGE 1 1000 START COUNTER WITH 100", + "CREATE SEQUENCE \"MyPGSequence3\"" + " SKIP RANGE 1 1000 START COUNTER WITH 100", "CREATE TABLE \"Account\" (" + " \"id\" bigint DEFAULT nextval('\"MyPGSequence\"')," + " \"balanceId\" bigint NOT NULL," @@ -1124,10 +1180,15 @@ public void pgSequences() throws Exception { SPANNER_SERVER.createPgDatabase(dbId, statements); Ddl ddl = getPgDatabaseDdl(); String expectedDdl = - "\nCREATE SEQUENCE \"MyPGSequence\" BIT_REVERSED_POSITIVE" + "ALTER DATABASE \"" + + dbId + + "\" SET spanner.default_sequence_kind = 'bit_reversed_positive'" + + "\nCREATE SEQUENCE \"MyPGSequence\" BIT_REVERSED_POSITIVE" + " START COUNTER WITH 1" + "\nCREATE SEQUENCE \"MyPGSequence2\" BIT_REVERSED_POSITIVE" + " SKIP RANGE 1 1000 START COUNTER WITH 100" + + "\nCREATE SEQUENCE \"MyPGSequence3\"" + + " SKIP RANGE 1 1000 START COUNTER WITH 100" + "CREATE TABLE \"Account\" (" + "\n\t\"id\" bigint NOT NULL" + " DEFAULT nextval('\"MyPGSequence\"'::text),\n\t" diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerTest.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerTest.java index aa78f7af83..2adef05947 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerTest.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/InformationSchemaScannerTest.java @@ -53,7 +53,9 @@ public void testListColumnsSQL() { + " FROM information_schema.constraint_column_usage AS c" + " WHERE c.constraint_name = CONCAT('PLACEMENT_KEY_', c.table_name))" + " SELECT c.table_schema, c.table_name, c.column_name, c.ordinal_position, c.spanner_type, 
c.is_nullable," - + " c.is_generated, c.generation_expression, c.is_stored, c.column_default, c.is_hidden," + + " c.is_generated, c.generation_expression, c.is_stored," + + " c.column_default, c.is_identity, c.identity_kind, c.identity_start_with_counter," + + " c.identity_skip_range_min, c.identity_skip_range_max, c.is_hidden," + " pkc.constraint_name IS NOT NULL AS is_placement_key" + " FROM information_schema.columns as c" + " LEFT JOIN placementkeycolumns AS pkc" @@ -70,6 +72,8 @@ public void testListColumnsSQL() { + " WHERE c.constraint_name = CONCAT('PLACEMENT_KEY_', c.table_name))" + " SELECT c.table_schema, c.table_name, c.column_name, c.ordinal_position, c.spanner_type, c.is_nullable," + " c.is_generated, c.generation_expression, c.is_stored, c.column_default," + + " c.is_identity, c.identity_kind, c.identity_start_with_counter," + + " c.identity_skip_range_min, c.identity_skip_range_max," + " pkc.constraint_name IS NOT NULL AS is_placement_key" + " FROM information_schema.columns as c" + " LEFT JOIN placementkeycolumns AS pkc" diff --git a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/RandomInsertMutationGenerator.java b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/RandomInsertMutationGenerator.java index b49ee692fc..87d75cf591 100644 --- a/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/RandomInsertMutationGenerator.java +++ b/v1/src/test/java/com/google/cloud/teleport/spanner/ddl/RandomInsertMutationGenerator.java @@ -155,7 +155,7 @@ public TableSupplier(Table table) { } } for (Column column : table.columns()) { - if (!column.isGenerated()) { + if (!column.isGenerated() && !column.isIdentityColumn()) { valueGenerators.put( column.name(), randomValueGenerator diff --git a/v1/src/test/java/com/google/cloud/teleport/templates/SpannerToTextIT.java b/v1/src/test/java/com/google/cloud/teleport/templates/SpannerToTextIT.java index 5240495a16..b75c057793 100644 --- 
a/v1/src/test/java/com/google/cloud/teleport/templates/SpannerToTextIT.java +++ b/v1/src/test/java/com/google/cloud/teleport/templates/SpannerToTextIT.java @@ -22,6 +22,7 @@ import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Mutation; +import com.google.cloud.teleport.metadata.SpannerStagingTest; import com.google.cloud.teleport.metadata.TemplateIntegrationTest; import java.io.IOException; import java.util.ArrayList; @@ -43,7 +44,7 @@ import org.junit.runners.Parameterized; /** Integration test for {@link SpannerToText Spanner to GCS Text} template. */ -@Category(TemplateIntegrationTest.class) +@Category({TemplateIntegrationTest.class, SpannerStagingTest.class}) @TemplateIntegrationTest(SpannerToText.class) @RunWith(Parameterized.class) public class SpannerToTextIT extends SpannerTemplateITBase { diff --git a/v1/src/test/java/com/google/cloud/teleport/templates/TextImportPipelineIT.java b/v1/src/test/java/com/google/cloud/teleport/templates/TextImportPipelineIT.java index 3f43b14ec0..e2893bcf83 100644 --- a/v1/src/test/java/com/google/cloud/teleport/templates/TextImportPipelineIT.java +++ b/v1/src/test/java/com/google/cloud/teleport/templates/TextImportPipelineIT.java @@ -23,6 +23,7 @@ import com.google.cloud.spanner.Dialect; import com.google.cloud.spanner.Struct; +import com.google.cloud.teleport.metadata.SpannerStagingTest; import com.google.cloud.teleport.metadata.TemplateIntegrationTest; import com.google.cloud.teleport.spanner.TextImportPipeline; import com.google.common.collect.ImmutableList; @@ -47,7 +48,7 @@ import org.junit.runners.JUnit4; /** Integration test for {@link TextImportPipeline}. 
*/ -@Category(TemplateIntegrationTest.class) +@Category({TemplateIntegrationTest.class, SpannerStagingTest.class}) @TemplateIntegrationTest(TextImportPipeline.class) @RunWith(JUnit4.class) public final class TextImportPipelineIT extends TemplateTestBase { diff --git a/v2/astradb-to-bigquery/README_AstraDB_To_BigQuery.md b/v2/astradb-to-bigquery/README_AstraDB_To_BigQuery.md index 35f187cc37..a4573d470e 100644 --- a/v2/astradb-to-bigquery/README_AstraDB_To_BigQuery.md +++ b/v2/astradb-to-bigquery/README_AstraDB_To_BigQuery.md @@ -28,17 +28,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **astraToken** : The token value or secret resource ID. (Example: AstraCS:abcdefghij). -* **astraDatabaseId** : The database unique identifier (UUID). (Example: cf7af129-d33a-498f-ad06-d97a6ee6eb7). -* **astraKeyspace** : The name of the Cassandra keyspace inside of the Astra database. -* **astraTable** : The name of the table inside of the Cassandra database. (Example: my_table). +* **astraToken**: The token value or secret resource ID. For example, `AstraCS:abcdefghij`. +* **astraDatabaseId**: The database unique identifier (UUID). For example, `cf7af129-d33a-498f-ad06-d97a6ee6eb7`. +* **astraKeyspace**: The name of the Cassandra keyspace inside of the Astra database. +* **astraTable**: The name of the table inside of the Cassandra database. For example, `my_table`. ### Optional parameters -* **astraQuery** : The query to use to filter rows instead of reading the whole table. -* **astraDatabaseRegion** : If not provided, a default is chosen, which is useful with multi-region databases. -* **minTokenRangesCount** : The minimal number of splits to use to distribute the query. -* **outputTableSpec** : The BigQuery table location to write the output to. Use the format `:.`. The table's schema must match the input objects. +* **astraQuery**: The query to use to filter rows instead of reading the whole table. 
+* **astraDatabaseRegion**: If not provided, a default is chosen, which is useful with multi-region databases. +* **minTokenRangesCount**: The minimal number of splits to use to distribute the query. +* **outputTableSpec**: The BigQuery table location to write the output to. Use the format `<PROJECT_ID>:<DATASET_NAME>.<TABLE_NAME>`. The table's schema must match the input objects. @@ -221,10 +221,10 @@ resource "google_dataflow_flex_template_job" "astradb_to_bigquery" { name = "astradb-to-bigquery" region = var.region parameters = { - astraToken = "AstraCS:abcdefghij" - astraDatabaseId = "cf7af129-d33a-498f-ad06-d97a6ee6eb7" + astraToken = "" + astraDatabaseId = "" astraKeyspace = "" - astraTable = "my_table" + astraTable = "" # astraQuery = "" # astraDatabaseRegion = "" # minTokenRangesCount = "" diff --git a/v2/azure-eventhub-to-pubsub/README_Azure_Eventhub_to_PubSub.md b/v2/azure-eventhub-to-pubsub/README_Azure_Eventhub_to_PubSub.md index 856d174b99..9ca0f45ab1 100644 --- a/v2/azure-eventhub-to-pubsub/README_Azure_Eventhub_to_PubSub.md +++ b/v2/azure-eventhub-to-pubsub/README_Azure_Eventhub_to_PubSub.md @@ -16,10 +16,10 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **brokerServer** : Server IP or DNS for Azure Eventhub Endpoint (Example: mynamespace.servicebus.windows.net:9093). -* **inputTopic** : Azure Eventhub topic(s) to read the input from (Example: topic). -* **outputTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). -* **secret** : Secret Version, it can be a number like 1,2 or 3 or can be 'latest' (Example: projects/{project}/secrets/{secret}/versions/{secret_version}). +* **brokerServer**: Server IP or DNS for Azure Eventhub Endpoint. For example, `mynamespace.servicebus.windows.net:9093`. +* **inputTopic**: Azure Eventhub topic(s) to read the input from. For example, `topic`.
+* **outputTopic**: The name of the topic to which data should be published, in the format of 'projects/your-project-id/topics/your-topic-name'. For example, `projects/your-project-id/topics/your-topic-name`. +* **secret**: Secret Version, it can be a number like 1,2 or 3 or can be 'latest'. For example, `projects/{project}/secrets/{secret}/versions/{secret_version}`. ### Optional parameters @@ -193,10 +193,10 @@ resource "google_dataflow_flex_template_job" "azure_eventhub_to_pubsub" { name = "azure-eventhub-to-pubsub" region = var.region parameters = { - brokerServer = "mynamespace.servicebus.windows.net:9093" - inputTopic = "topic" - outputTopic = "projects/your-project-id/topics/your-topic-name" - secret = "projects/{project}/secrets/{secret}/versions/{secret_version}" + brokerServer = "" + inputTopic = "" + outputTopic = "" + secret = "" } } ``` diff --git a/v2/bigquery-to-bigtable/README_BigQuery_to_Bigtable.md b/v2/bigquery-to-bigtable/README_BigQuery_to_Bigtable.md index 3219141278..7583403447 100644 --- a/v2/bigquery-to-bigtable/README_BigQuery_to_Bigtable.md +++ b/v2/bigquery-to-bigtable/README_BigQuery_to_Bigtable.md @@ -16,27 +16,28 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **readIdColumn** : The name of the BigQuery column storing the unique identifier of the row. -* **bigtableWriteInstanceId** : The ID of the Bigtable instance that contains the table. -* **bigtableWriteTableId** : The ID of the Bigtable table to write to. -* **bigtableWriteColumnFamily** : The name of the column family of the Bigtable table to write data into. +* **readIdColumn**: The name of the BigQuery column storing the unique identifier of the row. +* **bigtableWriteInstanceId**: The ID of the Bigtable instance that contains the table. +* **bigtableWriteTableId**: The ID of the Bigtable table to write to. +* **bigtableWriteColumnFamily**: The name of the column family of the Bigtable table to write data into.
### Optional parameters -* **inputTableSpec** : The BigQuery table to read from. Format: `projectId:datasetId.tablename`. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage). For information about limitations in the Storage Read API, see https://cloud.google.com/bigquery/docs/reference/storage#limitations. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. (Example: bigquery-project:dataset.input_table). -* **outputDeadletterTable** : The BigQuery table for messages that failed to reach the output table, in the format :.. If a table doesn't exist, is is created during pipeline execution. If not specified, `_error_records` is used. (Example: your-project-id:your-dataset.your-table-name). -* **query** : The SQL query to use to read data from BigQuery. If the BigQuery dataset is in a different project than the Dataflow job, specify the full dataset name in the SQL query, for example: ... By default, the `query` parameter uses GoogleSQL (https://cloud.google.com/bigquery/docs/introduction-sql), unless `useLegacySql` is `true`. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. (Example: select * from sampledb.sample_table). -* **useLegacySql** : Set to true to use legacy SQL. This parameter only applies when using the `query` parameter. Defaults to: false. -* **queryLocation** : Needed when reading from an authorized view without underlying table's permission. (Example: US). -* **queryTempDataset** : With this option, you can set an existing dataset to create the temporary table to store the results of the query. (Example: temp_dataset). -* **bigtableRpcAttemptTimeoutMs** : The timeout for each Bigtable RPC attempt in milliseconds. 
-* **bigtableRpcTimeoutMs** : The total timeout for a Bigtable RPC operation in milliseconds. -* **bigtableAdditionalRetryCodes** : The additional retry codes. (Example: RESOURCE_EXHAUSTED,DEADLINE_EXCEEDED). -* **bigtableWriteAppProfile** : The ID of the Bigtable application profile to use for the export. If you do not specify an app profile, Bigtable uses the default app profile (https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile) of the instance. -* **bigtableWriteProjectId** : The ID of the Google Cloud project that contains the Bigtable instanceto write data to. -* **bigtableBulkWriteLatencyTargetMs** : The latency target of Bigtable in milliseconds for latency-based throttling. -* **bigtableBulkWriteMaxRowKeyCount** : The maximum number of row keys in a Bigtable batch write operation. -* **bigtableBulkWriteMaxRequestSizeBytes** : The maximum bytes to include per Bigtable batch write operation. +* **inputTableSpec**: The BigQuery table to read from. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage). For information about limitations in the Storage Read API, see https://cloud.google.com/bigquery/docs/reference/storage#limitations. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. For example, `:.`. +* **outputDeadletterTable**: The BigQuery table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. For example, `:.`. +* **query**: The SQL query to use to read data from BigQuery. If the BigQuery dataset is in a different project than the Dataflow job, specify the full dataset name in the SQL query, for example: ... 
By default, the `query` parameter uses GoogleSQL (https://cloud.google.com/bigquery/docs/introduction-sql), unless `useLegacySql` is `true`. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. For example, `select * from sampledb.sample_table`. +* **useLegacySql**: Set to `true` to use legacy SQL. This parameter only applies when using the `query` parameter. Defaults to `false`. +* **queryLocation**: Needed when reading from an authorized view without underlying table's permission. For example, `US`. +* **queryTempDataset**: With this option, you can set an existing dataset to create the temporary table to store the results of the query. For example, `temp_dataset`. +* **KMSEncryptionKey**: If reading from BigQuery using query source, use this Cloud KMS key to encrypt any temporary tables created. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **bigtableRpcAttemptTimeoutMs**: The timeout for each Bigtable RPC attempt in milliseconds. +* **bigtableRpcTimeoutMs**: The total timeout for a Bigtable RPC operation in milliseconds. +* **bigtableAdditionalRetryCodes**: The additional retry codes. For example, `RESOURCE_EXHAUSTED,DEADLINE_EXCEEDED`. +* **bigtableWriteAppProfile**: The ID of the Bigtable application profile to use for the export. If you do not specify an app profile, Bigtable uses the default app profile (https://cloud.google.com/bigtable/docs/app-profiles#default-app-profile) of the instance. +* **bigtableWriteProjectId**: The ID of the Google Cloud project that contains the Bigtable instance to write data to. +* **bigtableBulkWriteLatencyTargetMs**: The latency target of Bigtable in milliseconds for latency-based throttling. +* **bigtableBulkWriteMaxRowKeyCount**: The maximum number of row keys in a Bigtable batch write operation.
+* **bigtableBulkWriteMaxRequestSizeBytes**: The maximum bytes to include per Bigtable batch write operation. @@ -127,6 +128,7 @@ export QUERY= export USE_LEGACY_SQL=false export QUERY_LOCATION= export QUERY_TEMP_DATASET= +export KMSENCRYPTION_KEY= export BIGTABLE_RPC_ATTEMPT_TIMEOUT_MS= export BIGTABLE_RPC_TIMEOUT_MS= export BIGTABLE_ADDITIONAL_RETRY_CODES= @@ -147,6 +149,7 @@ gcloud dataflow flex-template run "bigquery-to-bigtable-job" \ --parameters "useLegacySql=$USE_LEGACY_SQL" \ --parameters "queryLocation=$QUERY_LOCATION" \ --parameters "queryTempDataset=$QUERY_TEMP_DATASET" \ + --parameters "KMSEncryptionKey=$KMSENCRYPTION_KEY" \ --parameters "bigtableRpcAttemptTimeoutMs=$BIGTABLE_RPC_ATTEMPT_TIMEOUT_MS" \ --parameters "bigtableRpcTimeoutMs=$BIGTABLE_RPC_TIMEOUT_MS" \ --parameters "bigtableAdditionalRetryCodes=$BIGTABLE_ADDITIONAL_RETRY_CODES" \ @@ -188,6 +191,7 @@ export QUERY= export USE_LEGACY_SQL=false export QUERY_LOCATION= export QUERY_TEMP_DATASET= +export KMSENCRYPTION_KEY= export BIGTABLE_RPC_ATTEMPT_TIMEOUT_MS= export BIGTABLE_RPC_TIMEOUT_MS= export BIGTABLE_ADDITIONAL_RETRY_CODES= @@ -204,7 +208,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="bigquery-to-bigtable-job" \ -DtemplateName="BigQuery_to_Bigtable" \ 
--Dparameters="readIdColumn=$READ_ID_COLUMN,inputTableSpec=$INPUT_TABLE_SPEC,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,query=$QUERY,useLegacySql=$USE_LEGACY_SQL,queryLocation=$QUERY_LOCATION,queryTempDataset=$QUERY_TEMP_DATASET,bigtableRpcAttemptTimeoutMs=$BIGTABLE_RPC_ATTEMPT_TIMEOUT_MS,bigtableRpcTimeoutMs=$BIGTABLE_RPC_TIMEOUT_MS,bigtableAdditionalRetryCodes=$BIGTABLE_ADDITIONAL_RETRY_CODES,bigtableWriteInstanceId=$BIGTABLE_WRITE_INSTANCE_ID,bigtableWriteTableId=$BIGTABLE_WRITE_TABLE_ID,bigtableWriteColumnFamily=$BIGTABLE_WRITE_COLUMN_FAMILY,bigtableWriteAppProfile=$BIGTABLE_WRITE_APP_PROFILE,bigtableWriteProjectId=$BIGTABLE_WRITE_PROJECT_ID,bigtableBulkWriteLatencyTargetMs=$BIGTABLE_BULK_WRITE_LATENCY_TARGET_MS,bigtableBulkWriteMaxRowKeyCount=$BIGTABLE_BULK_WRITE_MAX_ROW_KEY_COUNT,bigtableBulkWriteMaxRequestSizeBytes=$BIGTABLE_BULK_WRITE_MAX_REQUEST_SIZE_BYTES" \ +-Dparameters="readIdColumn=$READ_ID_COLUMN,inputTableSpec=$INPUT_TABLE_SPEC,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,query=$QUERY,useLegacySql=$USE_LEGACY_SQL,queryLocation=$QUERY_LOCATION,queryTempDataset=$QUERY_TEMP_DATASET,KMSEncryptionKey=$KMSENCRYPTION_KEY,bigtableRpcAttemptTimeoutMs=$BIGTABLE_RPC_ATTEMPT_TIMEOUT_MS,bigtableRpcTimeoutMs=$BIGTABLE_RPC_TIMEOUT_MS,bigtableAdditionalRetryCodes=$BIGTABLE_ADDITIONAL_RETRY_CODES,bigtableWriteInstanceId=$BIGTABLE_WRITE_INSTANCE_ID,bigtableWriteTableId=$BIGTABLE_WRITE_TABLE_ID,bigtableWriteColumnFamily=$BIGTABLE_WRITE_COLUMN_FAMILY,bigtableWriteAppProfile=$BIGTABLE_WRITE_APP_PROFILE,bigtableWriteProjectId=$BIGTABLE_WRITE_PROJECT_ID,bigtableBulkWriteLatencyTargetMs=$BIGTABLE_BULK_WRITE_LATENCY_TARGET_MS,bigtableBulkWriteMaxRowKeyCount=$BIGTABLE_BULK_WRITE_MAX_ROW_KEY_COUNT,bigtableBulkWriteMaxRequestSizeBytes=$BIGTABLE_BULK_WRITE_MAX_REQUEST_SIZE_BYTES" \ -f v2/bigquery-to-bigtable ``` @@ -253,15 +257,16 @@ resource "google_dataflow_flex_template_job" "bigquery_to_bigtable" { bigtableWriteInstanceId = "" bigtableWriteTableId = "" 
bigtableWriteColumnFamily = "" - # inputTableSpec = "bigquery-project:dataset.input_table" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" - # query = "select * from sampledb.sample_table" + # inputTableSpec = "" + # outputDeadletterTable = "" + # query = "" # useLegacySql = "false" - # queryLocation = "US" - # queryTempDataset = "temp_dataset" + # queryLocation = "" + # queryTempDataset = "" + # KMSEncryptionKey = "" # bigtableRpcAttemptTimeoutMs = "" # bigtableRpcTimeoutMs = "" - # bigtableAdditionalRetryCodes = "RESOURCE_EXHAUSTED,DEADLINE_EXCEEDED" + # bigtableAdditionalRetryCodes = "" # bigtableWriteAppProfile = "default" # bigtableWriteProjectId = "" # bigtableBulkWriteLatencyTargetMs = "" diff --git a/v2/bigquery-to-parquet/README_BigQuery_to_Parquet.md b/v2/bigquery-to-parquet/README_BigQuery_to_Parquet.md index c2772d6da5..d74487573a 100644 --- a/v2/bigquery-to-parquet/README_BigQuery_to_Parquet.md +++ b/v2/bigquery-to-parquet/README_BigQuery_to_Parquet.md @@ -20,14 +20,14 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **tableRef** : The BigQuery input table location. (Example: your-project:your-dataset.your-table-name). -* **bucket** : The Cloud Storage folder to write the Parquet files to. (Example: gs://your-bucket/export/). +* **tableRef**: The BigQuery input table location. For example, `your-project:your-dataset.your-table-name`. +* **bucket**: The Cloud Storage folder to write the Parquet files to. For example, `gs://your-bucket/export/`. ### Optional parameters -* **numShards** : The number of output file shards. The default value is 1. -* **fields** : A comma-separated list of fields to select from the input BigQuery table. -* **rowRestriction** : Read only rows which match the specified filter, which must be a SQL expression compatible with Google standard SQL (https://cloud.google.com/bigquery/docs/reference/standard-sql). 
If no value is specified, then all rows are returned. +* **numShards**: The number of output file shards. The default value is `1`. +* **fields**: A comma-separated list of fields to select from the input BigQuery table. +* **rowRestriction**: Read only rows which match the specified filter, which must be a SQL expression compatible with Google standard SQL (https://cloud.google.com/bigquery/docs/reference/standard-sql). If no value is specified, then all rows are returned. @@ -201,8 +201,8 @@ resource "google_dataflow_flex_template_job" "bigquery_to_parquet" { name = "bigquery-to-parquet" region = var.region parameters = { - tableRef = "your-project:your-dataset.your-table-name" - bucket = "gs://your-bucket/export/" + tableRef = "" + bucket = "" # numShards = "0" # fields = "" # rowRestriction = "" diff --git a/v2/bigquery-to-parquet/src/main/java/com/google/cloud/teleport/v2/templates/BigQueryToParquet.java b/v2/bigquery-to-parquet/src/main/java/com/google/cloud/teleport/v2/templates/BigQueryToParquet.java index 2a9b75340e..fa3f96947d 100644 --- a/v2/bigquery-to-parquet/src/main/java/com/google/cloud/teleport/v2/templates/BigQueryToParquet.java +++ b/v2/bigquery-to-parquet/src/main/java/com/google/cloud/teleport/v2/templates/BigQueryToParquet.java @@ -169,7 +169,7 @@ public interface BigQueryToParquetOptions extends PipelineOptions { order = 3, optional = true, description = "Maximum output shards", - helpText = "The number of output file shards. The default value is 1.") + helpText = "The number of output file shards. 
The default value is `1`.") @Default.Integer(0) Integer getNumShards(); diff --git a/v2/bigtable-changestreams-to-hbase/README_Bigtable_Change_Streams_to_HBase.md b/v2/bigtable-changestreams-to-hbase/README_Bigtable_Change_Streams_to_HBase.md index 56d82d8fcc..6c7523c3f6 100644 --- a/v2/bigtable-changestreams-to-hbase/README_Bigtable_Change_Streams_to_HBase.md +++ b/v2/bigtable-changestreams-to-hbase/README_Bigtable_Change_Streams_to_HBase.md @@ -13,33 +13,33 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **hbaseRootDir** : Hbase root directory, corresponds to hbase.rootdir. -* **hbaseZookeeperQuorumHost** : Zookeeper quorum host, corresponds to hbase.zookeeper.quorum host. -* **bigtableChangeStreamAppProfile** : The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. -* **bigtableReadInstanceId** : The source Bigtable instance ID. -* **bigtableReadTableId** : The source Bigtable table ID. +* **hbaseRootDir**: Hbase root directory, corresponds to hbase.rootdir. +* **hbaseZookeeperQuorumHost**: Zookeeper quorum host, corresponds to hbase.zookeeper.quorum host. +* **bigtableChangeStreamAppProfile**: The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. +* **bigtableReadInstanceId**: The source Bigtable instance ID. +* **bigtableReadTableId**: The source Bigtable table ID. ### Optional parameters -* **bidirectionalReplicationEnabled** : Whether bidirectional replication between hbase and bigtable is enabled, adds additional logic to filter out hbase-replicated mutations. Defaults to: false. -* **cbtQualifier** : Bidirectional replication source CBT qualifier. Defaults to: BIDIRECTIONAL_REPL_SOURCE_CBT. -* **dryRunEnabled** : When dry run is enabled, pipeline will not write to Hbase. Defaults to: false. 
-* **filterGCMutations** : Filters out garbage collection Delete mutations from CBT. Defaults to: false. -* **hbaseQualifier** : Bidirectional replication source Hbase qualifier. Defaults to: BIDIRECTIONAL_REPL_SOURCE_HBASE. -* **hbaseZookeeperQuorumPort** : Zookeeper quorum port, corresponds to hbase.zookeeper.quorum port. Defaults to: 2181. -* **bigtableChangeStreamMetadataInstanceId** : The Bigtable change streams metadata instance ID. Defaults to empty. -* **bigtableChangeStreamMetadataTableTableId** : The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. -* **bigtableChangeStreamCharset** : The Bigtable change streams charset name. Defaults to: UTF-8. -* **bigtableChangeStreamStartTimestamp** : The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. -* **bigtableChangeStreamIgnoreColumnFamilies** : A comma-separated list of column family name changes to ignore. Defaults to empty. -* **bigtableChangeStreamIgnoreColumns** : A comma-separated list of column name changes to ignore. Defaults to empty. -* **bigtableChangeStreamName** : A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. -* **bigtableChangeStreamResume** : When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. 
If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. -* **bigtableReadProjectId** : The Bigtable project ID. The default is the project for the Dataflow job. -* **bigtableReadAppProfile** : Bigtable App Profile to use for reads. The default for this parameter is the Bigtable instance's default app profile. -* **bigtableRpcAttemptTimeoutMs** : The timeout for each Bigtable RPC attempt in milliseconds. -* **bigtableRpcTimeoutMs** : The total timeout for a Bigtable RPC operation in milliseconds. -* **bigtableAdditionalRetryCodes** : The additional retry codes. (Example: RESOURCE_EXHAUSTED,DEADLINE_EXCEEDED). +* **bidirectionalReplicationEnabled**: Whether bidirectional replication between hbase and bigtable is enabled, adds additional logic to filter out hbase-replicated mutations. Defaults to: false. +* **cbtQualifier**: Bidirectional replication source CBT qualifier. Defaults to: BIDIRECTIONAL_REPL_SOURCE_CBT. +* **dryRunEnabled**: When dry run is enabled, pipeline will not write to Hbase. Defaults to: false. +* **filterGCMutations**: Filters out garbage collection Delete mutations from CBT. Defaults to: false. +* **hbaseQualifier**: Bidirectional replication source Hbase qualifier. Defaults to: BIDIRECTIONAL_REPL_SOURCE_HBASE. +* **hbaseZookeeperQuorumPort**: Zookeeper quorum port, corresponds to hbase.zookeeper.quorum port. Defaults to: 2181. +* **bigtableChangeStreamMetadataInstanceId**: The Bigtable change streams metadata instance ID. Defaults to empty. +* **bigtableChangeStreamMetadataTableTableId**: The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. +* **bigtableChangeStreamCharset**: The Bigtable change streams charset name. Defaults to: UTF-8. 
+* **bigtableChangeStreamStartTimestamp**: The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. +* **bigtableChangeStreamIgnoreColumnFamilies**: A comma-separated list of column family name changes to ignore. Defaults to empty. +* **bigtableChangeStreamIgnoreColumns**: A comma-separated list of column name changes to ignore. Defaults to empty. +* **bigtableChangeStreamName**: A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. +* **bigtableChangeStreamResume**: When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. +* **bigtableReadProjectId**: The Bigtable project ID. The default is the project for the Dataflow job. +* **bigtableReadAppProfile**: Bigtable App Profile to use for reads. The default for this parameter is the Bigtable instance's default app profile. +* **bigtableRpcAttemptTimeoutMs**: The timeout for each Bigtable RPC attempt in milliseconds. +* **bigtableRpcTimeoutMs**: The total timeout for a Bigtable RPC operation in milliseconds. +* **bigtableAdditionalRetryCodes**: The additional retry codes. For example, `RESOURCE_EXHAUSTED,DEADLINE_EXCEEDED`. 
@@ -293,7 +293,7 @@ resource "google_dataflow_flex_template_job" "bigtable_change_streams_to_hbase" # bigtableReadAppProfile = "default" # bigtableRpcAttemptTimeoutMs = "" # bigtableRpcTimeoutMs = "" - # bigtableAdditionalRetryCodes = "RESOURCE_EXHAUSTED,DEADLINE_EXCEEDED" + # bigtableAdditionalRetryCodes = "" } } ``` diff --git a/v2/cdc-parent/cdc-agg/README_Cdc_To_BigQuery_Template.md b/v2/cdc-parent/cdc-agg/README_Cdc_To_BigQuery_Template.md index c9a974b092..b1a9a9fad0 100644 --- a/v2/cdc-parent/cdc-agg/README_Cdc_To_BigQuery_Template.md +++ b/v2/cdc-parent/cdc-agg/README_Cdc_To_BigQuery_Template.md @@ -16,19 +16,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscriptions** : The comma-separated list of Pub/Sub input subscriptions to read from, in the format `<SUBSCRIPTION_NAME>,<SUBSCRIPTION_NAME>, ...`. -* **changeLogDataset** : The BigQuery dataset to store the staging tables in, in the format <DATASET_NAME>. -* **replicaDataset** : The location of the BigQuery dataset to store the replica tables in, in the format <DATASET_NAME>. +* **inputSubscriptions**: The comma-separated list of Pub/Sub input subscriptions to read from, in the format `<SUBSCRIPTION_NAME>,<SUBSCRIPTION_NAME>, ...`. +* **changeLogDataset**: The BigQuery dataset to store the staging tables in, in the format <DATASET_NAME>. +* **replicaDataset**: The location of the BigQuery dataset to store the replica tables in, in the format <DATASET_NAME>. ### Optional parameters -* **inputTopics** : Comma-separated list of PubSub topics to where CDC data is being pushed. -* **updateFrequencySecs** : The interval at which the pipeline updates the BigQuery table replicating the MySQL database. -* **useSingleTopic** : Set this to true if you have configured your Debezium connector to publish all table updates to a single topic. Defaults to: false. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. 
For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **inputTopics**: Comma-separated list of PubSub topics to where CDC data is being pushed. +* **updateFrequencySecs**: The interval at which the pipeline updates the BigQuery table replicating the MySQL database. +* **useSingleTopic**: Set this to `true` if you configure your Debezium connector to publish all table updates to a single topic. Defaults to: false. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. 
To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. diff --git a/v2/cdc-parent/cdc-change-applier/README_Cdc_To_BigQuery_Template.md b/v2/cdc-parent/cdc-change-applier/README_Cdc_To_BigQuery_Template.md index f95ec2f098..859ef3c389 100644 --- a/v2/cdc-parent/cdc-change-applier/README_Cdc_To_BigQuery_Template.md +++ b/v2/cdc-parent/cdc-change-applier/README_Cdc_To_BigQuery_Template.md @@ -16,19 +16,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscriptions** : The comma-separated list of Pub/Sub input subscriptions to read from, in the format `<SUBSCRIPTION_NAME>,<SUBSCRIPTION_NAME>, ...`. -* **changeLogDataset** : The BigQuery dataset to store the staging tables in, in the format <DATASET_NAME>. -* **replicaDataset** : The location of the BigQuery dataset to store the replica tables in, in the format <DATASET_NAME>. +* **inputSubscriptions**: The comma-separated list of Pub/Sub input subscriptions to read from, in the format `<SUBSCRIPTION_NAME>,<SUBSCRIPTION_NAME>, ...`. +* **changeLogDataset**: The BigQuery dataset to store the staging tables in, in the format <DATASET_NAME>. +* **replicaDataset**: The location of the BigQuery dataset to store the replica tables in, in the format <DATASET_NAME>. ### Optional parameters -* **inputTopics** : Comma-separated list of PubSub topics to where CDC data is being pushed. -* **updateFrequencySecs** : The interval at which the pipeline updates the BigQuery table replicating the MySQL database. 
-* **useSingleTopic** : Set this to true if you have configured your Debezium connector to publish all table updates to a single topic. Defaults to: false. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **inputTopics**: Comma-separated list of PubSub topics to where CDC data is being pushed. +* **updateFrequencySecs**: The interval at which the pipeline updates the BigQuery table replicating the MySQL database. +* **useSingleTopic**: Set this to `true` if you configure your Debezium connector to publish all table updates to a single topic. Defaults to: false. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. 
For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. 
diff --git a/v2/cdc-parent/cdc-change-applier/src/main/java/com/google/cloud/dataflow/cdc/applier/CdcToBigQueryChangeApplierPipeline.java b/v2/cdc-parent/cdc-change-applier/src/main/java/com/google/cloud/dataflow/cdc/applier/CdcToBigQueryChangeApplierPipeline.java index 46776037d7..662c1b4696 100644 --- a/v2/cdc-parent/cdc-change-applier/src/main/java/com/google/cloud/dataflow/cdc/applier/CdcToBigQueryChangeApplierPipeline.java +++ b/v2/cdc-parent/cdc-change-applier/src/main/java/com/google/cloud/dataflow/cdc/applier/CdcToBigQueryChangeApplierPipeline.java @@ -152,7 +152,7 @@ public interface CdcApplierOptions extends PipelineOptions, BigQueryStorageApiSt optional = true, description = "Whether to use a single topic for all MySQL table changes.", helpText = - "Set this to true if you have configured your Debezium connector to publish all table" + "Set this to `true` if you configure your Debezium connector to publish all table" + " updates to a single topic") @Default.Boolean(false) Boolean getUseSingleTopic(); diff --git a/v2/common/src/main/java/com/google/cloud/teleport/v2/auto/blocks/WriteToPubSub.java b/v2/common/src/main/java/com/google/cloud/teleport/v2/auto/blocks/WriteToPubSub.java index f6d64add79..0025fa36ad 100644 --- a/v2/common/src/main/java/com/google/cloud/teleport/v2/auto/blocks/WriteToPubSub.java +++ b/v2/common/src/main/java/com/google/cloud/teleport/v2/auto/blocks/WriteToPubSub.java @@ -32,9 +32,8 @@ public interface WriteToPubSubOptions extends PipelineOptions { order = 8, groupName = "Target", description = "Output Pub/Sub topic", - helpText = - "The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name'", - example = "projects/your-project-id/topics/your-topic-name") + helpText = "The name of the topic to publish data to.", + example = "projects/<PROJECT_ID>/topics/<TOPIC_NAME>") @Validation.Required String getOutputTopic(); diff --git 
a/v2/common/src/main/java/com/google/cloud/teleport/v2/options/CommonTemplateOptions.java b/v2/common/src/main/java/com/google/cloud/teleport/v2/options/CommonTemplateOptions.java index 88601095a6..606be47993 100644 --- a/v2/common/src/main/java/com/google/cloud/teleport/v2/options/CommonTemplateOptions.java +++ b/v2/common/src/main/java/com/google/cloud/teleport/v2/options/CommonTemplateOptions.java @@ -26,7 +26,7 @@ public interface CommonTemplateOptions extends PipelineOptions { optional = true, description = "Disabled algorithms to override jdk.tls.disabledAlgorithms", helpText = - "Comma separated algorithms to disable. If this value is set to none, no algorithm is " + "Comma separated algorithms to disable. If this value is set to `none`, no algorithm is " + "disabled. Use this parameter with caution, because the algorithms disabled " + "by default might have vulnerabilities or performance issues.", example = "SSLv3, RC4") @@ -45,7 +45,7 @@ public interface CommonTemplateOptions extends PipelineOptions { "Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. 
" + "These files are saved in the /extra_files directory in each worker.", example = - "gs:///file.txt,projects//secrets//versions/") + "gs:///file.txt,projects//secrets//versions/") String getExtraFilesToStage(); void setExtraFilesToStage(String extraFilesToStage); diff --git a/v2/common/src/main/java/com/google/cloud/teleport/v2/options/WindowedFilenamePolicyOptions.java b/v2/common/src/main/java/com/google/cloud/teleport/v2/options/WindowedFilenamePolicyOptions.java index 26f579671d..a0033cb0c8 100644 --- a/v2/common/src/main/java/com/google/cloud/teleport/v2/options/WindowedFilenamePolicyOptions.java +++ b/v2/common/src/main/java/com/google/cloud/teleport/v2/options/WindowedFilenamePolicyOptions.java @@ -61,8 +61,8 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { "The window duration is the interval in which data is written to the output directory. " + "Configure the duration based on the pipeline's throughput. For example, a higher " + "throughput might require smaller window sizes so that the data fits into memory. " - + "Defaults to 5m (5 minutes), with a minimum of 1s (1 second). Allowed formats are: [int]s (for seconds, example: 5s), " - + "[int]m (for minutes, example: 12m), [int]h (for hours, example: 2h).", + + "Defaults to `5m` (5 minutes), with a minimum of `1s` (1 second). Allowed formats are: `[int]s` (for seconds, example: `5s`), " + + "`[int]m` (for minutes, example: `12m`), `[int]h` (for hours, example: `2h`).", example = "5m") @Default.String("5m") String getWindowDuration(); @@ -75,9 +75,9 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { regexes = {"^[^A-Za-z0-9/](y+|Y+)[^A-Za-z0-9/]$"}, description = "Custom Year Pattern to use for the output directory", helpText = - "Pattern for formatting the year. Must be one or more of 'y' or 'Y'. Case makes no" + "Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no" + " difference in the year. 
The pattern can be optionally wrapped by characters that" - + " aren't either alphanumeric or the directory ('/') character. Defaults to 'YYYY'") + + " aren't either alphanumeric or the directory (`/`) character. Defaults to `YYYY`") @Default.String("YYYY") String getYearPattern(); @@ -89,9 +89,9 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { regexes = {"^[^A-Za-z0-9/](M+)[^A-Za-z0-9/]$"}, description = "Custom Month Pattern to use for the output directory", helpText = - "Pattern for formatting the month. Must be one or more of the 'M' character. The " + "Pattern for formatting the month. Must be one or more of the `M` character. The " + "pattern can be optionally wrapped by characters that aren't alphanumeric or the " - + "directory ('/') character. Defaults to 'MM'") + + "directory (`/`) character. Defaults to `MM`") @Default.String("MM") String getMonthPattern(); @@ -103,10 +103,10 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { regexes = {"^[^A-Za-z0-9/](d+|D+)[^A-Za-z0-9/]$"}, description = "Custom Day Pattern to use for the output directory", helpText = - "Pattern for formatting the day. Must be one or more of 'd' for day of month or 'D' for" + "Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for" + " day of year. Case makes no difference in the year. The pattern can be optionally" - + " wrapped by characters that aren't either alphanumeric or the directory ('/')" - + " character. Defaults to 'dd'") + + " wrapped by characters that aren't either alphanumeric or the directory (`/`)" + + " character. Defaults to `dd`") @Default.String("dd") String getDayPattern(); @@ -118,9 +118,9 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { regexes = {"^[^A-Za-z0-9/](H+)[^A-Za-z0-9/]$"}, description = "Custom Hour Pattern to use for the output directory", helpText = - "Pattern for formatting the hour. Must be one or more of the 'H' character. 
The pattern" + "Pattern for formatting the hour. Must be one or more of the `H` character. The pattern" + " can be optionally wrapped by characters that aren't alphanumeric or the directory" - + " ('/') character. Defaults to 'HH'") + + " (`/`) character. Defaults to `HH`") @Default.String("HH") String getHourPattern(); @@ -132,9 +132,9 @@ public interface WindowedFilenamePolicyOptions extends PipelineOptions { regexes = {"^[^A-Za-z0-9/](m+)[^A-Za-z0-9/]$"}, description = "Custom Minute Pattern to use for the output directory", helpText = - "Pattern for formatting the minute. Must be one or more of the 'm' character. The pattern" + "Pattern for formatting the minute. Must be one or more of the `m` character. The pattern" + " can be optionally wrapped by characters that aren't alphanumeric or the directory" - + " ('/') character. Defaults to 'mm'") + + " (`/`) character. Defaults to `mm`") @Default.String("mm") String getMinutePattern(); diff --git a/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/BigQueryConverters.java b/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/BigQueryConverters.java index 116071d69f..6ee96fbcc6 100644 --- a/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/BigQueryConverters.java +++ b/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/BigQueryConverters.java @@ -168,12 +168,12 @@ public interface BigQueryReadOptions extends PipelineOptions { optional = true, description = "BigQuery source table", helpText = - "The BigQuery table to read from. Format: `projectId:datasetId.tablename`. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the" + "The BigQuery table to read from. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the" + " BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage)." 
+ " For information about limitations in the Storage Read API, see" + " https://cloud.google.com/bigquery/docs/reference/storage#limitations." + " You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter.", - example = "bigquery-project:dataset.input_table") + example = ":.") String getInputTableSpec(); void setInputTableSpec(String inputTableSpec); @@ -183,11 +183,10 @@ public interface BigQueryReadOptions extends PipelineOptions { optional = true, description = "The dead-letter table name to output failed messages to BigQuery", helpText = - "The BigQuery table for messages that failed to reach the output" - + " table, in the format :.." - + " If a table doesn't exist, is is created during pipeline execution. If" + "The BigQuery table for messages that failed to reach the output table." + + " If a table doesn't exist, it is created during pipeline execution. If" + " not specified, `_error_records` is used.", - example = "your-project-id:your-dataset.your-table-name") + example = ":.") String getOutputDeadletterTable(); void setOutputDeadletterTable(String outputDeadletterTable); @@ -212,8 +211,8 @@ public interface BigQueryReadOptions extends PipelineOptions { optional = true, description = "Set to true to use legacy SQL", helpText = - "Set to true to use legacy SQL. This parameter only applies when using" - + " the `query` parameter. Defaults to: false.") + "Set to `true` to use legacy SQL. This parameter only applies when using" + + " the `query` parameter. 
Defaults to `false`.") @Default.Boolean(false) Boolean getUseLegacySql(); diff --git a/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/CsvConverters.java b/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/CsvConverters.java index 7440101d41..f117f86e9f 100644 --- a/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/CsvConverters.java +++ b/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/CsvConverters.java @@ -184,7 +184,7 @@ public interface CsvPipelineOptions extends PipelineOptions { order = 1, description = "The input filepattern to read from.", helpText = - "The Cloud Storage file pattern to search for CSV files. Example: gs://mybucket/test-*.csv.") + "The Cloud Storage file pattern to search for CSV files. For example, `gs://mybucket/test-*.csv`.") String getInputFileSpec(); void setInputFileSpec(String inputFileSpec); @@ -214,8 +214,7 @@ public interface CsvPipelineOptions extends PipelineOptions { order = 4, optional = true, description = "Column delimiter of the data files.", - helpText = - "The column delimiter of the input text files. Default: use delimiter provided in csvFormat", + helpText = "The column delimiter of the input text files. Default: `,`", example = ",") @Default.InstanceFactory(DelimiterFactory.class) String getDelimiter(); @@ -227,7 +226,7 @@ public interface CsvPipelineOptions extends PipelineOptions { optional = true, description = "CSV Format to use for parsing records.", helpText = - "CSV format specification to use for parsing records. Default is: Default. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: " + "CSV format specification to use for parsing records. Default is: `Default`. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. 
Must match format names exactly found at: " + "https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html") @Default.String("Default") String getCsvFormat(); @@ -238,7 +237,7 @@ public interface CsvPipelineOptions extends PipelineOptions { order = 6, optional = true, description = "Path to JSON schema", - helpText = "The path to the JSON schema. Defaults to: null.", + helpText = "The path to the JSON schema. Defaults to `null`.", example = "gs://path/to/schema") String getJsonSchemaPath(); @@ -249,7 +248,7 @@ public interface CsvPipelineOptions extends PipelineOptions { optional = true, description = "Set to true if number of files is in the tens of thousands", helpText = - "Set to true if number of files is in the tens of thousands. Defaults to: false.") + "Set to true if number of files is in the tens of thousands. Defaults to `false`.") @Default.Boolean(false) Boolean getLargeNumFiles(); @@ -261,7 +260,7 @@ public interface CsvPipelineOptions extends PipelineOptions { regexes = {"^(US-ASCII|ISO-8859-1|UTF-8|UTF-16)$"}, description = "CSV file encoding", helpText = - "The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16.") + "The CSV file character encoding format. Allowed values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`.") @Default.String("UTF-8") String getCsvFileEncoding(); @@ -272,8 +271,8 @@ public interface CsvPipelineOptions extends PipelineOptions { optional = true, description = "Log detailed CSV conversion errors", helpText = - "Set to true to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords)." - + " Default: false.") + "Set to `true` to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords)." 
+ + " Default: `false`.") @Default.Boolean(false) Boolean getLogDetailedCsvConversionErrors(); diff --git a/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/JavascriptTextTransformer.java b/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/JavascriptTextTransformer.java index ba8a48d1ca..ed05a4843f 100644 --- a/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/JavascriptTextTransformer.java +++ b/v2/common/src/main/java/com/google/cloud/teleport/v2/transforms/JavascriptTextTransformer.java @@ -109,8 +109,8 @@ public interface JavascriptTextTransformerOptions extends PipelineOptions { + "is greater than 0, Dataflow periodically checks the UDF file in " + "Cloud Storage, and reloads the UDF if the file is modified. " + "This parameter allows you to update the UDF while the pipeline is running, " - + "without needing to restart the job. If the value is 0, UDF reloading is " - + "disabled. The default value is 0.") + + "without needing to restart the job. If the value is `0`, UDF reloading is " + + "disabled. The default value is `0`.") @Default.Integer(0) Integer getJavascriptTextTransformReloadIntervalMinutes(); diff --git a/v2/dataplex/README_Dataplex_BigQuery_to_GCS.md b/v2/dataplex/README_Dataplex_BigQuery_to_GCS.md index 717c745133..aa5016d07e 100644 --- a/v2/dataplex/README_Dataplex_BigQuery_to_GCS.md +++ b/v2/dataplex/README_Dataplex_BigQuery_to_GCS.md @@ -14,21 +14,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **sourceBigQueryDataset** : Dataplex asset name for the BigQuery dataset to tier data from. Format: projects//locations//lakes//zones//assets/ (Dataplex asset name) or projects//datasets/ (BigQuery dataset ID). -* **destinationStorageBucketAssetName** : Dataplex asset name for the Cloud Storage bucket to tier data to. Format: projects//locations//lakes//zones//assets/. 
-* **maxParallelBigQueryMetadataRequests** : The maximum number of parallel requests that will be sent to BigQuery when loading table/partition metadata. Defaults to: 5. +* **sourceBigQueryDataset**: Dataplex asset name for the BigQuery dataset to tier data from. Format: projects//locations//lakes//zones//assets/ (Dataplex asset name) or projects//datasets/ (BigQuery dataset ID). +* **destinationStorageBucketAssetName**: Dataplex asset name for the Cloud Storage bucket to tier data to. Format: projects//locations//lakes//zones//assets/. +* **maxParallelBigQueryMetadataRequests**: The maximum number of parallel requests that will be sent to BigQuery when loading table/partition metadata. Defaults to: 5. ### Optional parameters -* **tables** : A comma-separated list of BigQuery tables to tier. If none specified, all tables will be tiered. Tables should be specified by their name only (no project/dataset prefix). Case-sensitive!. -* **exportDataModifiedBeforeDateTime** : Move data older than this date (and optional time). For partitioned tables, move partitions last modified before this date/time. For non-partitioned tables, move if the table was last modified before this date/time. If not specified, move all tables / partitions. The date/time is parsed in the default time zone by default, but optional suffixes Z and +HH:mm are supported. Format: YYYY-MM-DD or YYYY-MM-DDTHH:mm:ss or YYYY-MM-DDTHH:mm:ss+03:00. Relative date/time (https://en.wikipedia.org/wiki/ISO_8601#Durations) is also supported. Format: -PnDTnHnMn.nS (must start with -P meaning time in the past). -* **fileFormat** : Output file format in Cloud Storage. Format: PARQUET or AVRO. Defaults to: PARQUET. -* **fileCompression** : Output file compression. Format: UNCOMPRESSED, SNAPPY, GZIP, or BZIP2. BZIP2 not supported for PARQUET files. Defaults to: SNAPPY. -* **partitionIdRegExp** : Process partitions with partition ID matching this regexp only. Default: process all. 
-* **writeDisposition** : Specifies the action that occurs if a destination file already exists. Format: OVERWRITE, FAIL, SKIP. If SKIP, only files that don't exist in the destination directory will be processed. If FAIL and at least one file already exists, no data will be processed and an error will be produced. Defaults to: SKIP. -* **enforceSamePartitionKey** : Whether to enforce the same partition key. Due to a BigQuery limitation, it's not possible to have a partitioned external table with the partition key (in the file path) to have the same name as one of the columns in the file. If this param is true (the default), the partition key of the target file will be set to the original partition column name and the column in the file will be renamed. If false, it's the partition key that will be renamed. -* **deleteSourceData** : Whether to delete source data from BigQuery after a successful export. Format: true or false. Defaults to: false. -* **updateDataplexMetadata** : Whether to update Dataplex metadata for the newly created entities. Only supported for Cloud Storage destination. If enabled, the pipeline will automatically copy the schema from source to the destination Dataplex entities, and the automated Dataplex Discovery won't run for them. Use this flag in cases where you have managed schema at the source. Defaults to: false. +* **tables**: A comma-separated list of BigQuery tables to tier. If none specified, all tables will be tiered. Tables should be specified by their name only (no project/dataset prefix). Case-sensitive!. +* **exportDataModifiedBeforeDateTime**: Move data older than this date (and optional time). For partitioned tables, move partitions last modified before this date/time. For non-partitioned tables, move if the table was last modified before this date/time. If not specified, move all tables / partitions. The date/time is parsed in the default time zone by default, but optional suffixes Z and +HH:mm are supported. 
Format: YYYY-MM-DD or YYYY-MM-DDTHH:mm:ss or YYYY-MM-DDTHH:mm:ss+03:00. Relative date/time (https://en.wikipedia.org/wiki/ISO_8601#Durations) is also supported. Format: -PnDTnHnMn.nS (must start with -P meaning time in the past). +* **fileFormat**: Output file format in Cloud Storage. Format: PARQUET or AVRO. Defaults to: PARQUET. +* **fileCompression**: Output file compression. Format: UNCOMPRESSED, SNAPPY, GZIP, or BZIP2. BZIP2 not supported for PARQUET files. Defaults to: SNAPPY. +* **partitionIdRegExp**: Process partitions with partition ID matching this regexp only. Default: process all. +* **writeDisposition**: Specifies the action that occurs if a destination file already exists. Format: OVERWRITE, FAIL, SKIP. If SKIP, only files that don't exist in the destination directory will be processed. If FAIL and at least one file already exists, no data will be processed and an error will be produced. Defaults to: SKIP. +* **enforceSamePartitionKey**: Whether to enforce the same partition key. Due to a BigQuery limitation, it's not possible to have a partitioned external table with the partition key (in the file path) to have the same name as one of the columns in the file. If this param is true (the default), the partition key of the target file will be set to the original partition column name and the column in the file will be renamed. If false, it's the partition key that will be renamed. +* **deleteSourceData**: Whether to delete source data from BigQuery after a successful export. Format: true or false. Defaults to: false. +* **updateDataplexMetadata**: Whether to update Dataplex metadata for the newly created entities. Only supported for Cloud Storage destination. If enabled, the pipeline will automatically copy the schema from source to the destination Dataplex entities, and the automated Dataplex Discovery won't run for them. Use this flag in cases where you have managed schema at the source. Defaults to: false. 
diff --git a/v2/dataplex/README_Dataplex_File_Format_Conversion.md b/v2/dataplex/README_Dataplex_File_Format_Conversion.md index 2a80b0474e..64ee27083d 100644 --- a/v2/dataplex/README_Dataplex_File_Format_Conversion.md +++ b/v2/dataplex/README_Dataplex_File_Format_Conversion.md @@ -14,15 +14,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputAssetOrEntitiesList** : Dataplex asset or Dataplex entities that contain the input files. Format: projects//locations//lakes//zones//assets/ OR projects//locations//lakes//zones//entities/,projects//locations//lakes//zones//entities/... . -* **outputFileFormat** : Output file format in Cloud Storage. Format: PARQUET or AVRO. -* **outputAsset** : Name of the Dataplex asset that contains Cloud Storage bucket where output files will be put into. Format: projects//locations//lakes//zones//assets/. +* **inputAssetOrEntitiesList**: Dataplex asset or Dataplex entities that contain the input files. Format: projects//locations//lakes//zones//assets/ OR projects//locations//lakes//zones//entities/,projects//locations//lakes//zones//entities/... . +* **outputFileFormat**: Output file format in Cloud Storage. Format: PARQUET or AVRO. +* **outputAsset**: Name of the Dataplex asset that contains Cloud Storage bucket where output files will be put into. Format: projects//locations//lakes//zones//assets/. ### Optional parameters -* **outputFileCompression** : Output file compression. Format: UNCOMPRESSED, SNAPPY, GZIP, or BZIP2. BZIP2 not supported for PARQUET files. Defaults to: SNAPPY. -* **writeDisposition** : Specifies the action that occurs if a destination file already exists. Format: OVERWRITE, FAIL, SKIP. If SKIP, only files that don't exist in the destination directory will be processed. If FAIL and at least one file already exists, no data will be processed and an error will be produced. Defaults to: SKIP. 
-* **updateDataplexMetadata** : Whether to update Dataplex metadata for the newly created entities. Only supported for Cloud Storage destination. If enabled, the pipeline will automatically copy the schema from source to the destination Dataplex entities, and the automated Dataplex Discovery won't run for them. Use this flag in cases where you have managed schema at the source. Defaults to: false. +* **outputFileCompression**: Output file compression. Format: UNCOMPRESSED, SNAPPY, GZIP, or BZIP2. BZIP2 not supported for PARQUET files. Defaults to: SNAPPY. +* **writeDisposition**: Specifies the action that occurs if a destination file already exists. Format: OVERWRITE, FAIL, SKIP. If SKIP, only files that don't exist in the destination directory will be processed. If FAIL and at least one file already exists, no data will be processed and an error will be produced. Defaults to: SKIP. +* **updateDataplexMetadata**: Whether to update Dataplex metadata for the newly created entities. Only supported for Cloud Storage destination. If enabled, the pipeline will automatically copy the schema from source to the destination Dataplex entities, and the automated Dataplex Discovery won't run for them. Use this flag in cases where you have managed schema at the source. Defaults to: false. diff --git a/v2/dataplex/README_Dataplex_JDBC_Ingestion.md b/v2/dataplex/README_Dataplex_JDBC_Ingestion.md index fe4f05d583..128cc6ec88 100644 --- a/v2/dataplex/README_Dataplex_JDBC_Ingestion.md +++ b/v2/dataplex/README_Dataplex_JDBC_Ingestion.md @@ -22,28 +22,28 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionURL** : Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). 
-* **driverJars** : Comma separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : Query to be executed on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTable** : BigQuery table location or Cloud Storage top folder name to write the output to. If it's a BigQuery table location, the table’s schema must match the source query schema and should in the format of some-project-id:somedataset.sometable. If it's a Cloud Storage top folder, just provide the top folder name. -* **outputAsset** : Dataplex output asset ID to which the results are stored to. Should be in the format of projects/your-project/locations//lakes//zones//assets/. +* **connectionURL**: URL connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **driverJars**: Comma separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: Query to be executed on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTable**: BigQuery table location or Cloud Storage top folder name to write the output to. If it's a BigQuery table location, the table’s schema must match the source query schema and should be in the format of some-project-id:somedataset.sometable. If it's a Cloud Storage top folder, just provide the top folder name. +* **outputAsset**: Dataplex output asset ID in which the results are stored. Should be in the format of projects/your-project/locations//lakes//zones//assets/. ### Optional parameters -* **connectionProperties** : Properties string to use for the JDBC connection.
Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **username** : User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **partitioningScheme** : The partition scheme when writing the file. Format: DAILY or MONTHLY or HOURLY. Defaults to: DAILY. -* **paritionColumn** : The partition column on which the partition is based. The column type must be of timestamp/date format. -* **writeDisposition** : Strategy to employ if the target file/table exists. If the table exists - should it overwrite/append or fail the load. Format: WRITE_APPEND or WRITE_TRUNCATE or WRITE_EMPTY. Only supported for writing to BigQuery. Defaults to: WRITE_EMPTY. -* **fileFormat** : Output file format in Cloud Storage. Format: PARQUET or AVRO. Defaults to: PARQUET. -* **useColumnAlias** : If enabled (set to true) the pipeline will consider column alias ("AS") instead of the column name to map the rows to BigQuery. Defaults to false. -* **fetchSize** : It should ONLY be used if the default value throws memory errors. If not set, using Beam's default fetch size. -* **updateDataplexMetadata** : Whether to update Dataplex metadata for the newly created entities. Only supported for Cloud Storage destination. 
If enabled, the pipeline will automatically copy the schema from source to the destination Dataplex entities, and the automated Dataplex Discovery won't run for them. Use this flag in cases where you have managed schema at the source. Defaults to: false. -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **partitioningScheme**: The partition scheme when writing the file. Format: DAILY or MONTHLY or HOURLY. 
Defaults to: DAILY. +* **paritionColumn**: The partition column on which the partition is based. The column type must be of timestamp/date format. +* **writeDisposition**: Strategy to employ if the target file/table exists. If the table exists - should it overwrite/append or fail the load. Format: WRITE_APPEND or WRITE_TRUNCATE or WRITE_EMPTY. Only supported for writing to BigQuery. Defaults to: WRITE_EMPTY. +* **fileFormat**: Output file format in Cloud Storage. Format: PARQUET or AVRO. Defaults to: PARQUET. +* **useColumnAlias**: If enabled (set to true) the pipeline will consider column alias ("AS") instead of the column name to map the rows to BigQuery. Defaults to false. +* **fetchSize**: It should ONLY be used if the default value throws memory errors. If not set, using Beam's default fetch size. +* **updateDataplexMetadata**: Whether to update Dataplex metadata for the newly created entities. Only supported for Cloud Storage destination. If enabled, the pipeline will automatically copy the schema from source to the destination Dataplex entities, and the automated Dataplex Discovery won't run for them. Use this flag in cases where you have managed schema at the source. Defaults to: false. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. 
@@ -259,16 +259,16 @@ resource "google_dataflow_flex_template_job" "dataplex_jdbc_ingestion" { name = "dataplex-jdbc-ingestion" region = var.region parameters = { - connectionURL = "jdbc:mysql://some-host:3306/sampledb" - driverClassName = "com.mysql.jdbc.Driver" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" + connectionURL = "" + driverClassName = "" + driverJars = "" + query = "" outputTable = "" outputAsset = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + # connectionProperties = "" # username = "" # password = "" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # KMSEncryptionKey = "" # partitioningScheme = "DAILY" # paritionColumn = "" # writeDisposition = "WRITE_EMPTY" diff --git a/v2/datastream-common/src/main/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJson.java b/v2/datastream-common/src/main/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJson.java index 6b96ba25a5..ff5ef5d5ea 100644 --- a/v2/datastream-common/src/main/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJson.java +++ b/v2/datastream-common/src/main/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJson.java @@ -25,6 +25,7 @@ import java.time.Duration; import java.time.Instant; import java.time.LocalDate; +import java.time.Period; import java.time.ZoneId; import java.time.ZoneOffset; import java.time.ZonedDateTime; @@ -43,6 +44,7 @@ import org.apache.avro.generic.GenericData; import org.apache.avro.generic.GenericRecord; import org.apache.beam.sdk.transforms.SerializableFunction; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -563,6 +565,48 @@ static void handleDatastreamRecordType( .withZoneSameInstant(ZoneId.of("UTC")) 
.format(DEFAULT_TIMESTAMP_WITH_TZ_FORMATTER)); break; + /* + * The `intervalNano` maps to the nanosecond-precision interval type used by Cassandra Interval. + * On Spanner this will map to `string` or `Interval` type. + * This is added here for DLQ retries for sourcedb-to-spanner. + * + * TODO(b/383689307): + * There's a lot of commonality in handling Avro types between {@link FormatDatastreamRecordToJson} and {@link com.google.cloud.teleport.v2.spanner.migrations.avro.GenericRecordTypeConvertor}. + * Adding inter-package dependency might not be the best route, and we might eventually want to build a common package for handling common logic between the two. + */ + case "intervalNano": + Period period = + Period.ZERO + .plusYears(getOrDefault(element, "years", 0L)) + .plusMonths(getOrDefault(element, "months", 0L)) + .plusDays(getOrDefault(element, "days", 0L)); + /* + * Convert the period to an ISO-8601 period formatted String, such as P6Y3M1D. + * A zero period will be represented as zero days, 'P0D'. + * Refer to javadoc for Period#toString. + */ + String periodIso8061 = period.toString(); + java.time.Duration duration = + java.time.Duration.ZERO + .plusHours(getOrDefault(element, "hours", 0L)) + .plusMinutes(getOrDefault(element, "minutes", 0L)) + .plusSeconds(getOrDefault(element, "seconds", 0L)) + .plusNanos(getOrDefault(element, "nanos", 0L)); + /* + * Convert the duration to an ISO-8601 duration formatted String, such as PT8H6M12.345S. + * Refer to javadoc for Duration#toString. + */ + String durationIso8610 = duration.toString(); + // Convert to ISO-8601 period format.
+ String convertedIntervalNano; + if (duration.isZero()) { + convertedIntervalNano = periodIso8061; + } else { + convertedIntervalNano = + periodIso8061 + StringUtils.removeStartIgnoreCase(durationIso8610, "P"); + } + jsonObject.put(fieldName, convertedIntervalNano); + break; default: LOG.warn( "Unknown field type {} for field {} in record {}.", fieldSchema, fieldName, element); @@ -578,5 +622,12 @@ static void handleDatastreamRecordType( break; } } + + private static T getOrDefault(GenericRecord element, String name, T def) { + if (element.get(name) == null) { + return def; + } + return (T) element.get(name); + } } } diff --git a/v2/datastream-common/src/test/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJsonTest.java b/v2/datastream-common/src/test/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJsonTest.java index 6ce9d2e818..71e31b65bc 100644 --- a/v2/datastream-common/src/test/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJsonTest.java +++ b/v2/datastream-common/src/test/java/com/google/cloud/teleport/v2/datastream/transforms/FormatDatastreamRecordToJsonTest.java @@ -18,17 +18,21 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; +import com.google.cloud.teleport.v2.datastream.transforms.FormatDatastreamRecordToJson.UnifiedTypesFormatter; import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.net.URL; import org.apache.avro.LogicalTypes; import org.apache.avro.Schema; +import org.apache.avro.SchemaBuilder; import org.apache.avro.file.DataFileReader; +import org.apache.avro.generic.GenericData; import 
org.apache.avro.generic.GenericDatumReader; import org.apache.avro.generic.GenericRecord; import org.apache.avro.io.DatumReader; @@ -211,4 +215,128 @@ public void testLogicalType_micros() { fieldNamePositiveNumber, fieldSchema, element, jsonObject); assertTrue(jsonObject.get(fieldNamePositiveNumber).asText().equals("1981-11-21T11:45:11Z")); } + + @Test + public void testIntervalNano() throws JsonProcessingException { + + ObjectNode objectNode = new ObjectNode(new JsonNodeFactory(true)); + + /* Basic Test. */ + UnifiedTypesFormatter.handleDatastreamRecordType( + "basic", + generateIntervalNanosSchema(), + generateIntervalNanosRecord(1000L, 1000L, 3890L, 25L, 331L, 12L, 9L), + objectNode); + + /* Test that any field set to null is treated as 0. */ + + UnifiedTypesFormatter.handleDatastreamRecordType( + "null_minute", + generateIntervalNanosSchema(), + generateIntervalNanosRecord(1000L, 1000L, 3890L, 25L, null, 12L, 9L), + objectNode); + + /* Basic test for negative field. */ + + UnifiedTypesFormatter.handleDatastreamRecordType( + "neg_field_basic", + generateIntervalNanosSchema(), + generateIntervalNanosRecord(1000L, -1000L, 3890L, 25L, 31L, 12L, 9L), + objectNode); + + /* Test that negative nanos subtract from the fractional seconds, for example 12 seconds and -9 nanos becomes 11.999999991S. */ + UnifiedTypesFormatter.handleDatastreamRecordType( + "neg_fractional_seconds", + generateIntervalNanosSchema(), + generateIntervalNanosRecord(1000L, 31L, 3890L, 25L, 31L, 12L, -9L), + objectNode); + + /* Test 0 interval. */ + UnifiedTypesFormatter.handleDatastreamRecordType( + "zero_interval", + generateIntervalNanosSchema(), + generateIntervalNanosRecord(0L, 0L, 0L, 0L, 0L, 0L, 0L), + objectNode); + + /* Test almost zero interval with only nanos set. */ + UnifiedTypesFormatter.handleDatastreamRecordType( + "one_nano_interval", + generateIntervalNanosSchema(), + generateIntervalNanosRecord(0L, 0L, 0L, 0L, 0L, 0L, 1L), + objectNode); + /* Test with large values.
*/ + UnifiedTypesFormatter.handleDatastreamRecordType( + "large_values", + generateIntervalNanosSchema(), + generateIntervalNanosRecord( + 2147483647L, 11L, 2147483647L, 2147483647L, 2147483647L, 2147483647L, 999999999L), + objectNode); + + /* Test with large negative values. */ + UnifiedTypesFormatter.handleDatastreamRecordType( + "large_negative_values", + generateIntervalNanosSchema(), + generateIntervalNanosRecord( + -2147483647L, + -11L, + -2147483647L, + -2147483647L, + -2147483647L, + -2147483647L, + -999999999L), + objectNode); + String expected = + "{\"basic\":\"P1000Y1000M3890DT30H31M12.000000009S\"," + + "\"null_minute\":\"P1000Y1000M3890DT25H12.000000009S\"," + + "\"neg_field_basic\":\"P1000Y-1000M3890DT25H31M12.000000009S\"," + + "\"neg_fractional_seconds\":\"P1000Y31M3890DT25H31M11.999999991S\"," + + "\"zero_interval\":\"P0D\"," + + "\"one_nano_interval\":\"P0DT0.000000001S\"," + + "\"large_values\":\"P2147483647Y11M2147483647DT2183871564H21M7.999999999S\"," + + "\"large_negative_values\":\"P-2147483647Y-11M-2147483647DT-2183871564H-21M-7.999999999S\"}"; + assertEquals(expected, new ObjectMapper().writeValueAsString(objectNode)); + } + + private GenericRecord generateIntervalNanosRecord( + Long years, Long months, Long days, Long hours, Long minutes, Long seconds, Long nanos) { + + GenericRecord genericRecord = new GenericData.Record(generateIntervalNanosSchema()); + genericRecord.put("years", years); + genericRecord.put("months", months); + genericRecord.put("days", days); + genericRecord.put("hours", hours); + genericRecord.put("minutes", minutes); + genericRecord.put("seconds", seconds); + genericRecord.put("nanos", nanos); + return genericRecord; + } + + private Schema generateIntervalNanosSchema() { + + return SchemaBuilder.builder() + .record("intervalNano") + .fields() + .name("years") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("months") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("days") + 
.type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("hours") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("minutes") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("seconds") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("nanos") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .endRecord(); + } } diff --git a/v2/datastream-to-bigquery/README_Cloud_Datastream_to_BigQuery.md b/v2/datastream-to-bigquery/README_Cloud_Datastream_to_BigQuery.md index 99d1d246cf..09ba8ec724 100644 --- a/v2/datastream-to-bigquery/README_Cloud_Datastream_to_BigQuery.md +++ b/v2/datastream-to-bigquery/README_Cloud_Datastream_to_BigQuery.md @@ -31,39 +31,39 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The file location for Datastream file output in Cloud Storage, in the format: gs:////. -* **inputFileFormat** : The format of the output files produced by Datastream. Value can be 'avro' or 'json'. Defaults to: avro. -* **gcsPubSubSubscription** : The Pub/Sub subscription used by Cloud Storage to notify Dataflow of new files available for processing, in the format: projects//subscriptions/. -* **outputStagingDatasetTemplate** : The name of the dataset that contains staging tables. This parameter supports templates, for example {_metadata_dataset}_log or my_dataset_log. Normally, this parameter is a dataset name. Defaults to: {_metadata_dataset}. -* **outputDatasetTemplate** : The name of the dataset that contains the replica tables. This parameter supports templates, for example {_metadata_dataset} or my_dataset. Normally, this parameter is a dataset name. Defaults to: {_metadata_dataset}. -* **deadLetterQueueDirectory** : The path that Dataflow uses to write the dead-letter queue output. This path must not be in the same path as the Datastream file output. Defaults to empty. 
+* **inputFilePattern**: The file location for Datastream file output in Cloud Storage, in the format `gs:////`. +* **inputFileFormat**: The format of the output files produced by Datastream. Allowed values are `avro` and `json`. Defaults to `avro`. +* **gcsPubSubSubscription**: The Pub/Sub subscription used by Cloud Storage to notify Dataflow of new files available for processing, in the format: `projects//subscriptions/`. +* **outputStagingDatasetTemplate**: The name of the dataset that contains staging tables. This parameter supports templates, for example `{_metadata_dataset}_log` or `my_dataset_log`. Normally, this parameter is a dataset name. Defaults to `{_metadata_dataset}`. +* **outputDatasetTemplate**: The name of the dataset that contains the replica tables. This parameter supports templates, for example `{_metadata_dataset}` or `my_dataset`. Normally, this parameter is a dataset name. Defaults to `{_metadata_dataset}`. +* **deadLetterQueueDirectory**: The path that Dataflow uses to write the dead-letter queue output. This path must not be in the same path as the Datastream file output. Defaults to `empty`. ### Optional parameters -* **streamName** : The name or the template for the stream to poll for schema information. Defaults to: {_metadata_stream}. The default value is usually enough. -* **rfcStartDateTime** : The starting DateTime to use to fetch data from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. -* **fileReadConcurrency** : The number of concurrent DataStream files to read. Default is 10. -* **outputProjectId** : The ID of the Google Cloud project that contains the BigQuery datasets to output data into. The default for this parameter is the project where the Dataflow pipeline is running. -* **outputStagingTableNameTemplate** : The template to use to name the staging tables. For example, {_metadata_table}). Defaults to: {_metadata_table}_log. 
-* **outputTableNameTemplate** : The template to use for the name of the replica tables, for example {_metadata_table}. Defaults to: {_metadata_table}. -* **ignoreFields** : Comma-separated fields to ignore in BigQuery. Defaults to: _metadata_stream,_metadata_schema,_metadata_table,_metadata_source,_metadata_tx_id,_metadata_dlq_reconsumed,_metadata_primary_keys,_metadata_error,_metadata_retry_count. (Example: _metadata_stream,_metadata_schema). -* **mergeFrequencyMinutes** : The number of minutes between merges for a given table. Defaults to: 5. -* **dlqRetryMinutes** : The number of minutes between DLQ Retries. Defaults to: 10. -* **dataStreamRootUrl** : The Datastream API root URL. Defaults to: https://datastream.googleapis.com/. -* **applyMerge** : Whether to disable MERGE queries for the job. Defaults to: true. -* **mergeConcurrency** : The number of concurrent BigQuery MERGE queries. Only effective when applyMerge is set to true. Defaults to: 30. -* **partitionRetentionDays** : The number of days to use for partition retention when running BigQuery merges. Defaults to: 1. -* **useStorageWriteApiAtLeastOnce** : This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. If true, at-least-once semantics are used for the Storage Write API. Otherwise, exactly-once semantics are used. Defaults to: false. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). 
-* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. -* **pythonTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-transforms/*.py). -* **pythonRuntimeVersion** : The runtime version to use for this Python UDF. -* **pythonTextTransformFunctionName** : The name of the function to call from your JavaScript file. Use only letters, digits, and underscores. (Example: transform_udf1). -* **runtimeRetries** : The number of times a runtime will be retried before failing. Defaults to: 5. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **streamName**: The name or the template for the stream to poll for schema information. Defaults to: {_metadata_stream}. The default value is usually enough. 
+* **rfcStartDateTime**: The starting DateTime to use to fetch data from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: `1970-01-01T00:00:00.00Z`. +* **fileReadConcurrency**: The number of concurrent DataStream files to read. Default is `10`. +* **outputProjectId**: The ID of the Google Cloud project that contains the BigQuery datasets to output data into. The default for this parameter is the project where the Dataflow pipeline is running. +* **outputStagingTableNameTemplate**: The template to use to name the staging tables. For example, `{_metadata_table}`. Defaults to `{_metadata_table}_log`. +* **outputTableNameTemplate**: The template to use for the name of the replica tables, for example `{_metadata_table}`. Defaults to `{_metadata_table}`. +* **ignoreFields**: Comma-separated fields to ignore in BigQuery. Defaults to: `_metadata_stream,_metadata_schema,_metadata_table,_metadata_source,_metadata_tx_id,_metadata_dlq_reconsumed,_metadata_primary_keys,_metadata_error,_metadata_retry_count`. For example, `_metadata_stream,_metadata_schema`. +* **mergeFrequencyMinutes**: The number of minutes between merges for a given table. Defaults to `5`. +* **dlqRetryMinutes**: The number of minutes between DLQ Retries. Defaults to `10`. +* **dataStreamRootUrl**: The Datastream API root URL. Defaults to: https://datastream.googleapis.com/. +* **applyMerge**: Whether to disable MERGE queries for the job. Defaults to `true`. +* **mergeConcurrency**: The number of concurrent BigQuery MERGE queries. Only effective when applyMerge is set to true. Defaults to `30`. +* **partitionRetentionDays**: The number of days to use for partition retention when running BigQuery merges. Defaults to `1`. +* **useStorageWriteApiAtLeastOnce**: This parameter takes effect only if `Use BigQuery Storage Write API` is enabled. If `true`, at-least-once semantics are used for the Storage Write API. Otherwise, exactly-once semantics are used. Defaults to `false`. 
+* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. +* **pythonTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-transforms/*.py`. +* **pythonRuntimeVersion**: The runtime version to use for this Python UDF. +* **pythonTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `transform_udf1`. +* **runtimeRetries**: The number of times a runtime will be retried before failing. Defaults to: 5. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams.
If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. ## User-Defined functions (UDFs) @@ -334,7 +334,7 @@ resource "google_dataflow_flex_template_job" "cloud_datastream_to_bigquery" { # outputProjectId = "" # outputStagingTableNameTemplate = "{_metadata_table}_log" # outputTableNameTemplate = "{_metadata_table}" - # ignoreFields = "_metadata_stream,_metadata_schema" + # ignoreFields = "_metadata_stream,_metadata_schema,_metadata_table,_metadata_source,_metadata_tx_id,_metadata_dlq_reconsumed,_metadata_primary_keys,_metadata_error,_metadata_retry_count" # mergeFrequencyMinutes = "5" # dlqRetryMinutes = "10" # dataStreamRootUrl = "https://datastream.googleapis.com/" @@ -342,12 +342,12 @@ resource "google_dataflow_flex_template_job" "cloud_datastream_to_bigquery" { # mergeConcurrency = "30" # partitionRetentionDays = "1" # useStorageWriteApiAtLeastOnce = "false" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" - # pythonTextTransformGcsPath = "gs://your-bucket/your-transforms/*.py" + # pythonTextTransformGcsPath = "" # pythonRuntimeVersion = "" - # pythonTextTransformFunctionName = "transform_udf1" + # pythonTextTransformFunctionName = "" # runtimeRetries = "5" # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" diff --git a/v2/datastream-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToBigQuery.java b/v2/datastream-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToBigQuery.java index b55fb26204..5b29c3acff 100644 --- 
a/v2/datastream-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToBigQuery.java +++ b/v2/datastream-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToBigQuery.java @@ -149,7 +149,7 @@ public interface Options groupName = "Source", description = "File location for Datastream file output in Cloud Storage.", helpText = - "The file location for Datastream file output in Cloud Storage, in the format: gs:////.") + "The file location for Datastream file output in Cloud Storage, in the format `gs:////`.") String getInputFilePattern(); void setInputFilePattern(String value); @@ -159,7 +159,7 @@ public interface Options enumOptions = {@TemplateEnumOption("avro"), @TemplateEnumOption("json")}, description = "Datastream output file format (avro/json).", helpText = - "The format of the output files produced by Datastream. Value can be 'avro' or 'json'. Defaults to: avro.") + "The format of the output files produced by Datastream. Allowed values are `avro` and `json`. Defaults to `avro`.") @Default.String("avro") String getInputFileFormat(); @@ -169,7 +169,7 @@ public interface Options order = 3, description = "The Pub/Sub subscription on the Cloud Storage bucket.", helpText = - "The Pub/Sub subscription used by Cloud Storage to notify Dataflow of new files available for processing, in the format: projects//subscriptions/.") + "The Pub/Sub subscription used by Cloud Storage to notify Dataflow of new files available for processing, in the format: `projects//subscriptions/`.") String getGcsPubSubSubscription(); void setGcsPubSubSubscription(String value); @@ -191,7 +191,7 @@ public interface Options "The starting DateTime used to fetch from Cloud Storage " + "(https://tools.ietf.org/html/rfc3339).", helpText = - "The starting DateTime to use to fetch data from Cloud Storage (https://tools.ietf.org/html/rfc3339). 
Defaults to: 1970-01-01T00:00:00.00Z.") + "The starting DateTime to use to fetch data from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: `1970-01-01T00:00:00.00Z`.") @Default.String("1970-01-01T00:00:00.00Z") String getRfcStartDateTime(); @@ -201,7 +201,7 @@ public interface Options order = 6, optional = true, description = "File read concurrency", - helpText = "The number of concurrent DataStream files to read. Default is 10.") + helpText = "The number of concurrent DataStream files to read. Default is `10`.") @Default.Integer(10) Integer getFileReadConcurrency(); @@ -223,7 +223,7 @@ public interface Options groupName = "Target", description = "Name or template for the dataset to contain staging tables.", helpText = - "The name of the dataset that contains staging tables. This parameter supports templates, for example {_metadata_dataset}_log or my_dataset_log. Normally, this parameter is a dataset name. Defaults to: {_metadata_dataset}.") + "The name of the dataset that contains staging tables. This parameter supports templates, for example `{_metadata_dataset}_log` or `my_dataset_log`. Normally, this parameter is a dataset name. Defaults to `{_metadata_dataset}`.") @Default.String("{_metadata_dataset}") String getOutputStagingDatasetTemplate(); @@ -235,7 +235,7 @@ public interface Options groupName = "Target", description = "Template for the name of staging tables.", helpText = - "The template to use to name the staging tables. For example, {_metadata_table}). Defaults to: {_metadata_table}_log.") + "The template to use to name the staging tables. For example, `{_metadata_table}`. Defaults to `{_metadata_table}_log`.") @Default.String("{_metadata_table}_log") String getOutputStagingTableNameTemplate(); @@ -246,7 +246,7 @@ public interface Options groupName = "Target", description = "Template for the dataset to contain replica tables.", helpText = - "The name of the dataset that contains the replica tables. 
This parameter supports templates, for example {_metadata_dataset} or my_dataset. Normally, this parameter is a dataset name. Defaults to: {_metadata_dataset}.") + "The name of the dataset that contains the replica tables. This parameter supports templates, for example `{_metadata_dataset}` or `my_dataset`. Normally, this parameter is a dataset name. Defaults to `{_metadata_dataset}`.") @Default.String("{_metadata_dataset}") String getOutputDatasetTemplate(); @@ -258,7 +258,7 @@ public interface Options optional = true, description = "Template for the name of replica tables.", helpText = - "The template to use for the name of the replica tables, for example {_metadata_table}. Defaults to: {_metadata_table}.") + "The template to use for the name of the replica tables, for example `{_metadata_table}`. Defaults to `{_metadata_table}`.") @Default.String("{_metadata_table}") String getOutputTableNameTemplate(); @@ -269,7 +269,7 @@ public interface Options optional = true, description = "Fields to be ignored", helpText = - "Comma-separated fields to ignore in BigQuery. Defaults to: _metadata_stream,_metadata_schema,_metadata_table,_metadata_source,_metadata_tx_id,_metadata_dlq_reconsumed,_metadata_primary_keys,_metadata_error,_metadata_retry_count.", + "Comma-separated fields to ignore in BigQuery. Defaults to: `_metadata_stream,_metadata_schema,_metadata_table,_metadata_source,_metadata_tx_id,_metadata_dlq_reconsumed,_metadata_primary_keys,_metadata_error,_metadata_retry_count`.", example = "_metadata_stream,_metadata_schema") @Default.String( "_metadata_stream,_metadata_schema,_metadata_table,_metadata_source," @@ -283,7 +283,7 @@ public interface Options order = 13, optional = true, description = "The number of minutes between merges for a given table", - helpText = "The number of minutes between merges for a given table. Defaults to: 5.") + helpText = "The number of minutes between merges for a given table. 
Defaults to `5`.") @Default.Integer(5) Integer getMergeFrequencyMinutes(); @@ -293,7 +293,7 @@ public interface Options order = 14, description = "Dead letter queue directory.", helpText = - "The path that Dataflow uses to write the dead-letter queue output. This path must not be in the same path as the Datastream file output. Defaults to empty.") + "The path that Dataflow uses to write the dead-letter queue output. This path must not be in the same path as the Datastream file output. Defaults to `empty`.") @Default.String("") String getDeadLetterQueueDirectory(); @@ -303,7 +303,7 @@ public interface Options order = 15, optional = true, description = "The number of minutes between DLQ Retries.", - helpText = "The number of minutes between DLQ Retries. Defaults to: 10.") + helpText = "The number of minutes between DLQ Retries. Defaults to `10`.") @Default.Integer(10) Integer getDlqRetryMinutes(); @@ -323,7 +323,7 @@ public interface Options order = 17, optional = true, description = "A switch to disable MERGE queries for the job.", - helpText = "Whether to disable MERGE queries for the job. Defaults to: true.") + helpText = "Whether to disable MERGE queries for the job. Defaults to `true`.") @Default.Boolean(true) Boolean getApplyMerge(); @@ -336,7 +336,7 @@ public interface Options parentTriggerValues = {"true"}, description = "Concurrent queries for merge.", helpText = - "The number of concurrent BigQuery MERGE queries. Only effective when applyMerge is set to true. Defaults to: 30.") + "The number of concurrent BigQuery MERGE queries. Only effective when applyMerge is set to true. Defaults to `30`.") @Default.Integer(MergeConfiguration.DEFAULT_MERGE_CONCURRENCY) Integer getMergeConcurrency(); @@ -347,7 +347,7 @@ public interface Options optional = true, description = "Partition retention days.", helpText = - "The number of days to use for partition retention when running BigQuery merges. 
Defaults to: 1.") + "The number of days to use for partition retention when running BigQuery merges. Defaults to `1`.") @Default.Integer(MergeConfiguration.DEFAULT_PARTITION_RETENTION_DAYS) Integer getPartitionRetentionDays(); @@ -360,7 +360,7 @@ public interface Options parentTriggerValues = {"true"}, description = "Use at at-least-once semantics in BigQuery Storage Write API", helpText = - "This parameter takes effect only if \"Use BigQuery Storage Write API\" is enabled. If true, at-least-once semantics are used for the Storage Write API. Otherwise, exactly-once semantics are used. Defaults to: false.", + "This parameter takes effect only if `Use BigQuery Storage Write API` is enabled. If `true`, at-least-once semantics are used for the Storage Write API. Otherwise, exactly-once semantics are used. Defaults to `false`.", hiddenUi = true) @Default.Boolean(false) @Override diff --git a/v2/datastream-to-spanner/README_Cloud_Datastream_to_Spanner.md b/v2/datastream-to-spanner/README_Cloud_Datastream_to_Spanner.md index 1f68678427..df8ca29a25 100644 --- a/v2/datastream-to-spanner/README_Cloud_Datastream_to_Spanner.md +++ b/v2/datastream-to-spanner/README_Cloud_Datastream_to_Spanner.md @@ -42,41 +42,41 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **instanceId** : The Spanner instance where the changes are replicated. -* **databaseId** : The Spanner database where the changes are replicated. +* **instanceId**: The Spanner instance where the changes are replicated. +* **databaseId**: The Spanner database where the changes are replicated. ### Optional parameters -* **inputFilePattern** : The Cloud Storage file location that contains the Datastream files to replicate. Typically, this is the root path for a stream. Support for this feature has been disabled. -* **inputFileFormat** : The format of the output file produced by Datastream. For example `avro,json`. Default, `avro`. 
-* **sessionFilePath** : Session file path in Cloud Storage that contains mapping information from HarbourBridge. -* **projectId** : The Spanner project ID. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **gcsPubSubSubscription** : The Pub/Sub subscription being used in a Cloud Storage notification policy. The name should be in the format of projects//subscriptions/. -* **streamName** : The name or template for the stream to poll for schema information and source type. -* **shadowTablePrefix** : The prefix used to name shadow tables. Default: `shadow_`. -* **shouldCreateShadowTables** : This flag indicates whether shadow tables must be created in Cloud Spanner database. Defaults to: true. -* **rfcStartDateTime** : The starting DateTime used to fetch from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. -* **fileReadConcurrency** : The number of concurrent DataStream files to read. Defaults to: 30. -* **deadLetterQueueDirectory** : The file path used when storing the error queue output. The default file path is a directory under the Dataflow job's temp location. -* **dlqRetryMinutes** : The number of minutes between dead letter queue retries. Defaults to 10. -* **dlqMaxRetryCount** : The max number of times temporary errors can be retried through DLQ. Defaults to 500. -* **dataStreamRootUrl** : Datastream API Root URL. Defaults to: https://datastream.googleapis.com/. -* **datastreamSourceType** : This is the type of source database that Datastream connects to. Example - mysql/oracle. Need to be set when testing without an actual running Datastream. -* **roundJsonDecimals** : This flag if set, rounds the decimal values in json columns to a number that can be stored without loss of precision. Defaults to: false. -* **runMode** : This is the run mode type, whether regular or with retryDLQ. 
Defaults to: regular. -* **transformationContextFilePath** : Transformation context file path in cloud storage used to populate data used in transformations performed during migrations Eg: The shard id to db name to identify the db from which a row was migrated. -* **directoryWatchDurationInMinutes** : The Duration for which the pipeline should keep polling a directory in GCS. Datastreamoutput files are arranged in a directory structure which depicts the timestamp of the event grouped by minutes. This parameter should be approximately equal tomaximum delay which could occur between event occurring in source database and the same event being written to GCS by Datastream. 99.9 percentile = 10 minutes. Defaults to: 10. -* **spannerPriority** : The request priority for Cloud Spanner calls. The value must be one of: [HIGH,MEDIUM,LOW]. Defaults to HIGH. -* **dlqGcsPubSubSubscription** : The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ retry directory when running in regular mode. The name should be in the format of projects//subscriptions/. When set, the deadLetterQueueDirectory and dlqRetryMinutes are ignored. -* **transformationJarPath** : Custom jar location in Cloud Storage that contains the custom transformation logic for processing records in forward migration. Defaults to empty. -* **transformationClassName** : Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. -* **transformationCustomParameters** : String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. -* **filteredEventsDirectory** : This is the file path to store the events filtered via custom transformation. Default is a directory under the Dataflow job's temp location. The default value is enough under most conditions. 
-* **shardingContextFilePath** : Sharding context file path in cloud storage is used to populate the shard id in spanner database for each source shard.It is of the format Map>. -* **tableOverrides** : These are the table name overrides from source to spanner. They are written in thefollowing format: [{SourceTableName1, SpannerTableName1}, {SourceTableName2, SpannerTableName2}]This example shows mapping Singers table to Vocalists and Albums table to Records. (Example: [{Singers, Vocalists}, {Albums, Records}]). Defaults to empty. -* **columnOverrides** : These are the column name overrides from source to spanner. They are written in thefollowing format: [{SourceTableName1.SourceColumnName1, SourceTableName1.SpannerColumnName1}, {SourceTableName2.SourceColumnName1, SourceTableName2.SpannerColumnName1}]Note that the SourceTableName should remain the same in both the source and spanner pair. To override table names, use tableOverrides.The example shows mapping SingerName to TalentName and AlbumName to RecordName in Singers and Albums table respectively. (Example: [{Singers.SingerName, Singers.TalentName}, {Albums.AlbumName, Albums.RecordName}]). Defaults to empty. -* **schemaOverridesFilePath** : A file which specifies the table and the column name overrides from source to spanner. Defaults to empty. +* **inputFilePattern**: The Cloud Storage file location that contains the Datastream files to replicate. Typically, this is the root path for a stream. Support for this feature has been disabled. +* **inputFileFormat**: The format of the output file produced by Datastream. For example `avro,json`. Defaults to `avro`. +* **sessionFilePath**: Session file path in Cloud Storage that contains mapping information from HarbourBridge. +* **projectId**: The Spanner project ID. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. 
+* **gcsPubSubSubscription**: The Pub/Sub subscription being used in a Cloud Storage notification policy. For the name, use the format `projects//subscriptions/`. +* **streamName**: The name or template for the stream to poll for schema information and source type. +* **shadowTablePrefix**: The prefix used to name shadow tables. Default: `shadow_`. +* **shouldCreateShadowTables**: This flag indicates whether shadow tables must be created in Cloud Spanner database. Defaults to: true. +* **rfcStartDateTime**: The starting DateTime used to fetch from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. +* **fileReadConcurrency**: The number of concurrent DataStream files to read. Defaults to: 30. +* **deadLetterQueueDirectory**: The file path used when storing the error queue output. The default file path is a directory under the Dataflow job's temp location. +* **dlqRetryMinutes**: The number of minutes between dead letter queue retries. Defaults to `10`. +* **dlqMaxRetryCount**: The max number of times temporary errors can be retried through DLQ. Defaults to `500`. +* **dataStreamRootUrl**: Datastream API Root URL. Defaults to: https://datastream.googleapis.com/. +* **datastreamSourceType**: This is the type of source database that Datastream connects to. Example - mysql/oracle. Need to be set when testing without an actual running Datastream. +* **roundJsonDecimals**: This flag if set, rounds the decimal values in json columns to a number that can be stored without loss of precision. Defaults to: false. +* **runMode**: This is the run mode type, whether regular or with retryDLQ. Defaults to: regular. +* **transformationContextFilePath**: Transformation context file path in cloud storage used to populate data used in transformations performed during migrations Eg: The shard id to db name to identify the db from which a row was migrated. 
+* **directoryWatchDurationInMinutes**: The Duration for which the pipeline should keep polling a directory in GCS. Datastream output files are arranged in a directory structure which depicts the timestamp of the event grouped by minutes. This parameter should be approximately equal to the maximum delay which could occur between event occurring in source database and the same event being written to GCS by Datastream. 99.9 percentile = 10 minutes. Defaults to: 10. +* **spannerPriority**: The request priority for Cloud Spanner calls. The value must be one of: [`HIGH`,`MEDIUM`,`LOW`]. Defaults to `HIGH`. +* **dlqGcsPubSubSubscription**: The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ retry directory when running in regular mode. For the name, use the format `projects//subscriptions/`. When set, the deadLetterQueueDirectory and dlqRetryMinutes are ignored. +* **transformationJarPath**: Custom JAR file location in Cloud Storage for the file that contains the custom transformation logic for processing records in forward migration. Defaults to empty. +* **transformationClassName**: Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. +* **transformationCustomParameters**: String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. +* **filteredEventsDirectory**: This is the file path to store the events filtered via custom transformation. Default is a directory under the Dataflow job's temp location. The default value is enough under most conditions. +* **shardingContextFilePath**: Sharding context file path in cloud storage is used to populate the shard id in spanner database for each source shard. It is of the format Map>. +* **tableOverrides**: These are the table name overrides from source to spanner.
They are written in the following format: [{SourceTableName1, SpannerTableName1}, {SourceTableName2, SpannerTableName2}]. This example shows mapping Singers table to Vocalists and Albums table to Records. For example, `[{Singers, Vocalists}, {Albums, Records}]`. Defaults to empty. +* **columnOverrides**: These are the column name overrides from source to spanner. They are written in the following format: [{SourceTableName1.SourceColumnName1, SourceTableName1.SpannerColumnName1}, {SourceTableName2.SourceColumnName1, SourceTableName2.SpannerColumnName1}]. Note that the SourceTableName should remain the same in both the source and spanner pair. To override table names, use tableOverrides. The example shows mapping SingerName to TalentName and AlbumName to RecordName in Singers and Albums table respectively. For example, `[{Singers.SingerName, Singers.TalentName}, {Albums.AlbumName, Albums.RecordName}]`. Defaults to empty. +* **schemaOverridesFilePath**: A file which specifies the table and the column name overrides from source to spanner. Defaults to empty.
@@ -360,8 +360,8 @@ resource "google_dataflow_flex_template_job" "cloud_datastream_to_spanner" { # transformationCustomParameters = "" # filteredEventsDirectory = "" # shardingContextFilePath = "" - # tableOverrides = "[{Singers, Vocalists}, {Albums, Records}]" - # columnOverrides = "[{Singers.SingerName, Singers.TalentName}, {Albums.AlbumName, Albums.RecordName}]" + # tableOverrides = "" + # columnOverrides = "" # schemaOverridesFilePath = "" } } diff --git a/v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToSpanner.java b/v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToSpanner.java index 47ffe44837..d16fe44497 100644 --- a/v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToSpanner.java +++ b/v2/datastream-to-spanner/src/main/java/com/google/cloud/teleport/v2/templates/DataStreamToSpanner.java @@ -168,7 +168,7 @@ public interface Options optional = true, description = "Datastream output file format (avro/json).", helpText = - "The format of the output file produced by Datastream. For example `avro,json`. Default, `avro`.") + "The format of the output file produced by Datastream. For example `avro,json`. Defaults to `avro`.") @Default.String("avro") String getInputFileFormat(); @@ -230,9 +230,8 @@ public interface Options optional = true, description = "The Pub/Sub subscription being used in a Cloud Storage notification policy.", helpText = - "The Pub/Sub subscription being used in a Cloud Storage notification policy. The name" - + " should be in the format of" - + " projects//subscriptions/.") + "The Pub/Sub subscription being used in a Cloud Storage notification policy. 
For the name," + + " use the format `projects//subscriptions/`.") String getGcsPubSubSubscription(); void setGcsPubSubSubscription(String value); @@ -309,7 +308,7 @@ public interface Options order = 15, optional = true, description = "Dead letter queue retry minutes", - helpText = "The number of minutes between dead letter queue retries. Defaults to 10.") + helpText = "The number of minutes between dead letter queue retries. Defaults to `10`.") @Default.Integer(10) Integer getDlqRetryMinutes(); @@ -320,7 +319,7 @@ public interface Options optional = true, description = "Dead letter queue maximum retry count", helpText = - "The max number of times temporary errors can be retried through DLQ. Defaults to 500.") + "The max number of times temporary errors can be retried through DLQ. Defaults to `500`.") @Default.Integer(500) Integer getDlqMaxRetryCount(); @@ -412,7 +411,7 @@ public interface Options description = "Priority for Spanner RPC invocations", helpText = "The request priority for Cloud Spanner calls. The value must be one of:" - + " [HIGH,MEDIUM,LOW]. Defaults to HIGH") + + " [`HIGH`,`MEDIUM`,`LOW`]. Defaults to `HIGH`.") @Default.Enum("HIGH") RpcPriority getSpannerPriority(); @@ -426,8 +425,8 @@ public interface Options + " retry directory when running in regular mode.", helpText = "The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ" - + " retry directory when running in regular mode. The name should be in the format" - + " of projects//subscriptions/. When set, the" + + " retry directory when running in regular mode. For the name, use the format" + + " `projects//subscriptions/`. 
When set, the" + " deadLetterQueueDirectory and dlqRetryMinutes are ignored.") String getDlqGcsPubSubSubscription(); @@ -438,7 +437,7 @@ public interface Options optional = true, description = "Custom jar location in Cloud Storage", helpText = - "Custom jar location in Cloud Storage that contains the custom transformation logic for processing records" + "Custom JAR file location in Cloud Storage for the file that contains the custom transformation logic for processing records" + " in forward migration.") @Default.String("") String getTransformationJarPath(); diff --git a/v2/datastream-to-sql/README_Cloud_Datastream_to_SQL.md b/v2/datastream-to-sql/README_Cloud_Datastream_to_SQL.md index d8ad3a9ab2..97fb5274a9 100644 --- a/v2/datastream-to-sql/README_Cloud_Datastream_to_SQL.md +++ b/v2/datastream-to-sql/README_Cloud_Datastream_to_SQL.md @@ -37,23 +37,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The file location for the Datastream files in Cloud Storage to replicate. This file location is typically the root path for the stream. -* **databaseHost** : The SQL host to connect on. -* **databaseUser** : The SQL user with all required permissions to write to all tables in replication. -* **databasePassword** : The password for the SQL user. +* **inputFilePattern**: The file location for the Datastream files in Cloud Storage to replicate. This file location is typically the root path for the stream. +* **databaseHost**: The SQL host to connect on. +* **databaseUser**: The SQL user with all required permissions to write to all tables in replication. +* **databasePassword**: The password for the SQL user. ### Optional parameters -* **gcsPubSubSubscription** : The Pub/Sub subscription with Datastream file notifications. For example, `projects//subscriptions/`. -* **inputFileFormat** : The format of the output file produced by Datastream. For example, `avro` or `json`. 
Defaults to `avro`. -* **streamName** : The name or template for the stream to poll for schema information. The default value is `{_metadata_stream}`. -* **rfcStartDateTime** : The starting DateTime used to fetch from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. -* **dataStreamRootUrl** : Datastream API Root URL. Defaults to: https://datastream.googleapis.com/. -* **databaseType** : The database type to write to (for example, Postgres). Defaults to: postgres. -* **databasePort** : The SQL database port to connect to. The default value is `5432`. -* **databaseName** : The name of the SQL database to connect to. The default value is `postgres`. -* **schemaMap** : A map of key/values used to dictate schema name changes (ie. old_name:new_name,CaseError:case_error). Defaults to empty. -* **customConnectionString** : Optional connection string which will be used instead of the default database string. +* **gcsPubSubSubscription**: The Pub/Sub subscription with Datastream file notifications. For example, `projects//subscriptions/`. +* **inputFileFormat**: The format of the output file produced by Datastream. For example, `avro` or `json`. Defaults to `avro`. +* **streamName**: The name or template for the stream to poll for schema information. The default value is `{_metadata_stream}`. +* **rfcStartDateTime**: The starting DateTime used to fetch from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. +* **dataStreamRootUrl**: Datastream API Root URL. Defaults to: https://datastream.googleapis.com/. +* **databaseType**: The database type to write to (for example, Postgres). Defaults to: postgres. +* **databasePort**: The SQL database port to connect to. The default value is `5432`. +* **databaseName**: The name of the SQL database to connect to. The default value is `postgres`. +* **schemaMap**: A map of key/values used to dictate schema name changes (ie. 
old_name:new_name,CaseError:case_error). Defaults to empty. +* **customConnectionString**: Optional connection string which will be used instead of the default database string. diff --git a/v2/elasticsearch-common/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/ElasticsearchWriteOptions.java b/v2/elasticsearch-common/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/ElasticsearchWriteOptions.java index 046a3a072c..3354554621 100644 --- a/v2/elasticsearch-common/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/ElasticsearchWriteOptions.java +++ b/v2/elasticsearch-common/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/ElasticsearchWriteOptions.java @@ -30,7 +30,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { groupName = "Target", description = "Elasticsearch URL or CloudID if using Elastic Cloud", helpText = - "The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID.", + "The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID.", example = "https://elasticsearch-host:9200") @Validation.Required String getConnectionUrl(); @@ -51,7 +51,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Username for Elasticsearch endpoint", helpText = - "The Elasticsearch username to authenticate with. If specified, the value of 'apiKey' is ignored") + "The Elasticsearch username to authenticate with. If specified, the value of `apiKey` is ignored") String getElasticsearchUsername(); void setElasticsearchUsername(String elasticsearchUsername); @@ -61,7 +61,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Password for Elasticsearch endpoint", helpText = - "The Elasticsearch password to authenticate with. 
If specified, the value of 'apiKey' is ignored.") + "The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored.") String getElasticsearchPassword(); void setElasticsearchPassword(String elasticsearchPassword); @@ -71,7 +71,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = false, regexes = {"[a-zA-Z0-9._-]+"}, description = "Elasticsearch index", - helpText = "The Elasticsearch index that the requests are issued to, such as `my-index.`", + helpText = "The Elasticsearch index that the requests are issued to.", example = "my-index") String getIndex(); @@ -81,7 +81,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { order = 6, optional = true, description = "Batch Size", - helpText = "The batch size in number of documents. Defaults to: 1000.") + helpText = "The batch size in number of documents. Defaults to `1000`.") @Default.Long(1000) Long getBatchSize(); @@ -91,7 +91,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { order = 7, optional = true, description = "Batch Size in Bytes", - helpText = "The batch size in number of bytes. Defaults to: 5242880 (5mb).") + helpText = "The batch size in number of bytes. Defaults to `5242880` (5mb).") @Default.Long(5242880) Long getBatchSizeBytes(); @@ -102,7 +102,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Max retry attempts.", helpText = - "The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries.") + "The maximum number of retry attempts. Must be greater than zero. Defaults to `no retries`.") Integer getMaxRetryAttempts(); void setMaxRetryAttempts(Integer maxRetryAttempts); @@ -112,7 +112,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Max retry duration.", helpText = - "The maximum retry duration in milliseconds. Must be greater than zero. 
Defaults to: no retries.") + "The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`.") Long getMaxRetryDuration(); void setMaxRetryDuration(Long maxRetryDuration); @@ -122,7 +122,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Document property to specify _index metadata", helpText = - "The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none.") + "The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`.") String getPropertyAsIndex(); void setPropertyAsIndex(String propertyAsIndex); @@ -132,7 +132,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Cloud Storage path to JavaScript UDF source for _index metadata", helpText = - "The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none.") + "The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`.") String getJavaScriptIndexFnGcsPath(); void setJavaScriptIndexFnGcsPath(String javaScriptTextTransformGcsPath); @@ -142,7 +142,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "UDF JavaScript Function Name for _index metadata", helpText = - "The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none.") + "The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. 
Defaults to `none`.") String getJavaScriptIndexFnName(); void setJavaScriptIndexFnName(String javaScriptTextTransformFunctionName); @@ -152,7 +152,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Document property to specify _id metadata", helpText = - "A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none.") + "A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`.") String getPropertyAsId(); void setPropertyAsId(String propertyAsId); @@ -162,7 +162,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Cloud Storage path to JavaScript UDF source for _id metadata", helpText = - "The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none.") + "The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`.") String getJavaScriptIdFnGcsPath(); void setJavaScriptIdFnGcsPath(String javaScriptTextTransformGcsPath); @@ -172,7 +172,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "UDF JavaScript Function Name for _id metadata", helpText = - "The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none.") + "The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. 
Defaults to `none`.") String getJavaScriptIdFnName(); void setJavaScriptIdFnName(String javaScriptTextTransformFunctionName); @@ -182,7 +182,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Cloud Storage path to JavaScript UDF source for _type metadata", helpText = - "The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none.") + "The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`.") String getJavaScriptTypeFnGcsPath(); void setJavaScriptTypeFnGcsPath(String javaScriptTextTransformGcsPath); @@ -192,7 +192,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "UDF JavaScript Function Name for _type metadata", helpText = - "The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none.") + "The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`.") String getJavaScriptTypeFnName(); void setJavaScriptTypeFnName(String javaScriptTextTransformFunctionName); @@ -202,7 +202,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Cloud Storage path to JavaScript UDF source for isDelete function", helpText = - "The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none.") + "The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. 
Defaults to `none`.") String getJavaScriptIsDeleteFnGcsPath(); void setJavaScriptIsDeleteFnGcsPath(String javaScriptTextTransformGcsPath); @@ -212,7 +212,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "UDF JavaScript Function Name for isDelete", helpText = - "The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none.") + "The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`.") String getJavaScriptIsDeleteFnName(); void setJavaScriptIsDeleteFnName(String javaScriptTextTransformFunctionName); @@ -222,7 +222,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Use partial updates", helpText = - "Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false.") + "Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`.") @Default.Boolean(false) Boolean getUsePartialUpdate(); @@ -234,7 +234,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Build insert method", helpText = - "Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE.") + "Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. 
Defaults to `CREATE`.") @Default.Enum("CREATE") BulkInsertMethodOptions getBulkInsertMethod(); @@ -245,7 +245,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Trust self-signed certificate", helpText = - "Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False)") + "Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to true to by-pass the validation on SSL certificate. (Defaults to: `false`)") @Default.Boolean(false) Boolean getTrustSelfSignedCerts(); @@ -256,8 +256,8 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { optional = true, description = "Disable SSL certificate validation.", helpText = - "If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a " - + "self-signed certificate. To bypass validation for the certificate, set this parameter to 'true'. Default: false.") + "If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a " + + "self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`.") @Default.Boolean(false) Boolean getDisableCertificateValidation(); @@ -270,11 +270,11 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { parentTriggerValues = {"KMS"}, description = "Google Cloud KMS encryption key for the API key", helpText = - "The Cloud KMS key to decrypt the API key. This parameter must be " - + "provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey " - + "string should be passed in encrypted. Encrypt parameters using the KMS API encrypt " - + "endpoint. 
The Key should be in the format " - + "projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. " + "The Cloud KMS key to decrypt the API key. This parameter is required " + "if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string." + " Encrypt parameters using the KMS API encrypt " + "endpoint. For the key, use the format " + "`projects//locations//keyRings//cryptoKeys/`. " + "See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt ", example = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name") @@ -290,7 +290,7 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { regexes = {"^projects\\/[^\\n\\r\\/]+\\/secrets\\/[^\\n\\r\\/]+\\/versions\\/[^\\n\\r\\/]+$"}, description = "Google Cloud Secret Manager ID.", helpText = - "Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}.", + "The Secret Manager secret ID for the apiKey. If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects//secrets//versions/`.", example = "projects/your-project-id/secrets/your-secret/versions/your-secret-version") String getApiKeySecretId(); @@ -306,11 +306,11 @@ public interface ElasticsearchWriteOptions extends PipelineOptions { }, description = "Source of the API key passed. One of PLAINTEXT, KMS or SECRET_MANAGER.", helpText = - "Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter " - + "must be provided if secret manager or KMS is used. If apiKeySource is set to KMS, " - + "apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to " - + "SECRET_MANAGER, apiKeySecretId must be provided.
If apiKeySource is set to PLAINTEXT, " - + "apiKey must be provided.") + "The source of the API key. Allowed values are `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. This parameter " + "is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, " + "`apiKeyKMSEncryptionKey` and encrypted apiKey must be provided. If `apiKeySource` is set to " + "`SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, " + "`apiKey` must be provided.") @Default.String("PLAINTEXT") String getApiKeySource(); diff --git a/v2/file-format-conversion/README_File_Format_Conversion.md b/v2/file-format-conversion/README_File_Format_Conversion.md index 97ad5a9114..30cb70305d 100644 --- a/v2/file-format-conversion/README_File_Format_Conversion.md +++ b/v2/file-format-conversion/README_File_Format_Conversion.md @@ -23,24 +23,24 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFileFormat** : File format of the input files. Needs to be either avro, parquet or csv. -* **outputFileFormat** : File format of the output files. Needs to be either avro or parquet. -* **inputFileSpec** : The Cloud Storage file pattern to search for CSV files. Example: gs://mybucket/test-*.csv. -* **outputBucket** : Cloud storage directory for writing output files. This value must end in a slash. (Example: gs://your-bucket/path/). -* **schema** : Cloud storage path to the avro schema file. (Example: gs://your-bucket/your-path/schema.avsc). +* **inputFileFormat**: File format of the input files. Needs to be either avro, parquet or csv. +* **outputFileFormat**: File format of the output files. Needs to be either avro or parquet. +* **inputFileSpec**: The Cloud Storage file pattern to search for CSV files. For example, `gs://mybucket/test-*.csv`. +* **outputBucket**: Cloud storage directory for writing output files. This value must end in a slash. For example, `gs://your-bucket/path/`.
+* **schema**: Cloud storage path to the avro schema file. For example, `gs://your-bucket/your-path/schema.avsc`. ### Optional parameters -* **containsHeaders** : Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. -* **deadletterTable** : Messages failed to reach the target for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. (Example: your-project:your-dataset.your-table-name). -* **delimiter** : The column delimiter of the input text files. Default: use delimiter provided in csvFormat (Example: ,). -* **csvFormat** : CSV format specification to use for parsing records. Default is: Default. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. -* **jsonSchemaPath** : The path to the JSON schema. Defaults to: null. (Example: gs://path/to/schema). -* **largeNumFiles** : Set to true if number of files is in the tens of thousands. Defaults to: false. -* **csvFileEncoding** : The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16. Defaults to: UTF-8. -* **logDetailedCsvConversionErrors** : Set to true to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: false. -* **numShards** : The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Default value is decided by Dataflow. -* **outputFilePrefix** : The prefix of the files to write to. Defaults to: output. +* **containsHeaders**: Input CSV files contain a header record (true/false). 
Only required if reading CSV files. Defaults to: false. +* **deadletterTable**: Messages failed to reach the target for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. For example, `your-project:your-dataset.your-table-name`. +* **delimiter**: The column delimiter of the input text files. Default: `,` For example, `,`. +* **csvFormat**: CSV format specification to use for parsing records. Default is: `Default`. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. +* **jsonSchemaPath**: The path to the JSON schema. Defaults to `null`. For example, `gs://path/to/schema`. +* **largeNumFiles**: Set to true if number of files is in the tens of thousands. Defaults to `false`. +* **csvFileEncoding**: The CSV file character encoding format. Allowed values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`. Defaults to: UTF-8. +* **logDetailedCsvConversionErrors**: Set to `true` to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: `false`. +* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Default value is decided by Dataflow. +* **outputFilePrefix**: The prefix of the files to write to. Defaults to: output. 
@@ -247,13 +247,13 @@ resource "google_dataflow_flex_template_job" "file_format_conversion" { inputFileFormat = "" outputFileFormat = "" inputFileSpec = "" - outputBucket = "gs://your-bucket/path/" - schema = "gs://your-bucket/your-path/schema.avsc" + outputBucket = "" + schema = "" # containsHeaders = "false" - # deadletterTable = "your-project:your-dataset.your-table-name" - # delimiter = "," + # deadletterTable = "" + # delimiter = "" # csvFormat = "Default" - # jsonSchemaPath = "gs://path/to/schema" + # jsonSchemaPath = "" # largeNumFiles = "false" # csvFileEncoding = "UTF-8" # logDetailedCsvConversionErrors = "false" diff --git a/v2/gcs-to-sourcedb/README_GCS_to_Sourcedb.md b/v2/gcs-to-sourcedb/README_GCS_to_Sourcedb.md index f9abc7a5ad..350ea25037 100644 --- a/v2/gcs-to-sourcedb/README_GCS_to_Sourcedb.md +++ b/v2/gcs-to-sourcedb/README_GCS_to_Sourcedb.md @@ -17,27 +17,27 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **sourceShardsFilePath** : Source shard details file path in Cloud Storage that contains connection profile of source shards. -* **sessionFilePath** : Session file path in Cloud Storage that contains mapping information from HarbourBridge. -* **GCSInputDirectoryPath** : Path from where to read the change stream files. -* **spannerProjectId** : This is the name of the Cloud Spanner project. -* **metadataInstance** : This is the instance to store the shard progress of the files processed. -* **metadataDatabase** : This is the database to store the shard progress of the files processed.. -* **runIdentifier** : The identifier to distinguish between different runs of reverse replication flows. +* **sourceShardsFilePath**: Source shard details file path in Cloud Storage that contains connection profile of source shards. +* **sessionFilePath**: Session file path in Cloud Storage that contains mapping information from HarbourBridge. 
+* **GCSInputDirectoryPath**: Path from where to read the change stream files. +* **spannerProjectId**: This is the name of the Cloud Spanner project. +* **metadataInstance**: This is the instance to store the shard progress of the files processed. +* **metadataDatabase**: This is the database to store the shard progress of the files processed. +* **runIdentifier**: The identifier to distinguish between different runs of reverse replication flows. ### Optional parameters -* **sourceType** : This is the type of source database. Currently only mysql is supported. Defaults to: mysql. -* **sourceDbTimezoneOffset** : This is the timezone offset from UTC for the source database. Example value: +10:00. Defaults to: +00:00. -* **timerIntervalInMilliSec** : Controls the time between successive polls to buffer and processing of the resultant records. Defaults to: 1. -* **startTimestamp** : Start time of file for all shards. If not provided, the value is taken from spanner_to_gcs_metadata. If provided, this takes precedence. To be given when running in regular run mode. -* **windowDuration** : The window duration/size in which data is written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). If not provided, the value is taken from spanner_to_gcs_metadata. If provided, this takes precedence. To be given when running in regular run mode. (Example: 5m). -* **runMode** : Regular writes to source db, reprocess does processing the specific shards marked as REPROCESS, resumeFailed does reprocess of all shards in error state, resumeSuccess continues processing shards in successful state, resumeAll continues processing all shards irrespective of state. Defaults to: regular. -* **metadataTableSuffix** : Suffix appended to the spanner_to_gcs_metadata and shard_file_create_progress metadata tables.Useful when doing multiple runs.Only alpha numeric and underscores are allowed. Defaults to empty. 
-* **transformationJarPath** : Custom jar location in Cloud Storage that contains the custom transformation logic for processing records in reverse replication. Defaults to empty. -* **transformationClassName** : Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. -* **transformationCustomParameters** : String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. -* **writeFilteredEventsToGcs** : This is a flag which if set to true will write filtered events from custom transformation to GCS. Defaults to: false. +* **sourceType**: This is the type of source database. Currently only mysql is supported. Defaults to: mysql. +* **sourceDbTimezoneOffset**: This is the timezone offset from UTC for the source database. Example value: +10:00. Defaults to: +00:00. +* **timerIntervalInMilliSec**: Controls the time between successive polls to buffer and processing of the resultant records. Defaults to: 1. +* **startTimestamp**: Start time of file for all shards. If not provided, the value is taken from spanner_to_gcs_metadata. If provided, this takes precedence. To be given when running in regular run mode. +* **windowDuration**: The window duration/size in which data is written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). If not provided, the value is taken from spanner_to_gcs_metadata. If provided, this takes precedence. To be given when running in regular run mode. For example, `5m`. +* **runMode**: Regular writes to source db, reprocess does processing the specific shards marked as REPROCESS, resumeFailed does reprocess of all shards in error state, resumeSuccess continues processing shards in successful state, resumeAll continues processing all shards irrespective of state. Defaults to: regular. 
+* **metadataTableSuffix**: Suffix appended to the spanner_to_gcs_metadata and shard_file_create_progress metadata tables. Useful when doing multiple runs. Only alphanumeric characters and underscores are allowed. Defaults to empty. +* **transformationJarPath**: Custom JAR file location in Cloud Storage for the file that contains the custom transformation logic for processing records in reverse replication. Defaults to empty. +* **transformationClassName**: Fully qualified class name for the class that contains the custom transformation logic. When `transformationJarPath` is specified, this field is required. Defaults to empty. +* **transformationCustomParameters**: The string that contains any custom parameters to pass to the custom transformation class. Defaults to empty. +* **writeFilteredEventsToGcs**: When set to `true`, writes filtered events from custom transformation to Cloud Storage. Defaults to: false. @@ -261,7 +261,7 @@ resource "google_dataflow_flex_template_job" "gcs_to_sourcedb" { # sourceDbTimezoneOffset = "+00:00" # timerIntervalInMilliSec = "1" # startTimestamp = "" - # windowDuration = "5m" + # windowDuration = "" # runMode = "regular" # metadataTableSuffix = "" # transformationJarPath = "" diff --git a/v2/gcs-to-sourcedb/src/main/java/com/google/cloud/teleport/v2/templates/GCSToSourceDb.java b/v2/gcs-to-sourcedb/src/main/java/com/google/cloud/teleport/v2/templates/GCSToSourceDb.java index 6066e57cb1..51395e5061 100644 --- a/v2/gcs-to-sourcedb/src/main/java/com/google/cloud/teleport/v2/templates/GCSToSourceDb.java +++ b/v2/gcs-to-sourcedb/src/main/java/com/google/cloud/teleport/v2/templates/GCSToSourceDb.java @@ -125,7 +125,7 @@ public interface Options extends PipelineOptions, StreamingOptions { order = 5, optional = true, description = - "Duration in mili seconds between calls to stateful timer processing.Defaults to 1" + "Duration in mili seconds between calls to stateful timer processing.Defaults to `1`" + " millisecond. 
", helpText = "Controls the time between successive polls to buffer and processing of the resultant" @@ -215,7 +215,7 @@ public interface Options extends PipelineOptions, StreamingOptions { }, description = "This type of run mode. Supported values are" - + " regular/reprocess/resumeSucess/resumeFailed/resumeAll. Defaults to regular. All" + + " regular/reprocess/resumeSucess/resumeFailed/resumeAll. Defaults to `regular`. All" + " run modes should have the same run identifier.", helpText = "Regular writes to source db, reprocess does processing the specific shards marked as" @@ -255,7 +255,7 @@ public interface Options extends PipelineOptions, StreamingOptions { optional = true, description = "Custom transformation jar location in Cloud Storage", helpText = - "Custom jar location in Cloud Storage that contains the custom transformation logic for processing records" + "Custom JAR file location in Cloud Storage for the file that contains the custom transformation logic for processing records" + " in reverse replication.") @Default.String("") String getTransformationJarPath(); @@ -267,8 +267,8 @@ public interface Options extends PipelineOptions, StreamingOptions { optional = true, description = "Custom class name for transformation", helpText = - "Fully qualified class name having the custom transformation logic. It is a" - + " mandatory field in case transformationJarPath is specified") + "Fully qualified class name for the class that contains the custom transformation logic. 
When" + + " `transformationJarPath` is specified, this field is required.") @Default.String("") String getTransformationClassName(); @@ -279,7 +279,7 @@ public interface Options extends PipelineOptions, StreamingOptions { optional = true, description = "Custom parameters for transformation", helpText = - "String containing any custom parameters to be passed to the custom transformation class.") + "The string that contains any custom parameters to pass to the custom transformation class.") @Default.String("") String getTransformationCustomParameters(); @@ -290,7 +290,7 @@ public interface Options extends PipelineOptions, StreamingOptions { optional = true, description = "Write filtered events to GCS", helpText = - "This is a flag which if set to true will write filtered events from custom transformation to GCS.") + "When set to `true`, writes filtered events from custom transformation to Cloud Storage.") @Default.Boolean(false) Boolean getWriteFilteredEventsToGcs(); diff --git a/v2/gcs-to-sourcedb/src/test/resources/GCSToSourceDbWithoutReaderIT/events.txt b/v2/gcs-to-sourcedb/src/test/resources/GCSToSourceDbWithoutReaderIT/events.txt index 19829a98ab..5d642b6cac 100644 --- a/v2/gcs-to-sourcedb/src/test/resources/GCSToSourceDbWithoutReaderIT/events.txt +++ b/v2/gcs-to-sourcedb/src/test/resources/GCSToSourceDbWithoutReaderIT/events.txt @@ -1,6 +1,6 @@ {"commitTimestamp":{"seconds":1719247984,"nanos":109263000},"serverTransactionId":"NzkxOTE5OTY4NTQzNjMzODY3Mg\u003d\u003d","recordSequence":"00000000","tableName":"Users","mods":[{"keysJson":"{\"id\":\"1\"}","oldValuesJson":"{}","newValuesJson":"{\"name\":\"FF\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} 
-{"commitTimestamp":{"seconds":1719846465,"nanos":372383000},"serverTransactionId":"Mjk0MTkyNDQ4NzkyMDcyMzcwOQ\u003d\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example2\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"YmluX2NvbHVtbg\u003d\u003d\",\"bit_column\":\"MQ\u003d\u003d\",\"blob_column\":\"YmxvYl9jb2x1bW4\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 2\",\"time_column\":\"410000\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"2\",\"year_column\":\"2024\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} -{"commitTimestamp":{"seconds":1719846503,"nanos":617919000},"serverTransactionId":"MTY0MDExNjY5NDAxMjEwMDA5Mzg\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example2\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"YmluX2NvbHVtbg\u003d\u003d\",\"bit_column\":\"MQ\u003d\u003d\",\"blob_column\":\"YmxvYl9jb2x1bW4\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 2\",\"time_column\":\"143000\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"2\",\"year_column\":\"2024\"}"}],"modType":"UPDATE","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} 
-{"commitTimestamp":{"seconds":1719846503,"nanos":617919000},"serverTransactionId":"MTY0MDExNjY5NDAxMjEwMDA5Mzg\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example2\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"YmluX2NvbHVtbg\u003d\u003d\",\"bit_column\":\"MQ\u003d\u003d\",\"blob_column\":\"YmxvYl9jb2x1bW4\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 2\",\"time_column\":\"143000\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"2\",\"year_column\":\"2024\"}"}],"modType":"DELETE","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} -{"commitTimestamp":{"seconds":1718795517,"nanos":877439000},"serverTransactionId":"MTI4NTc0NTY4OTYwMzkxMDgyNA\u003d\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example1\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"ZXhhbXBsZWJpbmFyeTE\u003d\",\"bit_column\":\"ZXhhbXBsZWJpdDE\u003d\",\"blob_column\":\"ZXhhbXBsZWJsb2Ix\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 1\",\"time_column\":\"143000\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"1\",\"year_column\":\"2024\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} 
-{"commitTimestamp":{"seconds":1718781240,"nanos":419055000},"serverTransactionId":"MTQxMTQ5MzExNTc0NTkyNzAzODM\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"12345\",\"binary_column\":\"U29tZSBiaW5hcnkgZGF0YQ\u003d\u003d\",\"bit_column\":\"U29tZSBiaW5hcnkgZGF0YQ\u003d\u003d\",\"blob_column\":\"U29tZSBiaW5hcnkgZGF0YQ\u003d\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"12345.67\",\"double_column\":123.456,\"enum_column\":\"1\",\"float_column\":123.45,\"int_column\":\"123\",\"text_column\":\"Sample text\",\"time_column\":\"143000\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"1\",\"year_column\":\"2024\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} \ No newline at end of file +{"commitTimestamp":{"seconds":1719846465,"nanos":372383000},"serverTransactionId":"Mjk0MTkyNDQ4NzkyMDcyMzcwOQ\u003d\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example2\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"YmluX2NvbHVtbg\u003d\u003d\",\"bit_column\":\"MQ\u003d\u003d\",\"blob_column\":\"YmxvYl9jb2x1bW4\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 2\",\"time_column\":\"14:30:00\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"2\",\"year_column\":\"2024\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} 
+{"commitTimestamp":{"seconds":1719846503,"nanos":617919000},"serverTransactionId":"MTY0MDExNjY5NDAxMjEwMDA5Mzg\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example2\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"YmluX2NvbHVtbg\u003d\u003d\",\"bit_column\":\"MQ\u003d\u003d\",\"blob_column\":\"YmxvYl9jb2x1bW4\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 2\",\"time_column\":\"14:30:00\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"2\",\"year_column\":\"2024\"}"}],"modType":"UPDATE","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} +{"commitTimestamp":{"seconds":1719846503,"nanos":617919000},"serverTransactionId":"MTY0MDExNjY5NDAxMjEwMDA5Mzg\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example2\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"YmluX2NvbHVtbg\u003d\u003d\",\"bit_column\":\"MQ\u003d\u003d\",\"blob_column\":\"YmxvYl9jb2x1bW4\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 2\",\"time_column\":\"14:30:00\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"2\",\"year_column\":\"2024\"}"}],"modType":"DELETE","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} 
+{"commitTimestamp":{"seconds":1718795517,"nanos":877439000},"serverTransactionId":"MTI4NTc0NTY4OTYwMzkxMDgyNA\u003d\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example1\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"1000\",\"binary_column\":\"ZXhhbXBsZWJpbmFyeTE\u003d\",\"bit_column\":\"ZXhhbXBsZWJpdDE\u003d\",\"blob_column\":\"ZXhhbXBsZWJsb2Ix\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"99999.99\",\"double_column\":123456.123,\"enum_column\":\"1\",\"float_column\":12345.67,\"int_column\":\"100\",\"text_column\":\"Sample text for entry 1\",\"time_column\":\"14:30:00\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"1\",\"year_column\":\"2024\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} +{"commitTimestamp":{"seconds":1718781240,"nanos":419055000},"serverTransactionId":"MTQxMTQ5MzExNTc0NTkyNzAzODM\u003d","recordSequence":"00000000","tableName":"AllDatatypeTransformation","mods":[{"keysJson":"{\"varchar_column\":\"example\"}","oldValuesJson":"{}","newValuesJson":"{\"bigint_column\":\"12345\",\"binary_column\":\"U29tZSBiaW5hcnkgZGF0YQ\u003d\u003d\",\"bit_column\":\"U29tZSBiaW5hcnkgZGF0YQ\u003d\u003d\",\"blob_column\":\"U29tZSBiaW5hcnkgZGF0YQ\u003d\u003d\",\"bool_column\":true,\"date_column\":\"2024-01-01\",\"datetime_column\":\"2024-01-01T12:34:56Z\",\"decimal_column\":\"12345.67\",\"double_column\":123.456,\"enum_column\":\"1\",\"float_column\":123.45,\"int_column\":\"123\",\"text_column\":\"Sample text\",\"time_column\":\"14:30:00\",\"timestamp_column\":\"2024-01-01T12:34:56Z\",\"tinyint_column\":\"1\",\"year_column\":\"2024\"}"}],"modType":"INSERT","numberOfRecordsInTransaction":1,"transactionTag":"","shard":"ls1"} \ No newline at end of file diff --git a/v2/google-ads-to-googlecloud/README_Google_Ads_to_BigQuery.md 
b/v2/google-ads-to-googlecloud/README_Google_Ads_to_BigQuery.md index 6315ac03b0..256dcb4c99 100644 --- a/v2/google-ads-to-googlecloud/README_Google_Ads_to_BigQuery.md +++ b/v2/google-ads-to-googlecloud/README_Google_Ads_to_BigQuery.md @@ -17,21 +17,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **customerIds** : A list of Google Ads account IDs to use to execute the query. (Example: 12345,67890). -* **query** : The query to use to get the data. See Google Ads Query Language. For example: `SELECT campaign.id, campaign.name FROM campaign`. (Example: SELECT campaign.id, campaign.name FROM campaign). -* **qpsPerWorker** : The rate of query requests per second (QPS) to submit to Google Ads. Divide the desired per pipeline QPS by the maximum number of workers. Avoid exceeding per-account or developer token limits. See Rate Limits (https://developers.google.com/google-ads/api/docs/best-practices/rate-limits). -* **googleAdsClientId** : The OAuth 2.0 client ID that identifies the application. See Create a client ID and client secret (https://developers.google.com/google-ads/api/docs/oauth/cloud-project#create_a_client_id_and_client_secret). -* **googleAdsClientSecret** : The OAuth 2.0 client secret that corresponds to the specified client ID. See Create a client ID and client secret (https://developers.google.com/google-ads/api/docs/oauth/cloud-project#create_a_client_id_and_client_secret). -* **googleAdsRefreshToken** : The OAuth 2.0 refresh token to use to connect to the Google Ads API. See 2-Step Verification (https://developers.google.com/google-ads/api/docs/oauth/2sv). -* **googleAdsDeveloperToken** : The Google Ads developer token to use to connect to the Google Ads API. See Obtain a developer token (https://developers.google.com/google-ads/api/docs/get-started/dev-token). -* **outputTableSpec** : The BigQuery output table location to write the output to. 
For example, `<PROJECT_ID>:<DATASET_NAME>.<TABLE_NAME>`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. +* **customerIds**: A list of Google Ads account IDs to use to execute the query. For example, `12345,67890`. +* **query**: The query to use to get the data. See Google Ads Query Language (https://developers.google.com/google-ads/api/docs/query/overview). For example, `SELECT campaign.id, campaign.name FROM campaign`. +* **qpsPerWorker**: The rate of query requests per second (QPS) to submit to Google Ads. Divide the desired per pipeline QPS by the maximum number of workers. Avoid exceeding per-account or developer token limits. See Rate Limits (https://developers.google.com/google-ads/api/docs/best-practices/rate-limits). +* **googleAdsClientId**: The OAuth 2.0 client ID that identifies the application. See Create a client ID and client secret (https://developers.google.com/google-ads/api/docs/oauth/cloud-project#create_a_client_id_and_client_secret). +* **googleAdsClientSecret**: The OAuth 2.0 client secret that corresponds to the specified client ID. See Create a client ID and client secret (https://developers.google.com/google-ads/api/docs/oauth/cloud-project#create_a_client_id_and_client_secret). +* **googleAdsRefreshToken**: The OAuth 2.0 refresh token to use to connect to the Google Ads API. See 2-Step Verification (https://developers.google.com/google-ads/api/docs/oauth/2sv). +* **googleAdsDeveloperToken**: The Google Ads developer token to use to connect to the Google Ads API. See Obtain a developer token (https://developers.google.com/google-ads/api/docs/get-started/dev-token). +* **outputTableSpec**: The BigQuery output table location to write the output to. For example, `<PROJECT_ID>:<DATASET_NAME>.<TABLE_NAME>`. Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. 
### Optional parameters -* **loginCustomerId** : A Google Ads manager account ID to use to access the account IDs. (Example: 12345). -* **bigQueryTableSchemaPath** : The Cloud Storage path to the BigQuery schema JSON file. If this value is not set, then the schema is inferred from the Proto schema. (Example: gs://MyBucket/bq_schema.json). -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **loginCustomerId**: A Google Ads manager account ID to use to access the account IDs. For example, `12345`. +* **bigQueryTableSchemaPath**: The Cloud Storage path to the BigQuery schema JSON file. If this value is not set, then the schema is inferred from the Proto schema. For example, `gs://MyBucket/bq_schema.json`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. 
@@ -226,16 +226,16 @@ resource "google_dataflow_flex_template_job" "google_ads_to_bigquery" { name = "google-ads-to-bigquery" region = var.region parameters = { - customerIds = "12345,67890" - query = "SELECT campaign.id, campaign.name FROM campaign" + customerIds = "" + query = "" qpsPerWorker = "" googleAdsClientId = "" googleAdsClientSecret = "" googleAdsRefreshToken = "" googleAdsDeveloperToken = "" outputTableSpec = "" - # loginCustomerId = "12345" - # bigQueryTableSchemaPath = "gs://MyBucket/bq_schema.json" + # loginCustomerId = "" + # bigQueryTableSchemaPath = "" # writeDisposition = "WRITE_APPEND" # createDisposition = "CREATE_IF_NEEDED" } diff --git a/v2/google-ads-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/GoogleAdsToBigQuery.java b/v2/google-ads-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/GoogleAdsToBigQuery.java index f6ece31eab..8f49e27329 100644 --- a/v2/google-ads-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/GoogleAdsToBigQuery.java +++ b/v2/google-ads-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/GoogleAdsToBigQuery.java @@ -100,7 +100,7 @@ public interface GoogleAdsToBigQueryOptions extends WriteOptions, GoogleAdsOptio order = 3, description = "Google Ads Query Language query", helpText = - "The query to use to get the data. See Google Ads Query Language. For example: `SELECT campaign.id, campaign.name FROM campaign`.", + "The query to use to get the data. 
See Google Ads Query Language (https://developers.google.com/google-ads/api/docs/query/overview).", example = "SELECT campaign.id, campaign.name FROM campaign") @Validation.Required String getQuery(); diff --git a/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch.md b/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch.md index 316da150d6..59f53dbd2d 100644 --- a/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch.md +++ b/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch.md @@ -18,44 +18,45 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionUrl** : The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID. (Example: https://elasticsearch-host:9200). -* **apiKey** : The Base64-encoded API key to use for authentication. -* **index** : The Elasticsearch index that the requests are issued to, such as `my-index.` (Example: my-index). +* **connectionUrl**: The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID. For example, `https://elasticsearch-host:9200`. +* **apiKey**: The Base64-encoded API key to use for authentication. +* **index**: The Elasticsearch index that the requests are issued to. For example, `my-index`. ### Optional parameters -* **inputTableSpec** : The BigQuery table to read from. Format: `projectId:datasetId.tablename`. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage). For information about limitations in the Storage Read API, see https://cloud.google.com/bigquery/docs/reference/storage#limitations. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. (Example: bigquery-project:dataset.input_table). 
-* **outputDeadletterTable** : The BigQuery table for messages that failed to reach the output table, in the format :.. If a table doesn't exist, is is created during pipeline execution. If not specified, `_error_records` is used. (Example: your-project-id:your-dataset.your-table-name). -* **query** : The SQL query to use to read data from BigQuery. If the BigQuery dataset is in a different project than the Dataflow job, specify the full dataset name in the SQL query, for example: ... By default, the `query` parameter uses GoogleSQL (https://cloud.google.com/bigquery/docs/introduction-sql), unless `useLegacySql` is `true`. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. (Example: select * from sampledb.sample_table). -* **useLegacySql** : Set to true to use legacy SQL. This parameter only applies when using the `query` parameter. Defaults to: false. -* **queryLocation** : Needed when reading from an authorized view without underlying table's permission. (Example: US). -* **queryTempDataset** : With this option, you can set an existing dataset to create the temporary table to store the results of the query. (Example: temp_dataset). -* **elasticsearchUsername** : The Elasticsearch username to authenticate with. If specified, the value of 'apiKey' is ignored. -* **elasticsearchPassword** : The Elasticsearch password to authenticate with. If specified, the value of 'apiKey' is ignored. -* **batchSize** : The batch size in number of documents. Defaults to: 1000. -* **batchSizeBytes** : The batch size in number of bytes. Defaults to: 5242880 (5mb). -* **maxRetryAttempts** : The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries. -* **maxRetryDuration** : The maximum retry duration in milliseconds. Must be greater than zero. Defaults to: no retries. 
-* **propertyAsIndex** : The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none. -* **javaScriptIndexFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIndexFnName** : The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **propertyAsId** : A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none. -* **javaScriptIdFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIdFnName** : The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptTypeFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none. -* **javaScriptTypeFnName** : The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIsDeleteFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **javaScriptIsDeleteFnName** : The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. 
Defaults to: none. -* **usePartialUpdate** : Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false. -* **bulkInsertMethod** : Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE. -* **trustSelfSignedCerts** : Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False). -* **disableCertificateValidation** : If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to 'true'. Default: false. -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to decrypt the API key. This parameter must be provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter must be provided if secret manager or KMS is used. 
If apiKeySource is set to KMS, apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, apiKey must be provided. Defaults to: PLAINTEXT. -* **socketTimeout** : If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **inputTableSpec**: The BigQuery table to read from. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage). For information about limitations in the Storage Read API, see https://cloud.google.com/bigquery/docs/reference/storage#limitations. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. For example, `<BIGQUERY_PROJECT>:<DATASET_NAME>.<INPUT_TABLE>`. +* **outputDeadletterTable**: The BigQuery table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. For example, `<PROJECT_ID>:<DATASET_NAME>.<DEADLETTER_TABLE>`. +* **query**: The SQL query to use to read data from BigQuery. If the BigQuery dataset is in a different project than the Dataflow job, specify the full dataset name in the SQL query, for example: `<PROJECT_ID>.<DATASET_NAME>.<TABLE_NAME>`.
By default, the `query` parameter uses GoogleSQL (https://cloud.google.com/bigquery/docs/introduction-sql), unless `useLegacySql` is `true`. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. For example, `select * from sampledb.sample_table`. +* **useLegacySql**: Set to `true` to use legacy SQL. This parameter only applies when using the `query` parameter. Defaults to `false`. +* **queryLocation**: Needed when reading from an authorized view without underlying table's permission. For example, `US`. +* **queryTempDataset**: With this option, you can set an existing dataset to create the temporary table to store the results of the query. For example, `temp_dataset`. +* **KMSEncryptionKey**: If reading from BigQuery using query source, use this Cloud KMS key to encrypt any temporary tables created. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **elasticsearchUsername**: The Elasticsearch username to authenticate with. If specified, the value of `apiKey` is ignored. +* **elasticsearchPassword**: The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored. +* **batchSize**: The batch size in number of documents. Defaults to `1000`. +* **batchSizeBytes**: The batch size in number of bytes. Defaults to `5242880` (5mb). +* **maxRetryAttempts**: The maximum number of retry attempts. Must be greater than zero. Defaults to `no retries`. +* **maxRetryDuration**: The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`. +* **propertyAsIndex**: The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`. 
+* **javaScriptIndexFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIndexFnName**: The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **propertyAsId**: A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`. +* **javaScriptIdFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIdFnName**: The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptTypeFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`. +* **javaScriptTypeFnName**: The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIsDeleteFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **javaScriptIsDeleteFnName**: The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **usePartialUpdate**: Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`. 
+* **bulkInsertMethod**: Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to `CREATE`. +* **trustSelfSignedCerts**: Whether to trust self-signed certificates. An installed Elasticsearch instance might have a self-signed certificate. Set this parameter to `true` to bypass validation of the SSL certificate. Defaults to `false`. +* **disableCertificateValidation**: If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`. +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to decrypt the API key. This parameter is required if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string. Encrypt parameters using the KMS API encrypt endpoint. For the key, use the format `projects/<PROJECT_ID>/locations/<KEY_REGION>/keyRings/<KEY_RING>/cryptoKeys/<KMS_KEY_NAME>`. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt. For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the `apiKey`. If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects/<PROJECT_ID>/secrets/<SECRET_ID>/versions/<SECRET_VERSION>`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. Allowed values are `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. This parameter is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, `apiKeyKMSEncryptionKey` and an encrypted `apiKey` must be provided. If `apiKeySource` is set to `SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, `apiKey` must be provided. Defaults to `PLAINTEXT`.
+* **socketTimeout**: If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). ## User-Defined functions (UDFs) @@ -155,6 +156,7 @@ export QUERY= export USE_LEGACY_SQL=false export QUERY_LOCATION= export QUERY_TEMP_DATASET= +export KMSENCRYPTION_KEY= export ELASTICSEARCH_USERNAME= export ELASTICSEARCH_PASSWORD= export BATCH_SIZE=1000 @@ -192,6 +194,7 @@ gcloud dataflow flex-template run "bigquery-to-elasticsearch-job" \ --parameters "useLegacySql=$USE_LEGACY_SQL" \ --parameters "queryLocation=$QUERY_LOCATION" \ --parameters "queryTempDataset=$QUERY_TEMP_DATASET" \ + --parameters "KMSEncryptionKey=$KMSENCRYPTION_KEY" \ --parameters "connectionUrl=$CONNECTION_URL" \ --parameters "apiKey=$API_KEY" \ --parameters "elasticsearchUsername=$ELASTICSEARCH_USERNAME" \ @@ -250,6 +253,7 @@ export QUERY= export USE_LEGACY_SQL=false export QUERY_LOCATION= export QUERY_TEMP_DATASET= +export KMSENCRYPTION_KEY= export ELASTICSEARCH_USERNAME= export ELASTICSEARCH_PASSWORD= export BATCH_SIZE=1000 @@ -284,7 +288,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="bigquery-to-elasticsearch-job" \ -DtemplateName="BigQuery_to_Elasticsearch" \ 
--Dparameters="inputTableSpec=$INPUT_TABLE_SPEC,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,query=$QUERY,useLegacySql=$USE_LEGACY_SQL,queryLocation=$QUERY_LOCATION,queryTempDataset=$QUERY_TEMP_DATASET,connectionUrl=$CONNECTION_URL,apiKey=$API_KEY,elasticsearchUsername=$ELASTICSEARCH_USERNAME,elasticsearchPassword=$ELASTICSEARCH_PASSWORD,index=$INDEX,batchSize=$BATCH_SIZE,batchSizeBytes=$BATCH_SIZE_BYTES,maxRetryAttempts=$MAX_RETRY_ATTEMPTS,maxRetryDuration=$MAX_RETRY_DURATION,propertyAsIndex=$PROPERTY_AS_INDEX,javaScriptIndexFnGcsPath=$JAVA_SCRIPT_INDEX_FN_GCS_PATH,javaScriptIndexFnName=$JAVA_SCRIPT_INDEX_FN_NAME,propertyAsId=$PROPERTY_AS_ID,javaScriptIdFnGcsPath=$JAVA_SCRIPT_ID_FN_GCS_PATH,javaScriptIdFnName=$JAVA_SCRIPT_ID_FN_NAME,javaScriptTypeFnGcsPath=$JAVA_SCRIPT_TYPE_FN_GCS_PATH,javaScriptTypeFnName=$JAVA_SCRIPT_TYPE_FN_NAME,javaScriptIsDeleteFnGcsPath=$JAVA_SCRIPT_IS_DELETE_FN_GCS_PATH,javaScriptIsDeleteFnName=$JAVA_SCRIPT_IS_DELETE_FN_NAME,usePartialUpdate=$USE_PARTIAL_UPDATE,bulkInsertMethod=$BULK_INSERT_METHOD,trustSelfSignedCerts=$TRUST_SELF_SIGNED_CERTS,disableCertificateValidation=$DISABLE_CERTIFICATE_VALIDATION,apiKeyKMSEncryptionKey=$API_KEY_KMSENCRYPTION_KEY,apiKeySecretId=$API_KEY_SECRET_ID,apiKeySource=$API_KEY_SOURCE,socketTimeout=$SOCKET_TIMEOUT,javascriptTextTransformGcsPath=$JAVASCRIPT_TEXT_TRANSFORM_GCS_PATH,javascriptTextTransformFunctionName=$JAVASCRIPT_TEXT_TRANSFORM_FUNCTION_NAME" \ 
+-Dparameters="inputTableSpec=$INPUT_TABLE_SPEC,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,query=$QUERY,useLegacySql=$USE_LEGACY_SQL,queryLocation=$QUERY_LOCATION,queryTempDataset=$QUERY_TEMP_DATASET,KMSEncryptionKey=$KMSENCRYPTION_KEY,connectionUrl=$CONNECTION_URL,apiKey=$API_KEY,elasticsearchUsername=$ELASTICSEARCH_USERNAME,elasticsearchPassword=$ELASTICSEARCH_PASSWORD,index=$INDEX,batchSize=$BATCH_SIZE,batchSizeBytes=$BATCH_SIZE_BYTES,maxRetryAttempts=$MAX_RETRY_ATTEMPTS,maxRetryDuration=$MAX_RETRY_DURATION,propertyAsIndex=$PROPERTY_AS_INDEX,javaScriptIndexFnGcsPath=$JAVA_SCRIPT_INDEX_FN_GCS_PATH,javaScriptIndexFnName=$JAVA_SCRIPT_INDEX_FN_NAME,propertyAsId=$PROPERTY_AS_ID,javaScriptIdFnGcsPath=$JAVA_SCRIPT_ID_FN_GCS_PATH,javaScriptIdFnName=$JAVA_SCRIPT_ID_FN_NAME,javaScriptTypeFnGcsPath=$JAVA_SCRIPT_TYPE_FN_GCS_PATH,javaScriptTypeFnName=$JAVA_SCRIPT_TYPE_FN_NAME,javaScriptIsDeleteFnGcsPath=$JAVA_SCRIPT_IS_DELETE_FN_GCS_PATH,javaScriptIsDeleteFnName=$JAVA_SCRIPT_IS_DELETE_FN_NAME,usePartialUpdate=$USE_PARTIAL_UPDATE,bulkInsertMethod=$BULK_INSERT_METHOD,trustSelfSignedCerts=$TRUST_SELF_SIGNED_CERTS,disableCertificateValidation=$DISABLE_CERTIFICATE_VALIDATION,apiKeyKMSEncryptionKey=$API_KEY_KMSENCRYPTION_KEY,apiKeySecretId=$API_KEY_SECRET_ID,apiKeySource=$API_KEY_SOURCE,socketTimeout=$SOCKET_TIMEOUT,javascriptTextTransformGcsPath=$JAVASCRIPT_TEXT_TRANSFORM_GCS_PATH,javascriptTextTransformFunctionName=$JAVASCRIPT_TEXT_TRANSFORM_FUNCTION_NAME" \ -f v2/googlecloud-to-elasticsearch ``` @@ -329,15 +333,16 @@ resource "google_dataflow_flex_template_job" "bigquery_to_elasticsearch" { name = "bigquery-to-elasticsearch" region = var.region parameters = { - connectionUrl = "https://elasticsearch-host:9200" + connectionUrl = "" apiKey = "" - index = "my-index" - # inputTableSpec = "bigquery-project:dataset.input_table" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" - # query = "select * from sampledb.sample_table" + index = "" + # 
inputTableSpec = "" + # outputDeadletterTable = "" + # query = "" # useLegacySql = "false" - # queryLocation = "US" - # queryTempDataset = "temp_dataset" + # queryLocation = "" + # queryTempDataset = "" + # KMSEncryptionKey = "" # elasticsearchUsername = "" # elasticsearchPassword = "" # batchSize = "1000" @@ -358,11 +363,11 @@ resource "google_dataflow_flex_template_job" "bigquery_to_elasticsearch" { # bulkInsertMethod = "CREATE" # trustSelfSignedCerts = "false" # disableCertificateValidation = "false" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "PLAINTEXT" # socketTimeout = "" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" } } diff --git a/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch_Xlang.md b/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch_Xlang.md index 6a468c9e53..7c0c74fa65 100644 --- a/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch_Xlang.md +++ b/v2/googlecloud-to-elasticsearch/README_BigQuery_to_Elasticsearch_Xlang.md @@ -18,44 +18,45 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionUrl** : The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID. (Example: https://elasticsearch-host:9200). -* **apiKey** : The Base64-encoded API key to use for authentication. -* **index** : The Elasticsearch index that the requests are issued to, such as `my-index.` (Example: my-index). +* **connectionUrl**: The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID. 
For example, `https://elasticsearch-host:9200`. +* **apiKey**: The Base64-encoded API key to use for authentication. +* **index**: The Elasticsearch index that the requests are issued to. For example, `my-index`. ### Optional parameters -* **inputTableSpec** : The BigQuery table to read from. Format: `projectId:datasetId.tablename`. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage). For information about limitations in the Storage Read API, see https://cloud.google.com/bigquery/docs/reference/storage#limitations. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. (Example: bigquery-project:dataset.input_table). -* **outputDeadletterTable** : The BigQuery table for messages that failed to reach the output table, in the format :.. If a table doesn't exist, is is created during pipeline execution. If not specified, `_error_records` is used. (Example: your-project-id:your-dataset.your-table-name). -* **query** : The SQL query to use to read data from BigQuery. If the BigQuery dataset is in a different project than the Dataflow job, specify the full dataset name in the SQL query, for example: ... By default, the `query` parameter uses GoogleSQL (https://cloud.google.com/bigquery/docs/introduction-sql), unless `useLegacySql` is `true`. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. (Example: select * from sampledb.sample_table). -* **useLegacySql** : Set to true to use legacy SQL. This parameter only applies when using the `query` parameter. Defaults to: false. -* **queryLocation** : Needed when reading from an authorized view without underlying table's permission. (Example: US). 
-* **queryTempDataset** : With this option, you can set an existing dataset to create the temporary table to store the results of the query. (Example: temp_dataset). -* **elasticsearchUsername** : The Elasticsearch username to authenticate with. If specified, the value of 'apiKey' is ignored. -* **elasticsearchPassword** : The Elasticsearch password to authenticate with. If specified, the value of 'apiKey' is ignored. -* **batchSize** : The batch size in number of documents. Defaults to: 1000. -* **batchSizeBytes** : The batch size in number of bytes. Defaults to: 5242880 (5mb). -* **maxRetryAttempts** : The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries. -* **maxRetryDuration** : The maximum retry duration in milliseconds. Must be greater than zero. Defaults to: no retries. -* **propertyAsIndex** : The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none. -* **javaScriptIndexFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIndexFnName** : The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **propertyAsId** : A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none. -* **javaScriptIdFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIdFnName** : The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none. 
-* **javaScriptTypeFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none. -* **javaScriptTypeFnName** : The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIsDeleteFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **javaScriptIsDeleteFnName** : The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **usePartialUpdate** : Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false. -* **bulkInsertMethod** : Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE. -* **trustSelfSignedCerts** : Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False). -* **disableCertificateValidation** : If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to 'true'. Default: false. -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to decrypt the API key. This parameter must be provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. 
The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter must be provided if secret manager or KMS is used. If apiKeySource is set to KMS, apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, apiKey must be provided. Defaults to: PLAINTEXT. -* **socketTimeout** : If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). +* **inputTableSpec**: The BigQuery table to read from. If you specify `inputTableSpec`, the template reads the data directly from BigQuery storage by using the BigQuery Storage Read API (https://cloud.google.com/bigquery/docs/reference/storage). For information about limitations in the Storage Read API, see https://cloud.google.com/bigquery/docs/reference/storage#limitations. You must specify either `inputTableSpec` or `query`. 
If you set both parameters, the template uses the `query` parameter. For example, `<BIGQUERY_PROJECT>:<DATASET_NAME>.<INPUT_TABLE>`. +* **outputDeadletterTable**: The BigQuery table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. For example, `<PROJECT_ID>:<DATASET_NAME>.<DEADLETTER_TABLE>`. +* **query**: The SQL query to use to read data from BigQuery. If the BigQuery dataset is in a different project than the Dataflow job, specify the full dataset name in the SQL query, for example: `<PROJECT_ID>.<DATASET_NAME>.<TABLE_NAME>`. By default, the `query` parameter uses GoogleSQL (https://cloud.google.com/bigquery/docs/introduction-sql), unless `useLegacySql` is `true`. You must specify either `inputTableSpec` or `query`. If you set both parameters, the template uses the `query` parameter. For example, `select * from sampledb.sample_table`. +* **useLegacySql**: Set to `true` to use legacy SQL. This parameter only applies when using the `query` parameter. Defaults to `false`. +* **queryLocation**: Required when reading from an authorized view without permission on the underlying table. For example, `US`. +* **queryTempDataset**: With this option, you can set an existing dataset to create the temporary table to store the results of the query. For example, `temp_dataset`. +* **KMSEncryptionKey**: If reading from BigQuery using query source, use this Cloud KMS key to encrypt any temporary tables created. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **elasticsearchUsername**: The Elasticsearch username to authenticate with. If specified, the value of `apiKey` is ignored. +* **elasticsearchPassword**: The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored. +* **batchSize**: The batch size in number of documents. Defaults to `1000`. +* **batchSizeBytes**: The batch size in number of bytes. Defaults to `5242880` (5 MB). +* **maxRetryAttempts**: The maximum number of retry attempts.
Must be greater than zero. Defaults to `no retries`. +* **maxRetryDuration**: The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`. +* **propertyAsIndex**: The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`. +* **javaScriptIndexFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIndexFnName**: The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **propertyAsId**: A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`. +* **javaScriptIdFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIdFnName**: The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptTypeFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`. +* **javaScriptTypeFnName**: The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIsDeleteFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. 
+* **javaScriptIsDeleteFnName**: The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **usePartialUpdate**: Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`. +* **bulkInsertMethod**: Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to `CREATE`. +* **trustSelfSignedCerts**: Whether to trust self-signed certificates. An installed Elasticsearch instance might have a self-signed certificate. Set this parameter to `true` to bypass validation of the SSL certificate. Defaults to `false`. +* **disableCertificateValidation**: If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`. +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to decrypt the API key. This parameter is required if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string. Encrypt parameters using the KMS API encrypt endpoint. For the key, use the format `projects/<PROJECT_ID>/locations/<KEY_REGION>/keyRings/<KEY_RING>/cryptoKeys/<KMS_KEY_NAME>`. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt. For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the `apiKey`. If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects/<PROJECT_ID>/secrets/<SECRET_ID>/versions/<SECRET_VERSION>`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. Allowed values are `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`.
This parameter is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, `apiKeyKMSEncryptionKey` and an encrypted `apiKey` must be provided. If `apiKeySource` is set to `SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, `apiKey` must be provided. Defaults to `PLAINTEXT`. +* **socketTimeout**: If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `transform` or `transform_udf1`. @@ -145,6 +146,7 @@ export QUERY= export USE_LEGACY_SQL=false export QUERY_LOCATION= export QUERY_TEMP_DATASET= +export KMSENCRYPTION_KEY= export ELASTICSEARCH_USERNAME= export ELASTICSEARCH_PASSWORD= export BATCH_SIZE=1000 @@ -182,6 +184,7 @@ gcloud dataflow flex-template run "bigquery-to-elasticsearch-xlang-job" \ --parameters "useLegacySql=$USE_LEGACY_SQL" \ --parameters "queryLocation=$QUERY_LOCATION" \ --parameters "queryTempDataset=$QUERY_TEMP_DATASET" \ + --parameters "KMSEncryptionKey=$KMSENCRYPTION_KEY" \ --parameters "connectionUrl=$CONNECTION_URL" \ --parameters "apiKey=$API_KEY" \ --parameters "elasticsearchUsername=$ELASTICSEARCH_USERNAME" \ @@ -240,6 +243,7 @@ export QUERY= export USE_LEGACY_SQL=false export QUERY_LOCATION= export QUERY_TEMP_DATASET= +export KMSENCRYPTION_KEY= export ELASTICSEARCH_USERNAME= export ELASTICSEARCH_PASSWORD= export BATCH_SIZE=1000 @@ -274,7 +278,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="bigquery-to-elasticsearch-xlang-job" \ -DtemplateName="BigQuery_to_Elasticsearch_Xlang" \
--Dparameters="inputTableSpec=$INPUT_TABLE_SPEC,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,query=$QUERY,useLegacySql=$USE_LEGACY_SQL,queryLocation=$QUERY_LOCATION,queryTempDataset=$QUERY_TEMP_DATASET,connectionUrl=$CONNECTION_URL,apiKey=$API_KEY,elasticsearchUsername=$ELASTICSEARCH_USERNAME,elasticsearchPassword=$ELASTICSEARCH_PASSWORD,index=$INDEX,batchSize=$BATCH_SIZE,batchSizeBytes=$BATCH_SIZE_BYTES,maxRetryAttempts=$MAX_RETRY_ATTEMPTS,maxRetryDuration=$MAX_RETRY_DURATION,propertyAsIndex=$PROPERTY_AS_INDEX,javaScriptIndexFnGcsPath=$JAVA_SCRIPT_INDEX_FN_GCS_PATH,javaScriptIndexFnName=$JAVA_SCRIPT_INDEX_FN_NAME,propertyAsId=$PROPERTY_AS_ID,javaScriptIdFnGcsPath=$JAVA_SCRIPT_ID_FN_GCS_PATH,javaScriptIdFnName=$JAVA_SCRIPT_ID_FN_NAME,javaScriptTypeFnGcsPath=$JAVA_SCRIPT_TYPE_FN_GCS_PATH,javaScriptTypeFnName=$JAVA_SCRIPT_TYPE_FN_NAME,javaScriptIsDeleteFnGcsPath=$JAVA_SCRIPT_IS_DELETE_FN_GCS_PATH,javaScriptIsDeleteFnName=$JAVA_SCRIPT_IS_DELETE_FN_NAME,usePartialUpdate=$USE_PARTIAL_UPDATE,bulkInsertMethod=$BULK_INSERT_METHOD,trustSelfSignedCerts=$TRUST_SELF_SIGNED_CERTS,disableCertificateValidation=$DISABLE_CERTIFICATE_VALIDATION,apiKeyKMSEncryptionKey=$API_KEY_KMSENCRYPTION_KEY,apiKeySecretId=$API_KEY_SECRET_ID,apiKeySource=$API_KEY_SOURCE,socketTimeout=$SOCKET_TIMEOUT,pythonExternalTextTransformGcsPath=$PYTHON_EXTERNAL_TEXT_TRANSFORM_GCS_PATH,pythonExternalTextTransformFunctionName=$PYTHON_EXTERNAL_TEXT_TRANSFORM_FUNCTION_NAME" \ 
+-Dparameters="inputTableSpec=$INPUT_TABLE_SPEC,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,query=$QUERY,useLegacySql=$USE_LEGACY_SQL,queryLocation=$QUERY_LOCATION,queryTempDataset=$QUERY_TEMP_DATASET,KMSEncryptionKey=$KMSENCRYPTION_KEY,connectionUrl=$CONNECTION_URL,apiKey=$API_KEY,elasticsearchUsername=$ELASTICSEARCH_USERNAME,elasticsearchPassword=$ELASTICSEARCH_PASSWORD,index=$INDEX,batchSize=$BATCH_SIZE,batchSizeBytes=$BATCH_SIZE_BYTES,maxRetryAttempts=$MAX_RETRY_ATTEMPTS,maxRetryDuration=$MAX_RETRY_DURATION,propertyAsIndex=$PROPERTY_AS_INDEX,javaScriptIndexFnGcsPath=$JAVA_SCRIPT_INDEX_FN_GCS_PATH,javaScriptIndexFnName=$JAVA_SCRIPT_INDEX_FN_NAME,propertyAsId=$PROPERTY_AS_ID,javaScriptIdFnGcsPath=$JAVA_SCRIPT_ID_FN_GCS_PATH,javaScriptIdFnName=$JAVA_SCRIPT_ID_FN_NAME,javaScriptTypeFnGcsPath=$JAVA_SCRIPT_TYPE_FN_GCS_PATH,javaScriptTypeFnName=$JAVA_SCRIPT_TYPE_FN_NAME,javaScriptIsDeleteFnGcsPath=$JAVA_SCRIPT_IS_DELETE_FN_GCS_PATH,javaScriptIsDeleteFnName=$JAVA_SCRIPT_IS_DELETE_FN_NAME,usePartialUpdate=$USE_PARTIAL_UPDATE,bulkInsertMethod=$BULK_INSERT_METHOD,trustSelfSignedCerts=$TRUST_SELF_SIGNED_CERTS,disableCertificateValidation=$DISABLE_CERTIFICATE_VALIDATION,apiKeyKMSEncryptionKey=$API_KEY_KMSENCRYPTION_KEY,apiKeySecretId=$API_KEY_SECRET_ID,apiKeySource=$API_KEY_SOURCE,socketTimeout=$SOCKET_TIMEOUT,pythonExternalTextTransformGcsPath=$PYTHON_EXTERNAL_TEXT_TRANSFORM_GCS_PATH,pythonExternalTextTransformFunctionName=$PYTHON_EXTERNAL_TEXT_TRANSFORM_FUNCTION_NAME" \ -f v2/googlecloud-to-elasticsearch ``` @@ -319,15 +323,16 @@ resource "google_dataflow_flex_template_job" "bigquery_to_elasticsearch_xlang" { name = "bigquery-to-elasticsearch-xlang" region = var.region parameters = { - connectionUrl = "https://elasticsearch-host:9200" + connectionUrl = "" apiKey = "" - index = "my-index" - # inputTableSpec = "bigquery-project:dataset.input_table" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" - # query = "select * from sampledb.sample_table" 
+ index = "" + # inputTableSpec = "" + # outputDeadletterTable = "" + # query = "" # useLegacySql = "false" - # queryLocation = "US" - # queryTempDataset = "temp_dataset" + # queryLocation = "" + # queryTempDataset = "" + # KMSEncryptionKey = "" # elasticsearchUsername = "" # elasticsearchPassword = "" # batchSize = "1000" @@ -348,12 +353,12 @@ resource "google_dataflow_flex_template_job" "bigquery_to_elasticsearch_xlang" { # bulkInsertMethod = "CREATE" # trustSelfSignedCerts = "false" # disableCertificateValidation = "false" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "PLAINTEXT" # socketTimeout = "" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch.md b/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch.md index 3126a8d58a..66c19b9071 100644 --- a/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch.md +++ b/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch.md @@ -27,48 +27,48 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **deadletterTable** : The BigQuery dead-letter table to send failed inserts to. (Example: your-project:your-dataset.your-table-name). -* **inputFileSpec** : The Cloud Storage file pattern to search for CSV files. Example: gs://mybucket/test-*.csv. -* **connectionUrl** : The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID. (Example: https://elasticsearch-host:9200). 
-* **apiKey** : The Base64-encoded API key to use for authentication. -* **index** : The Elasticsearch index that the requests are issued to, such as `my-index.` (Example: my-index). +* **deadletterTable**: The BigQuery dead-letter table to send failed inserts to. For example, `your-project:your-dataset.your-table-name`. +* **inputFileSpec**: The Cloud Storage file pattern to search for CSV files. For example, `gs://mybucket/test-*.csv`. +* **connectionUrl**: The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID. For example, `https://elasticsearch-host:9200`. +* **apiKey**: The Base64-encoded API key to use for authentication. +* **index**: The Elasticsearch index that the requests are issued to. For example, `my-index`. ### Optional parameters -* **inputFormat** : Input file format. Default is: CSV. -* **containsHeaders** : Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. -* **delimiter** : The column delimiter of the input text files. Default: use delimiter provided in csvFormat (Example: ,). -* **csvFormat** : CSV format specification to use for parsing records. Default is: Default. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. -* **jsonSchemaPath** : The path to the JSON schema. Defaults to: null. (Example: gs://path/to/schema). -* **largeNumFiles** : Set to true if number of files is in the tens of thousands. Defaults to: false. -* **csvFileEncoding** : The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16. Defaults to: UTF-8. -* **logDetailedCsvConversionErrors** : Set to true to enable detailed error logging when CSV parsing fails. 
Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: false. -* **elasticsearchUsername** : The Elasticsearch username to authenticate with. If specified, the value of 'apiKey' is ignored. -* **elasticsearchPassword** : The Elasticsearch password to authenticate with. If specified, the value of 'apiKey' is ignored. -* **batchSize** : The batch size in number of documents. Defaults to: 1000. -* **batchSizeBytes** : The batch size in number of bytes. Defaults to: 5242880 (5mb). -* **maxRetryAttempts** : The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries. -* **maxRetryDuration** : The maximum retry duration in milliseconds. Must be greater than zero. Defaults to: no retries. -* **propertyAsIndex** : The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none. -* **javaScriptIndexFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIndexFnName** : The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **propertyAsId** : A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none. -* **javaScriptIdFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIdFnName** : The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none. 
-* **javaScriptTypeFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none. -* **javaScriptTypeFnName** : The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIsDeleteFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **javaScriptIsDeleteFnName** : The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **usePartialUpdate** : Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false. -* **bulkInsertMethod** : Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE. -* **trustSelfSignedCerts** : Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False). -* **disableCertificateValidation** : If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to 'true'. Default: false. -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to decrypt the API key. This parameter must be provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. 
The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter must be provided if secret manager or KMS is used. If apiKeySource is set to KMS, apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, apiKey must be provided. Defaults to: PLAINTEXT. -* **socketTimeout** : If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **inputFormat**: The input file format. Defaults to `CSV`. +* **containsHeaders**: Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. +* **delimiter**: The column delimiter of the input text files. 
Default: `,` For example, `,`. +* **csvFormat**: CSV format specification to use for parsing records. Default is: `Default`. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. +* **jsonSchemaPath**: The path to the JSON schema. Defaults to `null`. For example, `gs://path/to/schema`. +* **largeNumFiles**: Set to true if number of files is in the tens of thousands. Defaults to `false`. +* **csvFileEncoding**: The CSV file character encoding format. Allowed values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`. Defaults to: UTF-8. +* **logDetailedCsvConversionErrors**: Set to `true` to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: `false`. +* **elasticsearchUsername**: The Elasticsearch username to authenticate with. If specified, the value of `apiKey` is ignored. +* **elasticsearchPassword**: The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored. +* **batchSize**: The batch size in number of documents. Defaults to `1000`. +* **batchSizeBytes**: The batch size in number of bytes. Defaults to `5242880` (5mb). +* **maxRetryAttempts**: The maximum number of retry attempts. Must be greater than zero. Defaults to `no retries`. +* **maxRetryDuration**: The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`. +* **propertyAsIndex**: The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`. 
+* **javaScriptIndexFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIndexFnName**: The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **propertyAsId**: A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`. +* **javaScriptIdFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIdFnName**: The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptTypeFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`. +* **javaScriptTypeFnName**: The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIsDeleteFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **javaScriptIsDeleteFnName**: The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **usePartialUpdate**: Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`. 
+* **bulkInsertMethod**: Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to `CREATE`. +* **trustSelfSignedCerts**: Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate. Set this to true to bypass the validation of the SSL certificate. (Defaults to: `false`). +* **disableCertificateValidation**: If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`. +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to decrypt the API key. This parameter is required if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string. Encrypt parameters using the KMS API encrypt endpoint. For the key, use the format `projects/<PROJECT_ID>/locations/<KEY_REGION>/keyRings/<KEY_RING>/cryptoKeys/<KMS_KEY_NAME>`. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt. For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the apiKey. If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects/<PROJECT_ID>/secrets/<SECRET_ID>/versions/<SECRET_VERSION>`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. Allowed values are `PLAINTEXT`, `KMS` or `SECRET_MANAGER`. This parameter is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, `apiKeyKMSEncryptionKey` and encrypted apiKey must be provided. If `apiKeySource` is set to `SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, `apiKey` must be provided. Defaults to: PLAINTEXT. 
+* **socketTimeout**: If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). ## User-Defined functions (UDFs) @@ -354,16 +354,16 @@ resource "google_dataflow_flex_template_job" "gcs_to_elasticsearch" { name = "gcs-to-elasticsearch" region = var.region parameters = { - deadletterTable = "your-project:your-dataset.your-table-name" + deadletterTable = "" inputFileSpec = "" - connectionUrl = "https://elasticsearch-host:9200" + connectionUrl = "" apiKey = "" - index = "my-index" + index = "" # inputFormat = "csv" # containsHeaders = "false" - # delimiter = "," + # delimiter = "" # csvFormat = "Default" - # jsonSchemaPath = "gs://path/to/schema" + # jsonSchemaPath = "" # largeNumFiles = "false" # csvFileEncoding = "UTF-8" # logDetailedCsvConversionErrors = "false" @@ -387,11 +387,11 @@ resource "google_dataflow_flex_template_job" "gcs_to_elasticsearch" { # bulkInsertMethod = "CREATE" # trustSelfSignedCerts = "false" # disableCertificateValidation = "false" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "PLAINTEXT" # socketTimeout = "" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # 
javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" } } diff --git a/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch_Xlang.md b/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch_Xlang.md index 901332ec72..295f694286 100644 --- a/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch_Xlang.md +++ b/v2/googlecloud-to-elasticsearch/README_GCS_to_Elasticsearch_Xlang.md @@ -27,48 +27,48 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **deadletterTable** : The BigQuery dead-letter table to send failed inserts to. (Example: your-project:your-dataset.your-table-name). -* **inputFileSpec** : The Cloud Storage file pattern to search for CSV files. Example: gs://mybucket/test-*.csv. -* **connectionUrl** : The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID. (Example: https://elasticsearch-host:9200). -* **apiKey** : The Base64-encoded API key to use for authentication. -* **index** : The Elasticsearch index that the requests are issued to, such as `my-index.` (Example: my-index). +* **deadletterTable**: The BigQuery dead-letter table to send failed inserts to. For example, `your-project:your-dataset.your-table-name`. +* **inputFileSpec**: The Cloud Storage file pattern to search for CSV files. For example, `gs://mybucket/test-*.csv`. +* **connectionUrl**: The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID. For example, `https://elasticsearch-host:9200`. +* **apiKey**: The Base64-encoded API key to use for authentication. +* **index**: The Elasticsearch index that the requests are issued to. For example, `my-index`. ### Optional parameters -* **inputFormat** : Input file format. Default is: CSV. -* **containsHeaders** : Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. 
-* **delimiter** : The column delimiter of the input text files. Default: use delimiter provided in csvFormat (Example: ,). -* **csvFormat** : CSV format specification to use for parsing records. Default is: Default. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. -* **jsonSchemaPath** : The path to the JSON schema. Defaults to: null. (Example: gs://path/to/schema). -* **largeNumFiles** : Set to true if number of files is in the tens of thousands. Defaults to: false. -* **csvFileEncoding** : The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16. Defaults to: UTF-8. -* **logDetailedCsvConversionErrors** : Set to true to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: false. -* **elasticsearchUsername** : The Elasticsearch username to authenticate with. If specified, the value of 'apiKey' is ignored. -* **elasticsearchPassword** : The Elasticsearch password to authenticate with. If specified, the value of 'apiKey' is ignored. -* **batchSize** : The batch size in number of documents. Defaults to: 1000. -* **batchSizeBytes** : The batch size in number of bytes. Defaults to: 5242880 (5mb). -* **maxRetryAttempts** : The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries. -* **maxRetryDuration** : The maximum retry duration in milliseconds. Must be greater than zero. Defaults to: no retries. -* **propertyAsIndex** : The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none. 
-* **javaScriptIndexFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIndexFnName** : The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **propertyAsId** : A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none. -* **javaScriptIdFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIdFnName** : The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptTypeFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none. -* **javaScriptTypeFnName** : The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIsDeleteFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **javaScriptIsDeleteFnName** : The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **usePartialUpdate** : Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false. 
-* **bulkInsertMethod** : Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE. -* **trustSelfSignedCerts** : Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False). -* **disableCertificateValidation** : If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to 'true'. Default: false. -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to decrypt the API key. This parameter must be provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter must be provided if secret manager or KMS is used. If apiKeySource is set to KMS, apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, apiKey must be provided. Defaults to: PLAINTEXT. 
-* **socketTimeout** : If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). +* **inputFormat**: The input file format. Defaults to `CSV`. +* **containsHeaders**: Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. +* **delimiter**: The column delimiter of the input text files. Default: `,` For example, `,`. +* **csvFormat**: CSV format specification to use for parsing records. Default is: `Default`. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. +* **jsonSchemaPath**: The path to the JSON schema. Defaults to `null`. For example, `gs://path/to/schema`. +* **largeNumFiles**: Set to true if number of files is in the tens of thousands. Defaults to `false`. +* **csvFileEncoding**: The CSV file character encoding format. Allowed values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`. Defaults to: UTF-8. +* **logDetailedCsvConversionErrors**: Set to `true` to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: `false`. +* **elasticsearchUsername**: The Elasticsearch username to authenticate with. If specified, the value of `apiKey` is ignored. +* **elasticsearchPassword**: The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored. 
+* **batchSize**: The batch size in number of documents. Defaults to `1000`. +* **batchSizeBytes**: The batch size in number of bytes. Defaults to `5242880` (5mb). +* **maxRetryAttempts**: The maximum number of retry attempts. Must be greater than zero. Defaults to `no retries`. +* **maxRetryDuration**: The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`. +* **propertyAsIndex**: The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`. +* **javaScriptIndexFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIndexFnName**: The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **propertyAsId**: A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`. +* **javaScriptIdFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIdFnName**: The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptTypeFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`. +* **javaScriptTypeFnName**: The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`. 
+* **javaScriptIsDeleteFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **javaScriptIsDeleteFnName**: The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **usePartialUpdate**: Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`. +* **bulkInsertMethod**: Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to `CREATE`. +* **trustSelfSignedCerts**: Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to true to by-pass the validation on SSL certificate. (Defaults to: `false`). +* **disableCertificateValidation**: If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`. +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to decrypt the API key. This parameter is required if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string. Encrypt parameters using the KMS API encrypt endpoint. For the key, use the format `projects//locations//keyRings//cryptoKeys/`. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the apiKey. 
If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects/<PROJECT_ID>/secrets/<SECRET_ID>/versions/<SECRET_VERSION>`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. Allowed values are `PLAINTEXT`, `KMS` or `SECRET_MANAGER`. This parameter is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, `apiKeyKMSEncryptionKey` and encrypted apiKey must be provided. If `apiKeySource` is set to `SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, `apiKey` must be provided. Defaults to: PLAINTEXT. +* **socketTimeout**: If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. 
@@ -344,16 +344,16 @@ resource "google_dataflow_flex_template_job" "gcs_to_elasticsearch_xlang" { name = "gcs-to-elasticsearch-xlang" region = var.region parameters = { - deadletterTable = "your-project:your-dataset.your-table-name" + deadletterTable = "" inputFileSpec = "" - connectionUrl = "https://elasticsearch-host:9200" + connectionUrl = "" apiKey = "" - index = "my-index" + index = "" # inputFormat = "csv" # containsHeaders = "false" - # delimiter = "," + # delimiter = "" # csvFormat = "Default" - # jsonSchemaPath = "gs://path/to/schema" + # jsonSchemaPath = "" # largeNumFiles = "false" # csvFileEncoding = "UTF-8" # logDetailedCsvConversionErrors = "false" @@ -377,12 +377,12 @@ resource "google_dataflow_flex_template_job" "gcs_to_elasticsearch_xlang" { # bulkInsertMethod = "CREATE" # trustSelfSignedCerts = "false" # disableCertificateValidation = "false" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "PLAINTEXT" # socketTimeout = "" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Flex.md b/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Flex.md index 0d1686bf5e..e0186245a7 100644 --- a/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Flex.md +++ b/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Flex.md @@ -29,43 +29,43 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Pub/Sub subscription to consume the input 
from. Name should be in the format of 'projects/your-project-id/subscriptions/your-subscription-name' (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **errorOutputTopic** : Pub/Sub output topic for publishing failed records in the format of 'projects/your-project-id/topics/your-topic-name'. -* **connectionUrl** : The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID. (Example: https://elasticsearch-host:9200). -* **apiKey** : The Base64-encoded API key to use for authentication. +* **inputSubscription**: Pub/Sub subscription to consume the input from. For example, `projects//subscriptions/`. +* **errorOutputTopic**: The Pub/Sub output topic for publishing failed records, in the format of `projects//topics/`. +* **connectionUrl**: The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID. For example, `https://elasticsearch-host:9200`. +* **apiKey**: The Base64-encoded API key to use for authentication. ### Optional parameters -* **dataset** : The type of logs sent using Pub/Sub, for which we have an out-of-the-box dashboard. Known log types values are audit, vpcflow and firewall. Default 'pubsub'. -* **namespace** : An arbitrary grouping, such as an environment (dev, prod, or qa), a team, or a strategic business unit. Default: 'default'. -* **elasticsearchTemplateVersion** : Dataflow Template Version Identifier, usually defined by Google Cloud. Defaults to: 1.0.0. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. -* **elasticsearchUsername** : The Elasticsearch username to authenticate with. If specified, the value of 'apiKey' is ignored. -* **elasticsearchPassword** : The Elasticsearch password to authenticate with. If specified, the value of 'apiKey' is ignored. -* **batchSize** : The batch size in number of documents. Defaults to: 1000. -* **batchSizeBytes** : The batch size in number of bytes. Defaults to: 5242880 (5mb). -* **maxRetryAttempts** : The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries. -* **maxRetryDuration** : The maximum retry duration in milliseconds. Must be greater than zero. Defaults to: no retries. -* **propertyAsIndex** : The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none. -* **javaScriptIndexFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIndexFnName** : The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **propertyAsId** : A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none. 
-* **javaScriptIdFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIdFnName** : The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptTypeFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none. -* **javaScriptTypeFnName** : The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIsDeleteFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **javaScriptIsDeleteFnName** : The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **usePartialUpdate** : Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false. -* **bulkInsertMethod** : Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE. -* **trustSelfSignedCerts** : Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False). -* **disableCertificateValidation** : If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. 
To bypass validation for the certificate, set this parameter to 'true'. Default: false. -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to decrypt the API key. This parameter must be provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter must be provided if secret manager or KMS is used. If apiKeySource is set to KMS, apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, apiKey must be provided. Defaults to: PLAINTEXT. -* **socketTimeout** : If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. +* **dataset**: The type of logs sent using Pub/Sub, for which we have an out-of-the-box dashboard. Known log types values are `audit`, `vpcflow`, and `firewall`. Defaults to: `pubsub`. +* **namespace**: An arbitrary grouping, such as an environment (dev, prod, or qa), a team, or a strategic business unit. Defaults to: `default`. +* **elasticsearchTemplateVersion**: Dataflow Template Version Identifier, usually defined by Google Cloud. 
Defaults to: 1.0.0. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. +* **elasticsearchUsername**: The Elasticsearch username to authenticate with. If specified, the value of `apiKey` is ignored. +* **elasticsearchPassword**: The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored. +* **batchSize**: The batch size in number of documents. Defaults to `1000`. +* **batchSizeBytes**: The batch size in number of bytes. Defaults to `5242880` (5mb). +* **maxRetryAttempts**: The maximum number of retry attempts. Must be greater than zero. Defaults to `no retries`. +* **maxRetryDuration**: The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`. +* **propertyAsIndex**: The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`. 
+* **javaScriptIndexFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIndexFnName**: The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **propertyAsId**: A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`. +* **javaScriptIdFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIdFnName**: The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptTypeFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`. +* **javaScriptTypeFnName**: The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIsDeleteFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **javaScriptIsDeleteFnName**: The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **usePartialUpdate**: Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`. 
+* **bulkInsertMethod**: Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to `CREATE`. +* **trustSelfSignedCerts**: Whether to trust a self-signed certificate. An installed Elasticsearch instance might have a self-signed certificate. Set this parameter to `true` to bypass validation of the SSL certificate. Defaults to `false`. +* **disableCertificateValidation**: If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`. +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to decrypt the API key. This parameter is required if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string. Encrypt parameters using the KMS API encrypt endpoint. For the key, use the format `projects/<PROJECT_ID>/locations/<KEY_REGION>/keyRings/<KEY_RING>/cryptoKeys/<KMS_KEY_NAME>`. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the apiKey. If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects/<PROJECT_ID>/secrets/<SECRET_ID>/versions/<SECRET_VERSION>`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. Allowed values are `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. This parameter is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, `apiKeyKMSEncryptionKey` and encrypted apiKey must be provided. If `apiKeySource` is set to `SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, `apiKey` must be provided. Defaults to: PLAINTEXT. 
+* **socketTimeout**: If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. ## User-Defined functions (UDFs) @@ -336,14 +336,14 @@ resource "google_dataflow_flex_template_job" "pubsub_to_elasticsearch_flex" { name = "pubsub-to-elasticsearch-flex" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" + inputSubscription = "" errorOutputTopic = "" - connectionUrl = "https://elasticsearch-host:9200" + connectionUrl = "" apiKey = "" # dataset = "PUBSUB" # namespace = "default" # elasticsearchTemplateVersion = "1.0.0" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" # elasticsearchUsername = "" @@ -366,8 +366,8 @@ resource "google_dataflow_flex_template_job" "pubsub_to_elasticsearch_flex" { # bulkInsertMethod = "CREATE" # trustSelfSignedCerts = "false" # disableCertificateValidation = "false" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "PLAINTEXT" # socketTimeout = "" } diff --git a/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Xlang.md b/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Xlang.md index 2be0f0271b..13d30d899b 100644 --- a/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Xlang.md +++ b/v2/googlecloud-to-elasticsearch/README_PubSub_to_Elasticsearch_Xlang.md @@ -30,42 +30,42 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Pub/Sub subscription to consume the input from. 
Name should be in the format of 'projects/your-project-id/subscriptions/your-subscription-name' (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **errorOutputTopic** : Pub/Sub output topic for publishing failed records in the format of 'projects/your-project-id/topics/your-topic-name'. -* **connectionUrl** : The Elasticsearch URL in the format https://hostname:[port]. If using Elastic Cloud, specify the CloudID. (Example: https://elasticsearch-host:9200). -* **apiKey** : The Base64-encoded API key to use for authentication. +* **inputSubscription**: Pub/Sub subscription to consume the input from. For example, `projects//subscriptions/`. +* **errorOutputTopic**: The Pub/Sub output topic for publishing failed records, in the format of `projects//topics/`. +* **connectionUrl**: The Elasticsearch URL in the format `https://hostname:[port]`. If using Elastic Cloud, specify the CloudID. For example, `https://elasticsearch-host:9200`. +* **apiKey**: The Base64-encoded API key to use for authentication. ### Optional parameters -* **dataset** : The type of logs sent using Pub/Sub, for which we have an out-of-the-box dashboard. Known log types values are audit, vpcflow and firewall. Default 'pubsub'. -* **namespace** : An arbitrary grouping, such as an environment (dev, prod, or qa), a team, or a strategic business unit. Default: 'default'. -* **elasticsearchTemplateVersion** : Dataflow Template Version Identifier, usually defined by Google Cloud. Defaults to: 1.0.0. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). -* **elasticsearchUsername** : The Elasticsearch username to authenticate with. 
If specified, the value of 'apiKey' is ignored. -* **elasticsearchPassword** : The Elasticsearch password to authenticate with. If specified, the value of 'apiKey' is ignored. -* **batchSize** : The batch size in number of documents. Defaults to: 1000. -* **batchSizeBytes** : The batch size in number of bytes. Defaults to: 5242880 (5mb). -* **maxRetryAttempts** : The maximum number of retry attempts. Must be greater than zero. Defaults to: no retries. -* **maxRetryDuration** : The maximum retry duration in milliseconds. Must be greater than zero. Defaults to: no retries. -* **propertyAsIndex** : The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to: none. -* **javaScriptIndexFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIndexFnName** : The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to: none. -* **propertyAsId** : A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to: none. -* **javaScriptIdFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIdFnName** : The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptTypeFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Default: none. 
-* **javaScriptTypeFnName** : The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to: none. -* **javaScriptIsDeleteFnGcsPath** : The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **javaScriptIsDeleteFnName** : The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to: none. -* **usePartialUpdate** : Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to: false. -* **bulkInsertMethod** : Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to: CREATE. -* **trustSelfSignedCerts** : Whether to trust self-signed certificate or not. An Elasticsearch instance installed might have a self-signed certificate, Enable this to True to by-pass the validation on SSL certificate. (default is False). -* **disableCertificateValidation** : If 'true', trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to 'true'. Default: false. -* **apiKeyKMSEncryptionKey** : The Cloud KMS key to decrypt the API key. This parameter must be provided if the apiKeySource is set to KMS. If this parameter is provided, apiKey string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. 
See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **apiKeySecretId** : Secret Manager secret ID for the apiKey. This parameter should be provided if the apiKeySource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **apiKeySource** : Source of the API key. One of PLAINTEXT, KMS or SECRET_MANAGER. This parameter must be provided if secret manager or KMS is used. If apiKeySource is set to KMS, apiKeyKMSEncryptionKey and encrypted apiKey must be provided. If apiKeySource is set to SECRET_MANAGER, apiKeySecretId must be provided. If apiKeySource is set to PLAINTEXT, apiKey must be provided. Defaults to: PLAINTEXT. -* **socketTimeout** : If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. +* **dataset**: The type of logs sent using Pub/Sub, for which we have an out-of-the-box dashboard. Known log types values are `audit`, `vpcflow`, and `firewall`. Defaults to: `pubsub`. +* **namespace**: An arbitrary grouping, such as an environment (dev, prod, or qa), a team, or a strategic business unit. Defaults to: `default`. +* **elasticsearchTemplateVersion**: Dataflow Template Version Identifier, usually defined by Google Cloud. Defaults to: 1.0.0. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. +* **elasticsearchUsername**: The Elasticsearch username to authenticate with. 
If specified, the value of `apiKey` is ignored. +* **elasticsearchPassword**: The Elasticsearch password to authenticate with. If specified, the value of `apiKey` is ignored. +* **batchSize**: The batch size in number of documents. Defaults to `1000`. +* **batchSizeBytes**: The batch size in number of bytes. Defaults to `5242880` (5mb). +* **maxRetryAttempts**: The maximum number of retry attempts. Must be greater than zero. Defaults to `no retries`. +* **maxRetryDuration**: The maximum retry duration in milliseconds. Must be greater than zero. Defaults to `no retries`. +* **propertyAsIndex**: The property in the document being indexed whose value specifies `_index` metadata to include with the document in bulk requests. Takes precedence over an `_index` UDF. Defaults to `none`. +* **javaScriptIndexFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIndexFnName**: The name of the UDF JavaScript function that specifies `_index` metadata to include with the document in bulk requests. Defaults to `none`. +* **propertyAsId**: A property in the document being indexed whose value specifies `_id` metadata to include with the document in bulk requests. Takes precedence over an `_id` UDF. Defaults to `none`. +* **javaScriptIdFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that specifies `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIdFnName**: The name of the UDF JavaScript function that specifies the `_id` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptTypeFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for a function that specifies `_type` metadata to include with documents in bulk requests. Defaults to `none`. 
+* **javaScriptTypeFnName**: The name of the UDF JavaScript function that specifies the `_type` metadata to include with the document in bulk requests. Defaults to `none`. +* **javaScriptIsDeleteFnGcsPath**: The Cloud Storage path to the JavaScript UDF source for the function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **javaScriptIsDeleteFnName**: The name of the UDF JavaScript function that determines whether to delete the document instead of inserting or updating it. The function returns a string value of `true` or `false`. Defaults to `none`. +* **usePartialUpdate**: Whether to use partial updates (update rather than create or index, allowing partial documents) with Elasticsearch requests. Defaults to `false`. +* **bulkInsertMethod**: Whether to use `INDEX` (index, allows upserts) or `CREATE` (create, errors on duplicate _id) with Elasticsearch bulk requests. Defaults to `CREATE`. +* **trustSelfSignedCerts**: Whether to trust a self-signed certificate. An installed Elasticsearch instance might have a self-signed certificate. Set this parameter to `true` to bypass validation of the SSL certificate. Defaults to `false`. +* **disableCertificateValidation**: If `true`, trust the self-signed SSL certificate. An Elasticsearch instance might have a self-signed certificate. To bypass validation for the certificate, set this parameter to `true`. Defaults to `false`. +* **apiKeyKMSEncryptionKey**: The Cloud KMS key to decrypt the API key. This parameter is required if the `apiKeySource` is set to `KMS`. If this parameter is provided, pass in an encrypted `apiKey` string. Encrypt parameters using the KMS API encrypt endpoint. For the key, use the format `projects/<PROJECT_ID>/locations/<KEY_REGION>/keyRings/<KEY_RING>/cryptoKeys/<KMS_KEY_NAME>`. 
See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **apiKeySecretId**: The Secret Manager secret ID for the apiKey. If the `apiKeySource` is set to `SECRET_MANAGER`, provide this parameter. Use the format `projects/<PROJECT_ID>/secrets/<SECRET_ID>/versions/<SECRET_VERSION>`. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **apiKeySource**: The source of the API key. Allowed values are `PLAINTEXT`, `KMS`, and `SECRET_MANAGER`. This parameter is required when you use Secret Manager or KMS. If `apiKeySource` is set to `KMS`, `apiKeyKMSEncryptionKey` and encrypted apiKey must be provided. If `apiKeySource` is set to `SECRET_MANAGER`, `apiKeySecretId` must be provided. If `apiKeySource` is set to `PLAINTEXT`, `apiKey` must be provided. Defaults to: PLAINTEXT. +* **socketTimeout**: If set, overwrites the default max retry timeout and default socket timeout (30000ms) in the Elastic RestClient. 
@@ -323,15 +323,15 @@ resource "google_dataflow_flex_template_job" "pubsub_to_elasticsearch_xlang" { name = "pubsub-to-elasticsearch-xlang" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" + inputSubscription = "" errorOutputTopic = "" - connectionUrl = "https://elasticsearch-host:9200" + connectionUrl = "" apiKey = "" # dataset = "PUBSUB" # namespace = "default" # elasticsearchTemplateVersion = "1.0.0" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" # elasticsearchUsername = "" # elasticsearchPassword = "" # batchSize = "1000" @@ -352,8 +352,8 @@ resource "google_dataflow_flex_template_job" "pubsub_to_elasticsearch_xlang" { # bulkInsertMethod = "CREATE" # trustSelfSignedCerts = "false" # disableCertificateValidation = "false" - # apiKeyKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # apiKeySecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" + # apiKeyKMSEncryptionKey = "" + # apiKeySecretId = "" # apiKeySource = "PLAINTEXT" # socketTimeout = "" } diff --git a/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/GCSToElasticsearchOptions.java b/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/GCSToElasticsearchOptions.java index b5d1e728db..db0d822d10 100644 --- a/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/GCSToElasticsearchOptions.java +++ b/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/GCSToElasticsearchOptions.java @@ -45,7 +45,7 @@ public interface GCSToElasticsearchOptions optional = true, regexes = 
{"[a-zA-Z0-9._-]+"}, description = "Input file format", - helpText = "Input file format. Default is: CSV") + helpText = "The input file format. Defaults to `CSV`.") @Default.String("csv") String getInputFormat(); diff --git a/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/PubSubToElasticsearchOptions.java b/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/PubSubToElasticsearchOptions.java index bc74f753af..df6fc13ef6 100644 --- a/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/PubSubToElasticsearchOptions.java +++ b/v2/googlecloud-to-elasticsearch/src/main/java/com/google/cloud/teleport/v2/elasticsearch/options/PubSubToElasticsearchOptions.java @@ -38,9 +38,8 @@ public interface PubSubToElasticsearchOptions order = 1, groupName = "Source", description = "Pub/Sub input subscription", - helpText = - "Pub/Sub subscription to consume the input from. Name should be in the format of 'projects/your-project-id/subscriptions/your-subscription-name'", - example = "projects/your-project-id/subscriptions/your-subscription-name") + helpText = "Pub/Sub subscription to consume the input from.", + example = "projects//subscriptions/") @Validation.Required String getInputSubscription(); @@ -52,7 +51,7 @@ public interface PubSubToElasticsearchOptions description = "Dataset, the type of logs that are sent to Pub/Sub", helpText = "The type of logs sent using Pub/Sub, for which we have an out-of-the-box dashboard. Known " - + "log types values are audit, vpcflow and firewall. Default 'pubsub'") + + "log types values are `audit`, `vpcflow`, and `firewall`. 
Defaults to: `pubsub`.") @Default.Enum("PUBSUB") Dataset getDataset(); @@ -63,7 +62,7 @@ public interface PubSubToElasticsearchOptions optional = true, description = "The namespace for dataset.", helpText = - "An arbitrary grouping, such as an environment (dev, prod, or qa), a team, or a strategic business unit. Default: 'default'") + "An arbitrary grouping, such as an environment (dev, prod, or qa), a team, or a strategic business unit. Defaults to: `default`.") @Default.String("default") String getNamespace(); @@ -73,7 +72,7 @@ public interface PubSubToElasticsearchOptions order = 4, description = "Output deadletter Pub/Sub topic", helpText = - "Pub/Sub output topic for publishing failed records in the format of 'projects/your-project-id/topics/your-topic-name'.") + "The Pub/Sub output topic for publishing failed records, in the format of `projects//topics/`.") @Validation.Required String getErrorOutputTopic(); diff --git a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_BigQuery.md b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_BigQuery.md index be97deef86..eda5430c38 100644 --- a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_BigQuery.md +++ b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_BigQuery.md @@ -17,31 +17,31 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bigQueryDataset** : The dataset name of the destination BigQuery table. -* **bigtableChangeStreamAppProfile** : The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. -* **bigtableReadInstanceId** : The source Bigtable instance ID. -* **bigtableReadTableId** : The source Bigtable table ID. +* **bigQueryDataset**: The dataset name of the destination BigQuery table. +* **bigtableChangeStreamAppProfile**: The Bigtable application profile ID. 
The application profile must use single-cluster routing and allow single-row transactions. +* **bigtableReadInstanceId**: The source Bigtable instance ID. +* **bigtableReadTableId**: The source Bigtable table ID. ### Optional parameters -* **writeRowkeyAsBytes** : Whether to write rowkeys as BigQuery `BYTES`. When set to `true`, row keys are written to the `BYTES` column. Otherwise, rowkeys are written to the `STRING` column. Defaults to `false`. -* **writeValuesAsBytes** : When set true values are written to BYTES column, otherwise to STRING column. Defaults to false. -* **writeNumericTimestamps** : Whether to write the Bigtable timestamp as BigQuery `INT64`. When set to true, values are written to the `INT64` column. Otherwise, values are written to the `TIMESTAMP` column. Columns affected: `timestamp`, `timestamp_from`, and `timestamp_to`. Defaults to `false`. When set to `true`, the time is measured in microseconds since the Unix epoch (January 1, 1970 at UTC). -* **bigQueryProjectId** : The BigQuery dataset project ID. The default is the project for the Dataflow job. -* **bigQueryChangelogTableName** : Destination BigQuery table name. If not specified, the value `bigtableReadTableId + "_changelog"` is used. Defaults to empty. -* **bigQueryChangelogTablePartitionGranularity** : Specifies a granularity for partitioning the changelog table. When set, the table is partitioned. Use one of the following supported values: `HOUR`, `DAY`, `MONTH`, or `YEAR`. By default, the table isn't partitioned. -* **bigQueryChangelogTablePartitionExpirationMs** : Sets the changelog table partition expiration time, in milliseconds. When set to true, partitions older than the specified number of milliseconds are deleted. By default, no expiration is set. -* **bigQueryChangelogTableFieldsToIgnore** : A comma-separated list of the changelog columns that, when specified, aren't created and populated. 
Use one of the following supported values: `is_gc`, `source_instance`, `source_cluster`, `source_table`, `tiebreaker`, or `big_query_commit_timestamp`. By default, all columns are populated. -* **dlqDirectory** : The directory to use for the dead-letter queue. Records that fail to be processed are stored in this directory. The default is a directory under the Dataflow job's temp location. In most cases, you can use the default path. -* **bigtableChangeStreamMetadataInstanceId** : The Bigtable change streams metadata instance ID. Defaults to empty. -* **bigtableChangeStreamMetadataTableTableId** : The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. -* **bigtableChangeStreamCharset** : The Bigtable change streams charset name. Defaults to: UTF-8. -* **bigtableChangeStreamStartTimestamp** : The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. -* **bigtableChangeStreamIgnoreColumnFamilies** : A comma-separated list of column family name changes to ignore. Defaults to empty. -* **bigtableChangeStreamIgnoreColumns** : A comma-separated list of column name changes to ignore. Defaults to empty. -* **bigtableChangeStreamName** : A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. -* **bigtableChangeStreamResume** : When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. 
When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. -* **bigtableReadProjectId** : The Bigtable project ID. The default is the project for the Dataflow job. +* **writeRowkeyAsBytes**: Whether to write rowkeys as BigQuery `BYTES`. When set to `true`, row keys are written to the `BYTES` column. Otherwise, rowkeys are written to the `STRING` column. Defaults to `false`. +* **writeValuesAsBytes**: When set to `true`, values are written to a column of type BYTES, otherwise to a column of type STRING. Defaults to: `false`. +* **writeNumericTimestamps**: Whether to write the Bigtable timestamp as BigQuery INT64. When set to `true`, values are written to the INT64 column. Otherwise, values are written to the `TIMESTAMP` column. Columns affected: `timestamp`, `timestamp_from`, and `timestamp_to`. Defaults to `false`. When set to `true`, the time is measured in microseconds since the Unix epoch (January 1, 1970 at UTC). +* **bigQueryProjectId**: The BigQuery dataset project ID. The default is the project for the Dataflow job. +* **bigQueryChangelogTableName**: Destination BigQuery table name. If not specified, the value `bigtableReadTableId + "_changelog"` is used. Defaults to empty. +* **bigQueryChangelogTablePartitionGranularity**: Specifies a granularity for partitioning the changelog table. When set, the table is partitioned. Use one of the following supported values: `HOUR`, `DAY`, `MONTH`, or `YEAR`. By default, the table isn't partitioned. +* **bigQueryChangelogTablePartitionExpirationMs**: Sets the changelog table partition expiration time, in milliseconds. When set, partitions older than the specified number of milliseconds are deleted. By default, no expiration is set. +* **bigQueryChangelogTableFieldsToIgnore**: A comma-separated list of the changelog columns that, when specified, aren't created and populated. 
Use one of the following supported values: `is_gc`, `source_instance`, `source_cluster`, `source_table`, `tiebreaker`, or `big_query_commit_timestamp`. By default, all columns are populated. +* **dlqDirectory**: The directory to use for the dead-letter queue. Records that fail to be processed are stored in this directory. The default is a directory under the Dataflow job's temp location. In most cases, you can use the default path. +* **bigtableChangeStreamMetadataInstanceId**: The Bigtable change streams metadata instance ID. Defaults to empty. +* **bigtableChangeStreamMetadataTableTableId**: The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. +* **bigtableChangeStreamCharset**: The Bigtable change streams charset name. Defaults to: UTF-8. +* **bigtableChangeStreamStartTimestamp**: The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. +* **bigtableChangeStreamIgnoreColumnFamilies**: A comma-separated list of column family name changes to ignore. Defaults to empty. +* **bigtableChangeStreamIgnoreColumns**: A comma-separated list of column name changes to ignore. Defaults to empty. +* **bigtableChangeStreamName**: A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. +* **bigtableChangeStreamResume**: When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. 
If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. +* **bigtableReadProjectId**: The Bigtable project ID. The default is the project for the Dataflow job. diff --git a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Google_Cloud_Storage.md b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Google_Cloud_Storage.md index 31a7b125d2..67ab96c4c3 100644 --- a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Google_Cloud_Storage.md +++ b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Google_Cloud_Storage.md @@ -14,32 +14,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **gcsOutputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. (Example: gs://your-bucket/your-path). -* **bigtableChangeStreamAppProfile** : The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. -* **bigtableReadInstanceId** : The source Bigtable instance ID. -* **bigtableReadTableId** : The source Bigtable table ID. +* **gcsOutputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. For example, `gs://your-bucket/your-path`. +* **bigtableChangeStreamAppProfile**: The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. +* **bigtableReadInstanceId**: The source Bigtable instance ID. +* **bigtableReadTableId**: The source Bigtable table ID. ### Optional parameters -* **outputFileFormat** : The format of the output Cloud Storage file. Allowed formats are TEXT, AVRO. Defaults to AVRO. 
-* **windowDuration** : The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 1h). Defaults to: 1h. -* **bigtableMetadataTableTableId** : Table ID used for creating the metadata table. -* **schemaOutputFormat** : Schema chosen for outputting data to GCS. CHANGELOG_ENTRY support TEXT and AVRO output formats, BIGTABLE_ROW only supports AVRO output. Defaults to: CHANGELOG_ENTRY. -* **outputFilenamePrefix** : The prefix to place on each windowed file. Defaults to "changelog-" (Example: changelog-). -* **outputBatchSize** : Batching mutations reduces overhead and cost. Depending on the size of values written to Cloud Bigtable the batch size might need to be adjusted lower to avoid memory pressures on the worker fleet. Defaults to 10000. -* **outputShardsCount** : The maximum number of output shards produced when writing to Cloud Storage. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 20. -* **useBase64Rowkeys** : Only supported for the TEXT output file format. When set to true, rowkeys will be written as Base64-encoded strings. Otherwise bigtableChangeStreamCharset charset will be used to decode binary values into String rowkeysDefaults to false. -* **useBase64ColumnQualifiers** : Only supported for the TEXT output file format. When set to true, column qualifiers will be written as Base64-encoded strings. Otherwise bigtableChangeStreamCharset charset will be used to decode binary values into String column qualifiersDefaults to false. -* **useBase64Values** : Only supported for the TEXT output file format. When set to true, values will be written as Base64-encoded strings. 
Otherwise bigtableChangeStreamCharset charset will be used to decode binary values into String valuesDefaults to false. -* **bigtableChangeStreamMetadataInstanceId** : The Bigtable change streams metadata instance ID. Defaults to empty. -* **bigtableChangeStreamMetadataTableTableId** : The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. -* **bigtableChangeStreamCharset** : The Bigtable change streams charset name. Defaults to: UTF-8. -* **bigtableChangeStreamStartTimestamp** : The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. -* **bigtableChangeStreamIgnoreColumnFamilies** : A comma-separated list of column family name changes to ignore. Defaults to empty. -* **bigtableChangeStreamIgnoreColumns** : A comma-separated list of column name changes to ignore. Defaults to empty. -* **bigtableChangeStreamName** : A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. -* **bigtableChangeStreamResume** : When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. -* **bigtableReadProjectId** : The Bigtable project ID. The default is the project for the Dataflow job. 
+* **outputFileFormat**: The format of the output Cloud Storage file. Allowed formats are TEXT, AVRO. Defaults to AVRO. +* **windowDuration**: The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `1h`. Defaults to: 1h. +* **bigtableMetadataTableTableId**: Table ID used for creating the metadata table. +* **schemaOutputFormat**: Schema chosen for outputting data to GCS. CHANGELOG_ENTRY supports TEXT and AVRO output formats, BIGTABLE_ROW only supports AVRO output. Defaults to: CHANGELOG_ENTRY. +* **outputFilenamePrefix**: The prefix to place on each windowed file. Defaults to "changelog-". For example, `changelog-`. +* **outputBatchSize**: Batching mutations reduces overhead and cost. Depending on the size of values written to Cloud Bigtable the batch size might need to be adjusted lower to avoid memory pressures on the worker fleet. Defaults to 10000. +* **outputShardsCount**: The maximum number of output shards produced when writing to Cloud Storage. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 20. +* **useBase64Rowkeys**: Only supported for the TEXT output file format. When set to true, rowkeys will be written as Base64-encoded strings. Otherwise bigtableChangeStreamCharset charset will be used to decode binary values into String rowkeys. Defaults to false. +* **useBase64ColumnQualifiers**: Only supported for the TEXT output file format. When set to true, column qualifiers will be written as Base64-encoded strings. Otherwise bigtableChangeStreamCharset charset will be used to decode binary values into String column qualifiers. Defaults to false. +* **useBase64Values**: Only supported for the TEXT output file format. 
When set to true, values will be written as Base64-encoded strings. Otherwise bigtableChangeStreamCharset charset will be used to decode binary values into String values. Defaults to false. +* **bigtableChangeStreamMetadataInstanceId**: The Bigtable change streams metadata instance ID. Defaults to empty. +* **bigtableChangeStreamMetadataTableTableId**: The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. +* **bigtableChangeStreamCharset**: The Bigtable change streams charset name. Defaults to: UTF-8. +* **bigtableChangeStreamStartTimestamp**: The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. +* **bigtableChangeStreamIgnoreColumnFamilies**: A comma-separated list of column family name changes to ignore. Defaults to empty. +* **bigtableChangeStreamIgnoreColumns**: A comma-separated list of column name changes to ignore. Defaults to empty. +* **bigtableChangeStreamName**: A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. +* **bigtableChangeStreamResume**: When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. +* **bigtableReadProjectId**: The Bigtable project ID. 
The default is the project for the Dataflow job. @@ -267,7 +267,7 @@ resource "google_dataflow_flex_template_job" "bigtable_change_streams_to_google_ name = "bigtable-change-streams-to-google-cloud-storage" region = var.region parameters = { - gcsOutputDirectory = "gs://your-bucket/your-path" + gcsOutputDirectory = "" bigtableChangeStreamAppProfile = "" bigtableReadInstanceId = "" bigtableReadTableId = "" diff --git a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_PubSub.md b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_PubSub.md index 07aa3daef2..552c088a28 100644 --- a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_PubSub.md +++ b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_PubSub.md @@ -17,33 +17,33 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **pubSubTopic** : The name of the destination Pub/Sub topic. -* **bigtableChangeStreamAppProfile** : The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. -* **bigtableReadInstanceId** : The source Bigtable instance ID. -* **bigtableReadTableId** : The source Bigtable table ID. +* **pubSubTopic**: The name of the destination Pub/Sub topic. +* **bigtableChangeStreamAppProfile**: The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. +* **bigtableReadInstanceId**: The source Bigtable instance ID. +* **bigtableReadTableId**: The source Bigtable table ID. ### Optional parameters -* **messageEncoding** : The encoding of the messages to be published to the Pub/Sub topic. When the schema of the destination topic is configured, the message encoding is determined by the topic settings. The following values are supported: `BINARY` and `JSON`. Defaults to `JSON`. -* **messageFormat** : The encoding of the messages to publish to the Pub/Sub topic. 
When the schema of the destination topic is configured, the message encoding is determined by the topic settings. The following values are supported: `AVRO`, `PROTOCOL_BUFFERS`, and `JSON`. The default value is `JSON`. When the `JSON` format is used, the rowKey, column, and value fields of the message are strings, the contents of which are determined by the pipeline options `useBase64Rowkeys`, `useBase64ColumnQualifiers`, `useBase64Values`, and `bigtableChangeStreamCharset`. -* **stripValues** : When set to true, the SET_CELL mutations are returned without new values set. Defaults to false. This parameter is useful when you don't need a new value to be present, also known as cache invalidation, or when values are extremely large and exceed Pub/Sub message size limits. -* **dlqDirectory** : The directory for the dead-letter queue. Records that fail to be processed are stored in this directory. Defaults to a directory under the Dataflow job temp location. In most cases, you can use the default path. -* **dlqRetryMinutes** : The number of minutes between dead-letter queue retries. Defaults to `10`. -* **dlqMaxRetries** : The dead letter maximum retries. Defaults to `5`. -* **useBase64Rowkeys** : Used with JSON message encoding. When set to `true`, the `rowKey` field is a Base64-encoded string. Otherwise, the `rowKey` is produced by using `bigtableChangeStreamCharset` to decode bytes into a string. Defaults to `false`. -* **pubSubProjectId** : The Bigtable project ID. The default is the project of the Dataflow job. -* **useBase64ColumnQualifiers** : Used with JSON message encoding. When set to `true`, the `column` field is a Base64-encoded string. Otherwise, the column is produced by using `bigtableChangeStreamCharset` to decode bytes into a string. Defaults to `false`. -* **useBase64Values** : Used with JSON message encoding. When set to `true`, the value field is a Base64-encoded string. 
Otherwise, the value isproduced by using `bigtableChangeStreamCharset` to decode bytes into a string. Defaults to `false`. -* **disableDlqRetries** : Whether or not to disable retries for the DLQ. Defaults to: false. -* **bigtableChangeStreamMetadataInstanceId** : The Bigtable change streams metadata instance ID. Defaults to empty. -* **bigtableChangeStreamMetadataTableTableId** : The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. -* **bigtableChangeStreamCharset** : The Bigtable change streams charset name. Defaults to: UTF-8. -* **bigtableChangeStreamStartTimestamp** : The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. -* **bigtableChangeStreamIgnoreColumnFamilies** : A comma-separated list of column family name changes to ignore. Defaults to empty. -* **bigtableChangeStreamIgnoreColumns** : A comma-separated list of column name changes to ignore. Defaults to empty. -* **bigtableChangeStreamName** : A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. -* **bigtableChangeStreamResume** : When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. -* **bigtableReadProjectId** : The Bigtable project ID. 
The default is the project for the Dataflow job. +* **messageEncoding**: The encoding of the messages to be published to the Pub/Sub topic. When the schema of the destination topic is configured, the message encoding is determined by the topic settings. The following values are supported: `BINARY` and `JSON`. Defaults to `JSON`. +* **messageFormat**: The encoding of the messages to publish to the Pub/Sub topic. When the schema of the destination topic is configured, the message encoding is determined by the topic settings. The following values are supported: `AVRO`, `PROTOCOL_BUFFERS`, and `JSON`. The default value is `JSON`. When the `JSON` format is used, the rowKey, column, and value fields of the message are strings, the contents of which are determined by the pipeline options `useBase64Rowkeys`, `useBase64ColumnQualifiers`, `useBase64Values`, and `bigtableChangeStreamCharset`. +* **stripValues**: When set to `true`, the `SET_CELL` mutations are returned without new values set. Defaults to `false`. This parameter is useful when you don't need a new value to be present, also known as cache invalidation, or when values are extremely large and exceed Pub/Sub message size limits. +* **dlqDirectory**: The directory for the dead-letter queue. Records that fail to be processed are stored in this directory. Defaults to a directory under the Dataflow job temp location. In most cases, you can use the default path. +* **dlqRetryMinutes**: The number of minutes between dead-letter queue retries. Defaults to `10`. +* **dlqMaxRetries**: The dead letter maximum retries. Defaults to `5`. +* **useBase64Rowkeys**: Used with JSON message encoding. When set to `true`, the `rowKey` field is a Base64-encoded string. Otherwise, the `rowKey` is produced by using `bigtableChangeStreamCharset` to decode bytes into a string. Defaults to `false`. +* **pubSubProjectId**: The Bigtable project ID. The default is the project of the Dataflow job. 
+* **useBase64ColumnQualifiers**: Used with JSON message encoding. When set to `true`, the `column` field is a Base64-encoded string. Otherwise, the column is produced by using `bigtableChangeStreamCharset` to decode bytes into a string. Defaults to `false`. +* **useBase64Values**: Used with JSON message encoding. When set to `true`, the value field is a Base64-encoded string. Otherwise, the value is produced by using `bigtableChangeStreamCharset` to decode bytes into a string. Defaults to `false`. +* **disableDlqRetries**: Whether or not to disable retries for the DLQ. Defaults to: false. +* **bigtableChangeStreamMetadataInstanceId**: The Bigtable change streams metadata instance ID. Defaults to empty. +* **bigtableChangeStreamMetadataTableTableId**: The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. +* **bigtableChangeStreamCharset**: The Bigtable change streams charset name. Defaults to: UTF-8. +* **bigtableChangeStreamStartTimestamp**: The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. +* **bigtableChangeStreamIgnoreColumnFamilies**: A comma-separated list of column family name changes to ignore. Defaults to empty. +* **bigtableChangeStreamIgnoreColumns**: A comma-separated list of column name changes to ignore. Defaults to empty. +* **bigtableChangeStreamName**: A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. 
+* **bigtableChangeStreamResume**: When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. +* **bigtableReadProjectId**: The Bigtable project ID. The default is the project for the Dataflow job. diff --git a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Vector_Search.md b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Vector_Search.md index 64340f0238..6ff43b025b 100644 --- a/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Vector_Search.md +++ b/v2/googlecloud-to-googlecloud/README_Bigtable_Change_Streams_to_Vector_Search.md @@ -17,36 +17,36 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **embeddingColumn** : The fully qualified column name where the embeddings are stored. In the format cf:col. -* **embeddingByteSize** : The byte size of each entry in the embeddings array. Use 4 for Float, and 8 for Double. Defaults to: 4. -* **vectorSearchIndex** : The Vector Search Index where changes will be streamed, in the format 'projects/{projectID}/locations/{region}/indexes/{indexID}' (no leading or trailing spaces) (Example: projects/123/locations/us-east1/indexes/456). -* **bigtableChangeStreamAppProfile** : The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. -* **bigtableReadInstanceId** : The source Bigtable instance ID. -* **bigtableReadTableId** : The source Bigtable table ID. +* **embeddingColumn**: The fully qualified column name where the embeddings are stored. 
In the format cf:col. +* **embeddingByteSize**: The byte size of each entry in the embeddings array. Use 4 for Float, and 8 for Double. Defaults to: 4. +* **vectorSearchIndex**: The Vector Search Index where changes will be streamed, in the format 'projects/{projectID}/locations/{region}/indexes/{indexID}' (no leading or trailing spaces). For example, `projects/123/locations/us-east1/indexes/456`. +* **bigtableChangeStreamAppProfile**: The Bigtable application profile ID. The application profile must use single-cluster routing and allow single-row transactions. +* **bigtableReadInstanceId**: The source Bigtable instance ID. +* **bigtableReadTableId**: The source Bigtable table ID. ### Optional parameters -* **bigtableMetadataTableTableId** : Table ID used for creating the metadata table. -* **crowdingTagColumn** : The fully qualified column name where the crowding tag is stored. In the format cf:col. -* **allowRestrictsMappings** : The comma separated fully qualified column names of the columns that should be used as the `allow` restricts, with their alias. In the format cf:col->alias. -* **denyRestrictsMappings** : The comma separated fully qualified column names of the columns that should be used as the `deny` restricts, with their alias. In the format cf:col->alias. -* **intNumericRestrictsMappings** : The comma separated fully qualified column names of the columns that should be used as integer `numeric_restricts`, with their alias. In the format cf:col->alias. -* **floatNumericRestrictsMappings** : The comma separated fully qualified column names of the columns that should be used as float (4 bytes) `numeric_restricts`, with their alias. In the format cf:col->alias. -* **doubleNumericRestrictsMappings** : The comma separated fully qualified column names of the columns that should be used as double (8 bytes) `numeric_restricts`, with their alias. In the format cf:col->alias.
-* **upsertMaxBatchSize** : The maximum number of upserts to buffer before upserting the batch to the Vector Search Index. Batches will be sent when there are either upsertBatchSize records ready, or any record has been waiting upsertBatchDelay time has passed. (Example: 10). Defaults to: 10. -* **upsertMaxBufferDuration** : The maximum delay before a batch of upserts is sent to Vector Search.Batches will be sent when there are either upsertBatchSize records ready, or any record has been waiting upsertBatchDelay time has passed. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 10s). Defaults to: 10s. -* **deleteMaxBatchSize** : The maximum number of deletes to buffer before deleting the batch from the Vector Search Index. Batches will be sent when there are either deleteBatchSize records ready, or any record has been waiting deleteBatchDelay time has passed. (Example: 10). Defaults to: 10. -* **deleteMaxBufferDuration** : The maximum delay before a batch of deletes is sent to Vector Search.Batches will be sent when there are either deleteBatchSize records ready, or any record has been waiting deleteBatchDelay time has passed. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 10s). Defaults to: 10s. -* **dlqDirectory** : The path to store any unprocessed records with the reason they failed to be processed. Default is a directory under the Dataflow job's temp location. The default value is enough under most conditions. -* **bigtableChangeStreamMetadataInstanceId** : The Bigtable change streams metadata instance ID. Defaults to empty. -* **bigtableChangeStreamMetadataTableTableId** : The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. 
-* **bigtableChangeStreamCharset** : The Bigtable change streams charset name. Defaults to: UTF-8. -* **bigtableChangeStreamStartTimestamp** : The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. -* **bigtableChangeStreamIgnoreColumnFamilies** : A comma-separated list of column family name changes to ignore. Defaults to empty. -* **bigtableChangeStreamIgnoreColumns** : A comma-separated list of column name changes to ignore. Defaults to empty. -* **bigtableChangeStreamName** : A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. -* **bigtableChangeStreamResume** : When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. -* **bigtableReadProjectId** : The Bigtable project ID. The default is the project for the Dataflow job. +* **bigtableMetadataTableTableId**: Table ID used for creating the metadata table. +* **crowdingTagColumn**: The fully qualified column name where the crowding tag is stored. In the format cf:col. +* **allowRestrictsMappings**: The comma separated fully qualified column names of the columns that should be used as the `allow` restricts, with their alias. In the format cf:col->alias. +* **denyRestrictsMappings**: The comma separated fully qualified column names of the columns that should be used as the `deny` restricts, with their alias. 
In the format cf:col->alias. +* **intNumericRestrictsMappings**: The comma separated fully qualified column names of the columns that should be used as integer `numeric_restricts`, with their alias. In the format cf:col->alias. +* **floatNumericRestrictsMappings**: The comma separated fully qualified column names of the columns that should be used as float (4 bytes) `numeric_restricts`, with their alias. In the format cf:col->alias. +* **doubleNumericRestrictsMappings**: The comma separated fully qualified column names of the columns that should be used as double (8 bytes) `numeric_restricts`, with their alias. In the format cf:col->alias. +* **upsertMaxBatchSize**: The maximum number of upserts to buffer before upserting the batch to the Vector Search Index. Batches will be sent when there are either upsertBatchSize records ready, or any record has been waiting upsertBatchDelay time has passed. For example, `10`. Defaults to: 10. +* **upsertMaxBufferDuration**: The maximum delay before a batch of upserts is sent to Vector Search. Batches will be sent when there are either upsertBatchSize records ready, or any record has been waiting upsertBatchDelay time has passed. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `10s`. Defaults to: 10s. +* **deleteMaxBatchSize**: The maximum number of deletes to buffer before deleting the batch from the Vector Search Index. Batches will be sent when there are either deleteBatchSize records ready, or any record has been waiting deleteBatchDelay time has passed. For example, `10`. Defaults to: 10. +* **deleteMaxBufferDuration**: The maximum delay before a batch of deletes is sent to Vector Search. Batches will be sent when there are either deleteBatchSize records ready, or any record has been waiting deleteBatchDelay time has passed. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h).
For example, `10s`. Defaults to: 10s. +* **dlqDirectory**: The path to store any unprocessed records with the reason they failed to be processed. Default is a directory under the Dataflow job's temp location. The default value is enough under most conditions. +* **bigtableChangeStreamMetadataInstanceId**: The Bigtable change streams metadata instance ID. Defaults to empty. +* **bigtableChangeStreamMetadataTableTableId**: The ID of the Bigtable change streams connector metadata table. If not provided, a Bigtable change streams connector metadata table is automatically created during pipeline execution. Defaults to empty. +* **bigtableChangeStreamCharset**: The Bigtable change streams charset name. Defaults to: UTF-8. +* **bigtableChangeStreamStartTimestamp**: The starting timestamp (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2022-05-05T07:59:59Z`. Defaults to the timestamp of the pipeline start time. +* **bigtableChangeStreamIgnoreColumnFamilies**: A comma-separated list of column family name changes to ignore. Defaults to empty. +* **bigtableChangeStreamIgnoreColumns**: A comma-separated list of column name changes to ignore. Defaults to empty. +* **bigtableChangeStreamName**: A unique name for the client pipeline. Lets you resume processing from the point at which a previously running pipeline stopped. Defaults to an automatically generated name. See the Dataflow job logs for the value used. +* **bigtableChangeStreamResume**: When set to `true`, a new pipeline resumes processing from the point at which a previously running pipeline with the same `bigtableChangeStreamName` value stopped. If the pipeline with the given `bigtableChangeStreamName` value has never run, a new pipeline doesn't start. When set to `false`, a new pipeline starts. If a pipeline with the same `bigtableChangeStreamName` value has already run for the given source, a new pipeline doesn't start. Defaults to `false`. 
+* **bigtableReadProjectId**: The Bigtable project ID. The default is the project for the Dataflow job. @@ -288,7 +288,7 @@ resource "google_dataflow_flex_template_job" "bigtable_change_streams_to_vector_ parameters = { embeddingColumn = "" embeddingByteSize = "4" - vectorSearchIndex = "projects/123/locations/us-east1/indexes/456" + vectorSearchIndex = "" bigtableChangeStreamAppProfile = "" bigtableReadInstanceId = "" bigtableReadTableId = "" diff --git a/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_Avro_Flex.md b/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_Avro_Flex.md index 6cd8460fea..2ce705b0e7 100644 --- a/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_Avro_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_Avro_Flex.md @@ -18,23 +18,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. -* **avroTempDirectory** : Directory for temporary Avro files. +* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. +* **avroTempDirectory**: Directory for temporary Avro files. ### Optional parameters -* **inputSubscription** : Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name' (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **inputTopic** : Pub/Sub topic to read the input from, in the format of 'projects/your-project-id/topics/your-topic-name'. -* **outputFilenamePrefix** : The prefix to place on each windowed file. Defaults to: output. -* **outputFilenameSuffix** : The suffix to place on each windowed file. Typically a file extension such as .txt or .csv. 
Defaults to empty. -* **outputShardTemplate** : The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. This means that all data outputs into a single file per window. The `outputShardTemplate` defaults to `W-P-SS-of-NN` where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. -* **numShards** : The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 0. -* **windowDuration** : The window duration is the interval in which data is written to the output directory. Configure the duration based on the pipeline's throughput. For example, a higher throughput might require smaller window sizes so that the data fits into memory. Defaults to 5m (5 minutes), with a minimum of 1s (1 second). Allowed formats are: [int]s (for seconds, example: 5s), [int]m (for minutes, example: 12m), [int]h (for hours, example: 2h). (Example: 5m). -* **yearPattern** : Pattern for formatting the year. Must be one or more of 'y' or 'Y'. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory ('/') character. Defaults to 'YYYY'. -* **monthPattern** : Pattern for formatting the month. Must be one or more of the 'M' character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory ('/') character. Defaults to 'MM'. -* **dayPattern** : Pattern for formatting the day. Must be one or more of 'd' for day of month or 'D' for day of year. Case makes no difference in the year. 
The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory ('/') character. Defaults to 'dd'. -* **hourPattern** : Pattern for formatting the hour. Must be one or more of the 'H' character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory ('/') character. Defaults to 'HH'. -* **minutePattern** : Pattern for formatting the minute. Must be one or more of the 'm' character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory ('/') character. Defaults to 'mm'. +* **inputSubscription**: Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name'. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **inputTopic**: Pub/Sub topic to read the input from, in the format of 'projects/your-project-id/topics/your-topic-name'. +* **outputFilenamePrefix**: The prefix to place on each windowed file. Defaults to: output. +* **outputFilenameSuffix**: The suffix to place on each windowed file. Typically a file extension such as .txt or .csv. Defaults to empty. +* **outputShardTemplate**: The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. This means that all data outputs into a single file per window. The `outputShardTemplate` defaults to `W-P-SS-of-NN` where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. +* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 0.
+* **windowDuration**: The window duration is the interval in which data is written to the output directory. Configure the duration based on the pipeline's throughput. For example, a higher throughput might require smaller window sizes so that the data fits into memory. Defaults to `5m` (5 minutes), with a minimum of `1s` (1 second). Allowed formats are: `[int]s` (for seconds, example: `5s`), `[int]m` (for minutes, example: `12m`), `[int]h` (for hours, example: `2h`). For example, `5m`. +* **yearPattern**: Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory (`/`) character. Defaults to `YYYY`. +* **monthPattern**: Pattern for formatting the month. Must be one or more of the `M` character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory (`/`) character. Defaults to `MM`. +* **dayPattern**: Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for day of year. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory (`/`) character. Defaults to `dd`. +* **hourPattern**: Pattern for formatting the hour. Must be one or more of the `H` character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory (`/`) character. Defaults to `HH`. +* **minutePattern**: Pattern for formatting the minute. Must be one or more of the `m` character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory (`/`) character. Defaults to `mm`. 
@@ -237,7 +237,7 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_avro_flex" { parameters = { outputDirectory = "" avroTempDirectory = "" - # inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" + # inputSubscription = "" # inputTopic = "" # outputFilenamePrefix = "output" # outputFilenameSuffix = "" diff --git a/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_GCS_Text_Flex.md b/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_GCS_Text_Flex.md index f7384d273f..0407ba3b81 100644 --- a/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_GCS_Text_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_Cloud_PubSub_to_GCS_Text_Flex.md @@ -20,23 +20,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputDirectory** : The path and filename prefix to write write output files to. This value must end in a slash. (Example: gs://your-bucket/your-path). +* **outputDirectory**: The path and filename prefix to write output files to. This value must end in a slash. For example, `gs://your-bucket/your-path/`. ### Optional parameters -* **inputTopic** : The Pub/Sub topic to read the input from. The topic name should be in the format `projects//topics/`. If this parameter is provided don't use `inputSubscription`. (Example: projects/your-project-id/topics/your-topic-name). -* **inputSubscription** : The Pub/Sub subscription to read the input from. The subscription name uses the format `projects//subscription/`. If this parameter is provided, don't use `inputTopic`. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **userTempLocation** : The user provided directory to output temporary files to. Must end with a slash. -* **outputFilenamePrefix** : The prefix to place on each windowed file. (Example: output-). Defaults to: output.
-* **outputFilenameSuffix** : The suffix to place on each windowed file, typically a file extension such as `.txt` or `.csv`. (Example: .txt). Defaults to empty. -* **outputShardTemplate** : The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. This means that all data outputs into a single file per window. The `outputShardTemplate` defaults to `W-P-SS-of-NN` where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. -* **numShards** : The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 0. -* **windowDuration** : The window duration is the interval in which data is written to the output directory. Configure the duration based on the pipeline's throughput. For example, a higher throughput might require smaller window sizes so that the data fits into memory. Defaults to 5m (5 minutes), with a minimum of 1s (1 second). Allowed formats are: [int]s (for seconds, example: 5s), [int]m (for minutes, example: 12m), [int]h (for hours, example: 2h). (Example: 5m). -* **yearPattern** : Pattern for formatting the year. Must be one or more of 'y' or 'Y'. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory ('/') character. Defaults to 'YYYY'. -* **monthPattern** : Pattern for formatting the month. Must be one or more of the 'M' character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory ('/') character. Defaults to 'MM'. -* **dayPattern** : Pattern for formatting the day. 
Must be one or more of 'd' for day of month or 'D' for day of year. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory ('/') character. Defaults to 'dd'. -* **hourPattern** : Pattern for formatting the hour. Must be one or more of the 'H' character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory ('/') character. Defaults to 'HH'. -* **minutePattern** : Pattern for formatting the minute. Must be one or more of the 'm' character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory ('/') character. Defaults to 'mm'. +* **inputTopic**: The Pub/Sub topic to read the input from. If this parameter is provided don't use `inputSubscription`. For example, `projects//topics/`. +* **inputSubscription**: The Pub/Sub subscription to read the input from. If this parameter is provided, don't use `inputTopic`. For example, `projects//subscription/`. +* **userTempLocation**: The user provided directory to output temporary files to. Must end with a slash. +* **outputFilenamePrefix**: The prefix to place on each windowed file. For example, `output-`. Defaults to: output. +* **outputFilenameSuffix**: The suffix to place on each windowed file, typically a file extension such as `.txt` or `.csv`. For example, `.txt`. Defaults to empty. +* **outputShardTemplate**: The shard template defines the dynamic portion of each windowed file. By default, the pipeline uses a single shard for output to the file system within each window. This means that all data outputs into a single file per window. The `outputShardTemplate` defaults to `W-P-SS-of-NN` where `W` is the window date range, `P` is the pane info, `S` is the shard number, and `N` is the number of shards. In case of a single file, the `SS-of-NN` portion of the `outputShardTemplate` is `00-of-01`. 
+* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 0. +* **windowDuration**: The window duration is the interval in which data is written to the output directory. Configure the duration based on the pipeline's throughput. For example, a higher throughput might require smaller window sizes so that the data fits into memory. Defaults to `5m` (5 minutes), with a minimum of `1s` (1 second). Allowed formats are: `[int]s` (for seconds, example: `5s`), `[int]m` (for minutes, example: `12m`), `[int]h` (for hours, example: `2h`). For example, `5m`. +* **yearPattern**: Pattern for formatting the year. Must be one or more of `y` or `Y`. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory (`/`) character. Defaults to `YYYY`. +* **monthPattern**: Pattern for formatting the month. Must be one or more of the `M` character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory (`/`) character. Defaults to `MM`. +* **dayPattern**: Pattern for formatting the day. Must be one or more of `d` for day of month or `D` for day of year. Case makes no difference in the year. The pattern can be optionally wrapped by characters that aren't either alphanumeric or the directory (`/`) character. Defaults to `dd`. +* **hourPattern**: Pattern for formatting the hour. Must be one or more of the `H` character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory (`/`) character. Defaults to `HH`. +* **minutePattern**: Pattern for formatting the minute. Must be one or more of the `m` character. The pattern can be optionally wrapped by characters that aren't alphanumeric or the directory (`/`) character. 
Defaults to `mm`. @@ -237,12 +237,12 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_gcs_text_flex" { name = "cloud-pubsub-to-gcs-text-flex" region = var.region parameters = { - outputDirectory = "gs://your-bucket/your-path" - # inputTopic = "projects/your-project-id/topics/your-topic-name" - # inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" + outputDirectory = "" + # inputTopic = "" + # inputSubscription = "" # userTempLocation = "" - # outputFilenamePrefix = "output-" - # outputFilenameSuffix = ".txt" + # outputFilenamePrefix = "output" + # outputFilenameSuffix = "" # outputShardTemplate = "W-P-SS-of-NN" # numShards = "0" # windowDuration = "5m" diff --git a/v2/googlecloud-to-googlecloud/README_Cloud_Spanner_to_BigQuery_Flex.md b/v2/googlecloud-to-googlecloud/README_Cloud_Spanner_to_BigQuery_Flex.md index 5ddbabacbb..2469d5ba11 100644 --- a/v2/googlecloud-to-googlecloud/README_Cloud_Spanner_to_BigQuery_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_Cloud_Spanner_to_BigQuery_Flex.md @@ -17,21 +17,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **spannerInstanceId** : The instance ID of the Spanner database to read from. -* **spannerDatabaseId** : The database ID of the Spanner database to export. -* **outputTableSpec** : The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. +* **spannerInstanceId**: The instance ID of the Spanner database to read from. +* **spannerDatabaseId**: The database ID of the Spanner database to export. +* **outputTableSpec**: The BigQuery output table location to write the output to. For example, `:.`. Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema.
### Optional parameters -* **spannerProjectId** : The ID of the project that the Spanner database resides in. The default value for this parameter is the project where the Dataflow pipeline is running. -* **spannerTableId** : The table name of the Spanner database to export. Ignored if sqlQuery is set. -* **spannerRpcPriority** : The request priority (https://cloud.google.com/spanner/docs/reference/rest/v1/RequestOptions) for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `HIGH`. -* **sqlQuery** : The SQL query to use to read data from the Spanner database. Required if spannerTableId is empty. -* **bigQuerySchemaPath** : The Cloud Storage path (gs://) to the JSON file that defines your BigQuery schema. (Example: gs://your-bucket/your-schema.json). -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. 
+* **spannerProjectId**: The ID of the project that the Spanner database resides in. The default value for this parameter is the project where the Dataflow pipeline is running. +* **spannerTableId**: The table name of the Spanner database to export. Ignored if sqlQuery is set. +* **spannerRpcPriority**: The request priority (https://cloud.google.com/spanner/docs/reference/rest/v1/RequestOptions) for Spanner calls. Possible values are `HIGH`, `MEDIUM`, and `LOW`. The default value is `HIGH`. +* **sqlQuery**: The SQL query to use to read data from the Spanner database. Required if spannerTableId is empty. +* **bigQuerySchemaPath**: The Cloud Storage path (gs://) to the JSON file that defines your BigQuery schema. This is required if the Create Disposition is not CREATE_NEVER. For example, `gs://your-bucket/your-schema.json`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`.
The default value is `false`. @@ -233,7 +233,7 @@ resource "google_dataflow_flex_template_job" "cloud_spanner_to_bigquery_flex" { # spannerTableId = "" # spannerRpcPriority = "" # sqlQuery = "" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" + # bigQuerySchemaPath = "" # writeDisposition = "WRITE_APPEND" # createDisposition = "CREATE_IF_NEEDED" # useStorageWriteApi = "false" diff --git a/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Flex.md b/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Flex.md index 9da3ecd774..1e90e0564f 100644 --- a/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Flex.md @@ -13,21 +13,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process (Example: gs://your-bucket/your-files/temp_dir). -* **firestoreReadGqlQuery** : Specifies which Firestore entities to read. Ex: ‘SELECT * FROM MyKind’. -* **firestoreReadProjectId** : The Google Cloud project ID of the Firestore instance to read from. +* **outputTableSpec**: BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. +* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. +* **firestoreReadGqlQuery**: Specifies which Firestore entities to read. Ex: ‘SELECT * FROM MyKind’. +* **firestoreReadProjectId**: The Google Cloud project ID of the Firestore instance to read from. ### Optional parameters -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema.
If `createDisposition` is not set, or set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **firestoreReadNamespace** : Namespace of requested Firestore entities. Leave blank to use default namespace. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. 
+* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is not set, or set to CREATE_IF_NEEDED, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. +* **firestoreReadNamespace**: Namespace of requested Firestore entities. Leave blank to use default namespace. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). 
For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. ## User-Defined functions (UDFs) @@ -233,12 +233,12 @@ resource "google_dataflow_flex_template_job" "firestore_to_bigquery_flex" { region = var.region parameters = { outputTableSpec = "" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" + bigQueryLoadingTemporaryDirectory = "" firestoreReadGqlQuery = "" firestoreReadProjectId = "" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" + # bigQuerySchemaPath = "" # firestoreReadNamespace = "" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" diff --git a/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Xlang.md b/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Xlang.md index 9b8ade772d..7c409d7f54 100644 --- a/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Xlang.md +++ b/v2/googlecloud-to-googlecloud/README_Firestore_to_BigQuery_Xlang.md @@ -13,21 +13,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process (Example: gs://your-bucket/your-files/temp_dir). -* **firestoreReadGqlQuery** : Specifies which Firestore entities to read. Ex: ‘SELECT * FROM MyKind’. -* **firestoreReadProjectId** : The Google Cloud project ID of the Firestore instance to read from. +* **outputTableSpec**: BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. 
+* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. +* **firestoreReadGqlQuery**: Specifies which Firestore entities to read. Ex: ‘SELECT * FROM MyKind’. +* **firestoreReadProjectId**: The Google Cloud project ID of the Firestore instance to read from. ### Optional parameters -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is not set, or set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **firestoreReadNamespace** : Namespace of requested Firestore entities. Leave blank to use default namespace. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value.
For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is not set, or set to CREATE_IF_NEEDED, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. +* **firestoreReadNamespace**: Namespace of requested Firestore entities. Leave blank to use default namespace. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. 
Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. @@ -223,13 +223,13 @@ resource "google_dataflow_flex_template_job" "firestore_to_bigquery_xlang" { region = var.region parameters = { outputTableSpec = "" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" + bigQueryLoadingTemporaryDirectory = "" firestoreReadGqlQuery = "" firestoreReadProjectId = "" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" + # bigQuerySchemaPath = "" # firestoreReadNamespace = "" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" # writeDisposition = "WRITE_APPEND" diff --git a/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Flex.md b/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Flex.md index 0e550f728a..67537d2a1e 100644 --- a/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Flex.md @@ -19,17 +19,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The gs:// path to the text in Cloud Storage you'd like to process. (Example: gs://your-bucket/your-file.txt). -* **JSONPath** : The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. (Example: gs://your-bucket/your-schema.json). -* **outputTable** : The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. (Example: :.). 
-* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) you want to use. (Example: gs://your-bucket/your-transforms/*.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) that you want to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples) (Example: transform_udf1). -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process. (Example: gs://your-bucket/your-files/temp-dir). +* **inputFilePattern**: The gs:// path to the text in Cloud Storage you'd like to process. For example, `gs://your-bucket/your-file.txt`. +* **JSONPath**: The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. For example, `gs://your-bucket/your-schema.json`. +* **outputTable**: The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. For example, `:.`. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) you want to use. For example, `gs://your-bucket/your-transforms/*.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) that you want to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). For example, `transform_udf1`. +* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp-dir`.
### Optional parameters -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. ## User-Defined functions (UDFs) @@ -222,12 +222,12 @@ resource "google_dataflow_flex_template_job" "gcs_text_to_bigquery_flex" { name = "gcs-text-to-bigquery-flex" region = var.region parameters = { - inputFilePattern = "gs://your-bucket/your-file.txt" - JSONPath = "gs://your-bucket/your-schema.json" - outputTable = ":." 
- javascriptTextTransformGcsPath = "gs://your-bucket/your-transforms/*.js" - javascriptTextTransformFunctionName = "transform_udf1" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp-dir" + inputFilePattern = "" + JSONPath = "" + outputTable = "" + javascriptTextTransformGcsPath = "" + javascriptTextTransformFunctionName = "" + bigQueryLoadingTemporaryDirectory = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" } diff --git a/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Xlang.md b/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Xlang.md index bc4dc83fbb..d5eb89cf5b 100644 --- a/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Xlang.md +++ b/v2/googlecloud-to-googlecloud/README_GCS_Text_to_BigQuery_Xlang.md @@ -19,17 +19,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The gs:// path to the text in Cloud Storage you'd like to process. (Example: gs://your-bucket/your-file.txt). -* **JSONPath** : The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. (Example: gs://your-bucket/your-schema.json). -* **outputTable** : The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process. (Example: gs://your-bucket/your-files/temp-dir). +* **inputFilePattern**: The gs:// path to the text in Cloud Storage you'd like to process. For example, `gs://your-bucket/your-file.txt`. +* **JSONPath**: The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. For example, `gs://your-bucket/your-schema.json`. +* **outputTable**: The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. For example, `:.`. 
+* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp-dir`. ### Optional parameters -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. 
+* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. @@ -212,14 +212,14 @@ resource "google_dataflow_flex_template_job" "gcs_text_to_bigquery_xlang" { name = "gcs-text-to-bigquery-xlang" region = var.region parameters = { - inputFilePattern = "gs://your-bucket/your-file.txt" - JSONPath = "gs://your-bucket/your-schema.json" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp-dir" + inputFilePattern = "" + JSONPath = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Auto.md b/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Auto.md index 239d289dbd..952ab809b5 100644 --- a/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Auto.md +++ b/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Auto.md @@ -15,22 +15,22 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name'. -* **outputTableSpec** : BigQuery table location to write the output to. The table's schema must match the input JSON objects. 
+* **inputSubscription**: Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name'. +* **outputTableSpec**: BigQuery table location to write the output to. The table's schema must match the input JSON objects. ### Optional parameters -* **bigQuerySchemaPath** : sample text. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. -* **outputDeadletterTable** : Messages failed to reach the output table for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. It should be in the format of "your-project-id:your-dataset.your-table-name". If it doesn't exist, it will be created during pipeline execution. If not specified, "{outputTableSpec}_error_records" is used instead. -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. 
-* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **bigQuerySchemaPath**: sample text. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. +* **outputDeadletterTable**: Messages that failed to reach the output table for all kinds of reasons (e.g., mismatched schema, malformed JSON) are written to this table. It should be in the format of "your-project-id:your-dataset.your-table-name". If it doesn't exist, it will be created during pipeline execution. If not specified, "{outputTableSpec}_error_records" is used instead. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`.
To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. ## User-Defined functions (UDFs) @@ -241,7 +241,7 @@ resource "google_dataflow_flex_template_job" "pubsub_to_bigquery_auto" { inputSubscription = "" outputTableSpec = "" # bigQuerySchemaPath = "" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" # outputDeadletterTable = "" diff --git a/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Flex.md b/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Flex.md index 6fed071961..f8ce04277d 100644 --- a/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Flex.md @@ -20,20 +20,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : The BigQuery table to write to, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. +* **outputTableSpec**: The BigQuery table to write to, formatted as `PROJECT_ID:DATASET_NAME.TABLE_NAME`. ### Optional parameters -* **inputTopic** : The Pub/Sub topic to read from, formatted as `"projects//topics/"`. -* **inputSubscription** : The Pub/Sub subscription to read from, formatted as `"projects//subscriptions/"`. 
-* **outputDeadletterTable** : The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the value `"OUTPUT_TABLE_SPEC_error_records"` is used instead. -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. +* **inputTopic**: The Pub/Sub topic to read from, formatted as `projects//topics/`. +* **inputSubscription**: The Pub/Sub subscription to read from, formatted as `projects//subscriptions/`. +* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `PROJECT_ID:DATASET_NAME.TABLE_NAME`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the value `OUTPUT_TABLE_SPEC_error_records` is used instead. +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. 
If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. 
## User-Defined functions (UDFs) @@ -243,7 +243,7 @@ resource "google_dataflow_flex_template_job" "pubsub_to_bigquery_flex" { # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" # storageWriteApiTriggeringFrequencySec = "" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" } diff --git a/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Xlang.md b/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Xlang.md index 7cdc228797..02f874fd90 100644 --- a/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Xlang.md +++ b/v2/googlecloud-to-googlecloud/README_PubSub_to_BigQuery_Xlang.md @@ -20,19 +20,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : The BigQuery table to write to, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. +* **outputTableSpec**: The BigQuery table to write to, formatted as `PROJECT_ID:DATASET_NAME.TABLE_NAME`. ### Optional parameters -* **inputTopic** : The Pub/Sub topic to read from, formatted as `"projects//topics/"`. -* **inputSubscription** : The Pub/Sub subscription to read from, formatted as `"projects//subscriptions/"`. -* **outputDeadletterTable** : The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the value `"OUTPUT_TABLE_SPEC_error_records"` is used instead. -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true. To use exactly-once semantics, set the parameter to `false`. 
This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). +* **inputTopic**: The Pub/Sub topic to read from, formatted as `projects//topics/`. +* **inputSubscription**: The Pub/Sub subscription to read from, formatted as `projects//subscriptions/`. +* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `PROJECT_ID:DATASET_NAME.TABLE_NAME`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the value `OUTPUT_TABLE_SPEC_error_records` is used instead. +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. 
To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `transform` or `transform_udf1`.
@@ -229,8 +229,8 @@ resource "google_dataflow_flex_template_job" "pubsub_to_bigquery_xlang" { # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" # storageWriteApiTriggeringFrequencySec = "" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/googlecloud-to-googlecloud/README_Pubsub_to_Jdbc.md b/v2/googlecloud-to-googlecloud/README_Pubsub_to_Jdbc.md index a25c488f12..eed46b8c56 100644 --- a/v2/googlecloud-to-googlecloud/README_Pubsub_to_Jdbc.md +++ b/v2/googlecloud-to-googlecloud/README_Pubsub_to_Jdbc.md @@ -18,21 +18,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : The Pub/Sub input subscription to read from, in the format of 'projects//subscriptions/' (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. Remove whitespace characters from the Base64-encoded string. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **statement** : The statement to run against the database. The statement must specify the column names of the table in any order. Only the values of the specified column names are read from the JSON and added to the statement. (Example: INSERT INTO tableName (column1, column2) VALUES (?,?)). -* **outputDeadletterTopic** : The Pub/Sub topic to forward undeliverable messages to. (Example: projects//topics/). 
+* **inputSubscription**: The Pub/Sub input subscription to read from. For example, `projects//subscriptions/`. +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. Remove whitespace characters from the Base64-encoded string. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **statement**: The statement to run against the database. The statement must specify the column names of the table in any order. Only the values of the specified column names are read from the JSON and added to the statement. For example, `INSERT INTO tableName (column1, column2) VALUES (?,?)`. +* **outputDeadletterTopic**: The Pub/Sub topic to forward undeliverable messages to. For example, `projects//topics/`. ### Optional parameters -* **username** : The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. -* **password** : The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. -* **connectionProperties** : The properties string to use for the JDBC connection. The string must use the format `[propertyName=property;]*`. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted. (Example: projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}). -* **disabledAlgorithms** : Comma separated algorithms to disable. 
If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **username**: The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. +* **password**: The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. +* **connectionProperties**: The properties string to use for the JDBC connection. The string must use the format `[propertyName=property;]*`. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted. For example, `projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}`. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. 
@@ -227,18 +227,18 @@ resource "google_dataflow_flex_template_job" "pubsub_to_jdbc" { name = "pubsub-to-jdbc" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - statement = "INSERT INTO tableName (column1, column2) VALUES (?,?)" - outputDeadletterTopic = "projects//topics/" + inputSubscription = "" + driverClassName = "" + connectionUrl = "" + driverJars = "" + statement = "" + outputDeadletterTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # connectionProperties = "" + # KMSEncryptionKey = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_BigQuery.md b/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_BigQuery.md index 01427afc3e..02702d0bcc 100644 --- a/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_BigQuery.md +++ b/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_BigQuery.md @@ -98,32 +98,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **spannerInstanceId** : The Spanner instance to read change streams from. -* **spannerDatabase** : The Spanner database to read change streams from. -* **spannerMetadataInstanceId** : The Spanner instance to use for the change streams connector metadata table. -* **spannerMetadataDatabase** : The Spanner database to use for the change streams connector metadata table. 
-* **spannerChangeStreamName** : The name of the Spanner change stream to read from. -* **bigQueryDataset** : The BigQuery dataset for change streams output. +* **spannerInstanceId**: The Spanner instance to read change streams from. +* **spannerDatabase**: The Spanner database to read change streams from. +* **spannerMetadataInstanceId**: The Spanner instance to use for the change streams connector metadata table. +* **spannerMetadataDatabase**: The Spanner database to use for the change streams connector metadata table. +* **spannerChangeStreamName**: The name of the Spanner change stream to read from. +* **bigQueryDataset**: The BigQuery dataset for change streams output. ### Optional parameters -* **spannerProjectId** : The project to read change streams from. This value is also the project where the change streams connector metadata table is created. The default value for this parameter is the project where the Dataflow pipeline is running. -* **spannerDatabaseRole** : The Spanner database role to use when running the template. This parameter is required only when the IAM principal who is running the template is a fine-grained access control user. The database role must have the SELECT privilege on the change stream and the EXECUTE privilege on the change stream's read function. For more information, see Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams). -* **spannerMetadataTableName** : The Spanner change streams connector metadata table name to use. If not provided, a Spanner change streams connector metadata table is automatically created during the pipeline flow. You must provide this parameter when updating an existing pipeline. Otherwise, don't provide this parameter. -* **rpcPriority** : The request priority for Spanner calls. The value must be one of the following values: `HIGH`, `MEDIUM`, or `LOW`. The default value is `HIGH`. 
-* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://batch-spanner.googleapis.com). -* **startTimestamp** : The starting DateTime (https://datatracker.ietf.org/doc/html/rfc3339), inclusive, to use for reading change streams. Ex-2021-10-12T07:20:50.52Z. Defaults to the timestamp when the pipeline starts, that is, the current time. -* **endTimestamp** : The ending DateTime (https://datatracker.ietf.org/doc/html/rfc3339), inclusive, to use for reading change streams.Ex-2021-10-12T07:20:50.52Z. Defaults to an infinite time in the future. -* **bigQueryProjectId** : The BigQuery project. The default value is the project for the Dataflow job. -* **bigQueryChangelogTableNameTemplate** : The template for the name of the BigQuery table that contains the changelog. Defaults to: {_metadata_spanner_table_name}_changelog. -* **deadLetterQueueDirectory** : The path to store any unprocessed records. The default path is a directory under the Dataflow job's temp location. The default value is usually sufficient. -* **dlqRetryMinutes** : The number of minutes between dead-letter queue retries. The default value is 10. -* **ignoreFields** : A comma-separated list of fields (case sensitive) to ignore. These fields might be fields of watched tables, or metadata fields added by the pipeline. Ignored fields aren't inserted into BigQuery. When you ignore the _metadata_spanner_table_name field, the bigQueryChangelogTableNameTemplate parameter is also ignored. Defaults to empty. -* **disableDlqRetries** : Whether or not to disable retries for the DLQ. Defaults to: false. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). 
-* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **spannerProjectId**: The project to read change streams from. This value is also the project where the change streams connector metadata table is created. The default value for this parameter is the project where the Dataflow pipeline is running. +* **spannerDatabaseRole**: The Spanner database role to use when running the template. This parameter is required only when the IAM principal who is running the template is a fine-grained access control user. The database role must have the `SELECT` privilege on the change stream and the `EXECUTE` privilege on the change stream's read function. For more information, see Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams). +* **spannerMetadataTableName**: The Spanner change streams connector metadata table name to use. If not provided, a Spanner change streams connector metadata table is automatically created during the pipeline flow. You must provide this parameter when updating an existing pipeline. Otherwise, don't provide this parameter. 
+* **rpcPriority**: The request priority for Spanner calls. The value must be one of the following values: `HIGH`, `MEDIUM`, or `LOW`. The default value is `HIGH`. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. For example, `https://batch-spanner.googleapis.com`. +* **startTimestamp**: The starting DateTime (https://datatracker.ietf.org/doc/html/rfc3339), inclusive, to use for reading change streams. For example, `2021-10-12T07:20:50.52Z`. Defaults to the timestamp when the pipeline starts, that is, the current time. +* **endTimestamp**: The ending DateTime (https://datatracker.ietf.org/doc/html/rfc3339), inclusive, to use for reading change streams. For example, `2021-10-12T07:20:50.52Z`. Defaults to an infinite time in the future. +* **bigQueryProjectId**: The BigQuery project. The default value is the project for the Dataflow job. +* **bigQueryChangelogTableNameTemplate**: The template for the name of the BigQuery table that contains the changelog. Defaults to: {_metadata_spanner_table_name}_changelog. +* **deadLetterQueueDirectory**: The path to store any unprocessed records. The default path is a directory under the Dataflow job's temp location. The default value is usually sufficient. +* **dlqRetryMinutes**: The number of minutes between dead-letter queue retries. The default value is `10`. +* **ignoreFields**: A comma-separated list of fields (case sensitive) to ignore. These fields might be fields of watched tables, or metadata fields added by the pipeline. Ignored fields aren't inserted into BigQuery. When you ignore the _metadata_spanner_table_name field, the bigQueryChangelogTableNameTemplate parameter is also ignored. Defaults to empty. +* **disableDlqRetries**: Whether or not to disable retries for the DLQ. Defaults to: false. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`.
For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. @@ -361,7 +361,7 @@ resource "google_dataflow_flex_template_job" "spanner_change_streams_to_bigquery # spannerDatabaseRole = "" # spannerMetadataTableName = "" # rpcPriority = "HIGH" - # spannerHost = "https://batch-spanner.googleapis.com" + # spannerHost = "" # startTimestamp = "" # endTimestamp = "" # bigQueryProjectId = "" diff --git a/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_Google_Cloud_Storage.md b/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_Google_Cloud_Storage.md index 88c2cc987c..f5ac99b0ea 100644 --- a/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_Google_Cloud_Storage.md +++ b/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_Google_Cloud_Storage.md @@ -42,26 +42,26 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **spannerInstanceId** : The Spanner instance ID to read change streams data from. 
-* **spannerDatabase** : The Spanner database to read change streams data from. -* **spannerMetadataInstanceId** : The Spanner instance ID to use for the change streams connector metadata table. -* **spannerMetadataDatabase** : The Spanner database to use for the change streams connector metadata table. -* **spannerChangeStreamName** : The name of the Spanner change stream to read from. -* **gcsOutputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. (Example: gs://your-bucket/your-path). +* **spannerInstanceId**: The Spanner instance ID to read change streams data from. +* **spannerDatabase**: The Spanner database to read change streams data from. +* **spannerMetadataInstanceId**: The Spanner instance ID to use for the change streams connector metadata table. +* **spannerMetadataDatabase**: The Spanner database to use for the change streams connector metadata table. +* **spannerChangeStreamName**: The name of the Spanner change stream to read from. +* **gcsOutputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. For example, `gs://your-bucket/your-path`. ### Optional parameters -* **spannerProjectId** : The ID of the Google Cloud project that contains the Spanner database to read change streams from. This project is also where the change streams connector metadata table is created. The default for this parameter is the project where the Dataflow pipeline is running. -* **spannerDatabaseRole** : The Spanner database role to use when running the template. This parameter is required only when the IAM principal who is running the template is a fine-grained access control user. The database role must have the `SELECT` privilege on the change stream and the `EXECUTE` privilege on the change stream's read function. 
For more information, see Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams). -* **spannerMetadataTableName** : The Spanner change streams connector metadata table name to use. If not provided, a Spanner change streams metadata table is automatically created during pipeline execution. You must provide a value for this parameter when updating an existing pipeline. Otherwise, don't use this parameter. -* **startTimestamp** : The starting DateTime, inclusive, to use for reading change streams, in the format Ex-2021-10-12T07:20:50.52Z. Defaults to the timestamp when the pipeline starts, that is, the current time. -* **endTimestamp** : The ending DateTime, inclusive, to use for reading change streams. For example, Ex-2021-10-12T07:20:50.52Z. Defaults to an infinite time in the future. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://spanner.googleapis.com). Defaults to: https://spanner.googleapis.com. -* **outputFileFormat** : The format of the output Cloud Storage file. Allowed formats are TEXT and AVRO. Defaults to AVRO. -* **windowDuration** : The window duration is the interval in which data is written to the output directory. Configure the duration based on the pipeline's throughput. For example, a higher throughput might require smaller window sizes so that the data fits into memory. Defaults to 5m (five minutes), with a minimum of 1s (one second). Allowed formats are: [int]s (for seconds, example: 5s), [int]m (for minutes, example: 12m), [int]h (for hours, example: 2h). (Example: 5m). -* **rpcPriority** : The request priority for Spanner calls. The value must be HIGH, MEDIUM, or LOW. Defaults to HIGH. -* **outputFilenamePrefix** : The prefix to place on each windowed file. (Example: output-). Defaults to: output. -* **numShards** : The maximum number of output shards produced when writing. 
A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 20. +* **spannerProjectId**: The ID of the Google Cloud project that contains the Spanner database to read change streams from. This project is also where the change streams connector metadata table is created. The default for this parameter is the project where the Dataflow pipeline is running. +* **spannerDatabaseRole**: The Spanner database role to use when running the template. This parameter is required only when the IAM principal who is running the template is a fine-grained access control user. The database role must have the `SELECT` privilege on the change stream and the `EXECUTE` privilege on the change stream's read function. For more information, see Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams). +* **spannerMetadataTableName**: The Spanner change streams connector metadata table name to use. If not provided, a Spanner change streams metadata table is automatically created during pipeline execution. You must provide a value for this parameter when updating an existing pipeline. Otherwise, don't use this parameter. +* **startTimestamp**: The starting DateTime, inclusive, to use for reading change streams. For example, `2021-10-12T07:20:50.52Z`. Defaults to the timestamp when the pipeline starts, that is, the current time. +* **endTimestamp**: The ending DateTime, inclusive, to use for reading change streams. For example, `2021-10-12T07:20:50.52Z`. Defaults to an infinite time in the future. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. For example, `https://spanner.googleapis.com`. Defaults to: https://spanner.googleapis.com. +* **outputFileFormat**: The format of the output Cloud Storage file. Allowed formats are `TEXT` and `AVRO`.
Defaults to `AVRO`. +* **windowDuration**: The window duration is the interval in which data is written to the output directory. Configure the duration based on the pipeline's throughput. For example, a higher throughput might require smaller window sizes so that the data fits into memory. Defaults to 5m (five minutes), with a minimum of 1s (one second). Allowed formats are: [int]s (for seconds, example: 5s), [int]m (for minutes, example: 12m), [int]h (for hours, example: 2h). For example, `5m`. +* **rpcPriority**: The request priority for Spanner calls. The value must be `HIGH`, `MEDIUM`, or `LOW`. Defaults to `HIGH`. +* **outputFilenamePrefix**: The prefix to place on each windowed file. For example, `output-`. Defaults to: output. +* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Defaults to: 20. 
@@ -276,7 +276,7 @@ resource "google_dataflow_flex_template_job" "spanner_change_streams_to_google_c spannerMetadataInstanceId = "" spannerMetadataDatabase = "" spannerChangeStreamName = "" - gcsOutputDirectory = "gs://your-bucket/your-path" + gcsOutputDirectory = "" # spannerProjectId = "" # spannerDatabaseRole = "" # spannerMetadataTableName = "" @@ -286,7 +286,7 @@ resource "google_dataflow_flex_template_job" "spanner_change_streams_to_google_c # outputFileFormat = "AVRO" # windowDuration = "5m" # rpcPriority = "HIGH" - # outputFilenamePrefix = "output-" + # outputFilenamePrefix = "output" # numShards = "20" } } diff --git a/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_PubSub.md b/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_PubSub.md index cc4063bae8..5b313986e0 100644 --- a/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_PubSub.md +++ b/v2/googlecloud-to-googlecloud/README_Spanner_Change_Streams_to_PubSub.md @@ -33,27 +33,27 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **spannerInstanceId** : The Spanner instance to read change streams from. -* **spannerDatabase** : The Spanner database to read change streams from. -* **spannerMetadataInstanceId** : The Spanner instance to use for the change streams connector metadata table. -* **spannerMetadataDatabase** : The Spanner database to use for the change streams connector metadata table. -* **spannerChangeStreamName** : The name of the Spanner change stream to read from. -* **pubsubTopic** : The Pub/Sub topic for change streams output. +* **spannerInstanceId**: The Spanner instance to read change streams from. +* **spannerDatabase**: The Spanner database to read change streams from. +* **spannerMetadataInstanceId**: The Spanner instance to use for the change streams connector metadata table. +* **spannerMetadataDatabase**: The Spanner database to use for the change streams connector metadata table. 
+* **spannerChangeStreamName**: The name of the Spanner change stream to read from. +* **pubsubTopic**: The Pub/Sub topic for change streams output. ### Optional parameters -* **spannerProjectId** : The project to read change streams from. This project is also where the change streams connector metadata table is created. The default for this parameter is the project where the Dataflow pipeline is running. -* **spannerDatabaseRole** : The Spanner database role to use when running the template. This parameter is required only when the IAM principal who is running the template is a fine-grained access control user. The database role must have the `SELECT` privilege on the change stream and the `EXECUTE` privilege on the change stream's read function. For more information, see Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams). -* **spannerMetadataTableName** : The Spanner change streams connector metadata table name to use. If not provided, Spanner automatically creates the streams connector metadata table during the pipeline flow change. You must provide this parameter when updating an existing pipeline. Don't use this parameter for other cases. -* **startTimestamp** : The starting DateTime (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, ex- 2021-10-12T07:20:50.52Z. Defaults to the timestamp when the pipeline starts, that is, the current time. -* **endTimestamp** : The ending DateTime (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, ex- 2021-10-12T07:20:50.52Z. Defaults to an infinite time in the future. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. Only used for testing. (Example: https://spanner.googleapis.com). Defaults to: https://spanner.googleapis.com. -* **outputDataFormat** : The format of the output. Output is wrapped in many PubsubMessages and sent to a Pub/Sub topic. 
Allowed formats are JSON and AVRO. Default is JSON. -* **pubsubAPI** : The Pub/Sub API used to implement the pipeline. Allowed APIs are `pubsubio` and `native_client`. For a small number of queries per second (QPS), `native_client` has less latency. For a large number of QPS, `pubsubio` provides better and more stable performance. The default is `pubsubio`. -* **pubsubProjectId** : Project of Pub/Sub topic. The default for this parameter is the project where the Dataflow pipeline is running. -* **rpcPriority** : The request priority for Spanner calls. Allowed values are HIGH, MEDIUM, and LOW. Defaults to: HIGH). -* **includeSpannerSource** : Whether or not to include the spanner database id and instance id to read the change stream from in the output message data. Defaults to: false. -* **outputMessageMetadata** : The string value for the custom field outputMessageMetadata in output pub/sub message. Defaults to empty and the field outputMessageMetadata is only populated if this value is non-empty. Please escape any special characters when entering the value here(ie: double quotes). +* **spannerProjectId**: The project to read change streams from. This project is also where the change streams connector metadata table is created. The default for this parameter is the project where the Dataflow pipeline is running. +* **spannerDatabaseRole**: The Spanner database role to use when running the template. This parameter is required only when the IAM principal who is running the template is a fine-grained access control user. The database role must have the `SELECT` privilege on the change stream and the `EXECUTE` privilege on the change stream's read function. For more information, see Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams). +* **spannerMetadataTableName**: The Spanner change streams connector metadata table name to use. 
If not provided, Spanner automatically creates the streams connector metadata table during the pipeline flow change. You must provide this parameter when updating an existing pipeline. Don't use this parameter for other cases. +* **startTimestamp**: The starting DateTime (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2021-10-12T07:20:50.52Z`. Defaults to the timestamp when the pipeline starts, that is, the current time. +* **endTimestamp**: The ending DateTime (https://tools.ietf.org/html/rfc3339), inclusive, to use for reading change streams. For example, `2021-10-12T07:20:50.52Z`. Defaults to an infinite time in the future. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. Only used for testing. For example, `https://spanner.googleapis.com`. Defaults to: https://spanner.googleapis.com. +* **outputDataFormat**: The format of the output. Output is wrapped in many PubsubMessages and sent to a Pub/Sub topic. Allowed formats are JSON and AVRO. Default is JSON. +* **pubsubAPI**: The Pub/Sub API used to implement the pipeline. Allowed APIs are `pubsubio` and `native_client`. For a small number of queries per second (QPS), `native_client` has less latency. For a large number of QPS, `pubsubio` provides better and more stable performance. The default is `pubsubio`. +* **pubsubProjectId**: Project of Pub/Sub topic. The default for this parameter is the project where the Dataflow pipeline is running. +* **rpcPriority**: The request priority for Spanner calls. Allowed values are HIGH, MEDIUM, and LOW. Defaults to: HIGH. +* **includeSpannerSource**: Whether or not to include the spanner database id and instance id to read the change stream from in the output message data. Defaults to: false. +* **outputMessageMetadata**: The string value for the custom field outputMessageMetadata in output Pub/Sub message. 
Defaults to empty and the field outputMessageMetadata is only populated if this value is non-empty. Please escape any special characters when entering the value here (for example, double quotes). diff --git a/v2/googlecloud-to-googlecloud/README_Stream_DLP_GCS_Text_to_BigQuery_Flex.md b/v2/googlecloud-to-googlecloud/README_Stream_DLP_GCS_Text_to_BigQuery_Flex.md index 901afa480b..5c4b4355db 100644 --- a/v2/googlecloud-to-googlecloud/README_Stream_DLP_GCS_Text_to_BigQuery_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_Stream_DLP_GCS_Text_to_BigQuery_Flex.md @@ -34,19 +34,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The Cloud Storage location of the files you'd like to process. (Example: gs://your-bucket/your-files/*.csv). -* **deidentifyTemplateName** : Cloud DLP template to deidentify contents. Must be created here: https://console.cloud.google.com/security/dlp/create/template. (Example: projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id). -* **datasetName** : BigQuery Dataset to be used. Dataset must exist prior to execution. Ex. pii_dataset. -* **dlpProjectId** : Cloud DLP project ID to be used for data masking/tokenization. Ex. your-dlp-project. +* **inputFilePattern**: The Cloud Storage location of the files you'd like to process. For example, `gs://your-bucket/your-files/*.csv`. +* **deidentifyTemplateName**: Cloud DLP template to deidentify contents. Must be created here: https://console.cloud.google.com/security/dlp/create/template. For example, `projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id`. +* **datasetName**: BigQuery Dataset to be used. Dataset must exist prior to execution. For example, `pii_dataset`. +* **dlpProjectId**: Cloud DLP project ID to be used for data masking/tokenization. For example, `your-dlp-project`. ### Optional parameters -* **inspectTemplateName** : Cloud DLP template to inspect contents. 
(Example: projects/your-project-id/locations/global/inspectTemplates/generated_template_id). -* **batchSize** : Batch size contents (number of rows) to optimize DLP API call. Total size of the rows must not exceed 512 KB and total cell count must not exceed 50,000. Default batch size is set to 100. Ex. 1000. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **inspectTemplateName**: Cloud DLP template to inspect contents. For example, `projects/your-project-id/locations/global/inspectTemplates/generated_template_id`. +* **batchSize**: Batch size contents (number of rows) to optimize DLP API call. Total size of the rows must not exceed 512 KB and total cell count must not exceed 50,000. Default batch size is set to 100. Ex. 1000. 
+* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. 
@@ -235,11 +235,11 @@ resource "google_dataflow_flex_template_job" "stream_dlp_gcs_text_to_bigquery_fl name = "stream-dlp-gcs-text-to-bigquery-flex" region = var.region parameters = { - inputFilePattern = "gs://your-bucket/your-files/*.csv" - deidentifyTemplateName = "projects/your-project-id/locations/global/deidentifyTemplates/generated_template_id" + inputFilePattern = "" + deidentifyTemplateName = "" datasetName = "" dlpProjectId = "" - # inspectTemplateName = "projects/your-project-id/locations/global/inspectTemplates/generated_template_id" + # inspectTemplateName = "" # batchSize = "100" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" diff --git a/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Flex.md b/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Flex.md index c6243bb30f..46db964d2e 100644 --- a/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Flex.md +++ b/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Flex.md @@ -30,22 +30,22 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The gs:// path to the text in Cloud Storage you'd like to process. (Example: gs://your-bucket/your-file.txt). -* **JSONPath** : The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. (Example: gs://your-bucket/your-schema.json). -* **outputTable** : The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. (Example: :.). -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) you want to use. (Example: gs://your-bucket/your-transforms/*.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) that you want to use. 
For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples) (Example: transform_udf1). -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process. (Example: gs://your-bucket/your-files/temp-dir). +* **inputFilePattern**: The gs:// path to the text in Cloud Storage you'd like to process. For example, `gs://your-bucket/your-file.txt`. +* **JSONPath**: The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. For example, `gs://your-bucket/your-schema.json`. +* **outputTable**: The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. For example, `:.`. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) you want to use. For example, `gs://your-bucket/your-transforms/*.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) that you want to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). For example, `transform_udf1`. +* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp-dir`. ### Optional parameters -* **outputDeadletterTable** : Table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. (Example: :.). -* **useStorageWriteApiAtLeastOnce** : This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. 
If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. +* **outputDeadletterTable**: Table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. For example, `:.`. +* **useStorageWriteApiAtLeastOnce**: This parameter takes effect only if `Use BigQuery Storage Write API` is enabled. 
If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. ## User-Defined functions (UDFs) @@ -253,18 +253,18 @@ resource "google_dataflow_flex_template_job" "stream_gcs_text_to_bigquery_flex" name = "stream-gcs-text-to-bigquery-flex" region = var.region parameters = { - inputFilePattern = "gs://your-bucket/your-file.txt" - JSONPath = "gs://your-bucket/your-schema.json" - outputTable = ":." 
- javascriptTextTransformGcsPath = "gs://your-bucket/your-transforms/*.js" - javascriptTextTransformFunctionName = "transform_udf1" - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp-dir" - # outputDeadletterTable = ":." + inputFilePattern = "" + JSONPath = "" + outputTable = "" + javascriptTextTransformGcsPath = "" + javascriptTextTransformFunctionName = "" + bigQueryLoadingTemporaryDirectory = "" + # outputDeadletterTable = "" # useStorageWriteApiAtLeastOnce = "false" # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" # storageWriteApiTriggeringFrequencySec = "" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" + # pythonExternalTextTransformGcsPath = "" # javascriptTextTransformReloadIntervalMinutes = "0" } } diff --git a/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Xlang.md b/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Xlang.md index 33d45a8d29..c5649655b4 100644 --- a/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Xlang.md +++ b/v2/googlecloud-to-googlecloud/README_Stream_GCS_Text_to_BigQuery_Xlang.md @@ -30,20 +30,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputFilePattern** : The gs:// path to the text in Cloud Storage you'd like to process. (Example: gs://your-bucket/your-file.txt). -* **JSONPath** : The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. (Example: gs://your-bucket/your-schema.json). -* **outputTable** : The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : Temporary directory for BigQuery loading process. (Example: gs://your-bucket/your-files/temp-dir). +* **inputFilePattern**: The gs:// path to the text in Cloud Storage you'd like to process. For example, `gs://your-bucket/your-file.txt`. 
+* **JSONPath**: The gs:// path to the JSON file that defines your BigQuery schema, stored in Cloud Storage. For example, `gs://your-bucket/your-schema.json`. +* **outputTable**: The location of the BigQuery table to use to store the processed data. If you reuse an existing table, it is overwritten. For example, `:.`. +* **bigQueryLoadingTemporaryDirectory**: Temporary directory for BigQuery loading process. For example, `gs://your-bucket/your-files/temp-dir`. ### Optional parameters -* **outputDeadletterTable** : Table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. (Example: :.). -* **useStorageWriteApiAtLeastOnce** : This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). 
-* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). +* **outputDeadletterTable**: Table for messages that failed to reach the output table. If a table doesn't exist, it is created during pipeline execution. If not specified, `_error_records` is used. For example, `:.`. +* **useStorageWriteApiAtLeastOnce**: This parameter takes effect only if `Use BigQuery Storage Write API` is enabled. If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. 
@@ -235,17 +235,17 @@ resource "google_dataflow_flex_template_job" "stream_gcs_text_to_bigquery_xlang" name = "stream-gcs-text-to-bigquery-xlang" region = var.region parameters = { - inputFilePattern = "gs://your-bucket/your-file.txt" - JSONPath = "gs://your-bucket/your-schema.json" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp-dir" - # outputDeadletterTable = ":." + inputFilePattern = "" + JSONPath = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" + # outputDeadletterTable = "" # useStorageWriteApiAtLeastOnce = "false" # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" # storageWriteApiTriggeringFrequencySec = "" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamToBigQueryOptions.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamToBigQueryOptions.java index 81717246b1..39a7593438 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamToBigQueryOptions.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamToBigQueryOptions.java @@ -55,8 +55,8 @@ public interface BigtableChangeStreamToBigQueryOptions optional = true, description = "Write values as BigQuery BYTES", helpText = - "When set true values are written to BYTES column, otherwise to STRING column. " - + "Defaults to false.") + "When set to `true`, values are written to a column of type BYTES, otherwise to a column of type STRING . 
" + + "Defaults to: `false`.") @Default.Boolean(false) Boolean getWriteValuesAsBytes(); @@ -67,7 +67,7 @@ public interface BigtableChangeStreamToBigQueryOptions optional = true, description = "Write Bigtable timestamp as BigQuery INT", helpText = - "Whether to write the Bigtable timestamp as BigQuery `INT64`. When set to true, values are written to the `INT64` column." + "Whether to write the Bigtable timestamp as BigQuery INT64. When set to `true`, values are written to the INT64 column." + " Otherwise, values are written to the `TIMESTAMP` column. Columns affected: `timestamp`, `timestamp_from`, " + "and `timestamp_to`. Defaults to `false`. When set to `true`, the time is measured in microseconds " + "since the Unix epoch (January 1, 1970 at UTC).") @@ -117,7 +117,7 @@ public interface BigtableChangeStreamToBigQueryOptions optional = true, description = "Sets partition expiration time in milliseconds", helpText = - "Sets the changelog table partition expiration time, in milliseconds. When set to true, " + "Sets the changelog table partition expiration time, in milliseconds. When set to `true`, " + "partitions older than the specified number of milliseconds are deleted. 
" + "By default, no expiration is set.") Long getBigQueryChangelogTablePartitionExpirationMs(); diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamsToPubSubOptions.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamsToPubSubOptions.java index d4e7b519d1..14d30ac153 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamsToPubSubOptions.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/BigtableChangeStreamsToPubSubOptions.java @@ -80,7 +80,7 @@ public interface BigtableChangeStreamsToPubSubOptions optional = true, description = "Strip values for SetCell mutation", helpText = - "When set to true, the SET_CELL mutations are returned without new values set. Defaults to false. This parameter is useful when you don't need a new value to be present, also known as cache invalidation, or when values are extremely large and exceed Pub/Sub message size limits.") + "When set to `true`, the `SET_CELL` mutations are returned without new values set. Defaults to `false`. 
This parameter is useful when you don't need a new value to be present, also known as cache invalidation, or when values are extremely large and exceed Pub/Sub message size limits.") @Default.Boolean(false) Boolean getStripValues(); diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/PubsubToJdbcOptions.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/PubsubToJdbcOptions.java index a813f01beb..d5fda6508e 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/PubsubToJdbcOptions.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/PubsubToJdbcOptions.java @@ -28,9 +28,8 @@ public interface PubsubToJdbcOptions extends CommonTemplateOptions { order = 1, groupName = "Source", description = "Pub/Sub input subscription", - helpText = - "The Pub/Sub input subscription to read from, in the format of 'projects//subscriptions/'", - example = "projects/your-project-id/subscriptions/your-subscription-name") + helpText = "The Pub/Sub input subscription to read from.", + example = "projects//subscriptions/") @Validation.Required String getInputSubscription(); diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToBigQueryOptions.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToBigQueryOptions.java index 5aa77f880a..d92075fb06 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToBigQueryOptions.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToBigQueryOptions.java @@ -68,8 +68,8 @@ public interface SpannerChangeStreamsToBigQueryOptions description = "Spanner database role", helpText = "The Spanner database role to use when running the template. 
This parameter is required only when the IAM principal who is running the template is a" - + " fine-grained access control user. The database role must have the SELECT privilege on the change stream" - + " and the EXECUTE privilege on the change stream's read function. For more information, see" + + " fine-grained access control user. The database role must have the `SELECT` privilege on the change stream" + + " and the `EXECUTE` privilege on the change stream's read function. For more information, see" + " Fine-grained access control for change streams (https://cloud.google.com/spanner/docs/fgac-change-streams).") String getSpannerDatabaseRole(); @@ -217,7 +217,7 @@ public interface SpannerChangeStreamsToBigQueryOptions optional = true, description = "Dead letter queue retry minutes", helpText = - "The number of minutes between dead-letter queue retries. The default value is 10.") + "The number of minutes between dead-letter queue retries. The default value is `10`.") @Default.Integer(10) Integer getDlqRetryMinutes(); diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToGcsOptions.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToGcsOptions.java index 2667ade3db..11adfdd2bf 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToGcsOptions.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/SpannerChangeStreamsToGcsOptions.java @@ -119,7 +119,7 @@ public interface SpannerChangeStreamsToGcsOptions optional = true, description = "The timestamp to read change streams from", helpText = - "The starting DateTime, inclusive, to use for reading change streams, in the format Ex-2021-10-12T07:20:50.52Z. 
Defaults to the timestamp when the pipeline starts, that is, the current time.") + "The starting DateTime, inclusive, to use for reading change streams, in the format `Ex-2021-10-12T07:20:50.52Z`. Defaults to the timestamp when the pipeline starts, that is, the current time.") @Default.String("") String getStartTimestamp(); @@ -130,7 +130,7 @@ public interface SpannerChangeStreamsToGcsOptions optional = true, description = "The timestamp to read change streams to", helpText = - "The ending DateTime, inclusive, to use for reading change streams. For example, Ex-2021-10-12T07:20:50.52Z. Defaults to an infinite time in the future.") + "The ending DateTime, inclusive, to use for reading change streams. For example, `Ex-2021-10-12T07:20:50.52Z`. Defaults to an infinite time in the future.") @Default.String("") String getEndTimestamp(); @@ -153,7 +153,7 @@ public interface SpannerChangeStreamsToGcsOptions optional = true, description = "Output file format", helpText = - "The format of the output Cloud Storage file. Allowed formats are TEXT and AVRO. Defaults to AVRO.") + "The format of the output Cloud Storage file. Allowed formats are `TEXT` and `AVRO`. Defaults to `AVRO`.") @Default.Enum("AVRO") FileFormat getOutputFileFormat(); @@ -181,7 +181,7 @@ public interface SpannerChangeStreamsToGcsOptions optional = true, description = "Priority for Spanner RPC invocations", helpText = - "The request priority for Spanner calls. The value must be HIGH, MEDIUM, or LOW. Defaults to HIGH.") + "The request priority for Spanner calls. The value must be `HIGH`, `MEDIUM`, or `LOW`. 
Defaults to `HIGH`.") @Default.Enum("HIGH") RpcPriority getRpcPriority(); diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToBigQuery.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToBigQuery.java index a88f14f363..7e798c674f 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToBigQuery.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToBigQuery.java @@ -184,7 +184,7 @@ public interface Options groupName = "Target", description = "BigQuery output table", helpText = - "The BigQuery table to write to, formatted as `\"PROJECT_ID:DATASET_NAME.TABLE_NAME\"`.") + "The BigQuery table to write to, formatted as `PROJECT_ID:DATASET_NAME.TABLE_NAME`.") String getOutputTableSpec(); void setOutputTableSpec(String value); @@ -195,7 +195,7 @@ public interface Options optional = true, description = "Input Pub/Sub topic", helpText = - "The Pub/Sub topic to read from, formatted as `\"projects//topics/\"`.") + "The Pub/Sub topic to read from, formatted as `projects//topics/`.") String getInputTopic(); void setInputTopic(String value); @@ -207,7 +207,7 @@ public interface Options description = "Pub/Sub input subscription", helpText = "The Pub/Sub subscription to read from, " - + "formatted as `\"projects//subscriptions/\"`.") + + "formatted as `projects//subscriptions/`.") String getInputSubscription(); void setInputSubscription(String value); @@ -219,10 +219,10 @@ public interface Options "Table for messages failed to reach the output table (i.e., Deadletter table)", helpText = "The BigQuery table to use for messages that failed to reach the output table, " - + "formatted as `\"PROJECT_ID:DATASET_NAME.TABLE_NAME\"`. If the table " + + "formatted as `PROJECT_ID:DATASET_NAME.TABLE_NAME`. If the table " + "doesn't exist, it is created when the pipeline runs. 
" + "If this parameter is not specified, " - + "the value `\"OUTPUT_TABLE_SPEC_error_records\"` is used instead.") + + "the value `OUTPUT_TABLE_SPEC_error_records` is used instead.") String getOutputDeadletterTable(); void setOutputDeadletterTable(String value); diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/TextToBigQueryStreaming.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/TextToBigQueryStreaming.java index 3fbb79d6a0..e034993bac 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/TextToBigQueryStreaming.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/TextToBigQueryStreaming.java @@ -495,7 +495,7 @@ public interface TextToBigQueryStreamingOptions parentTriggerValues = {"true"}, description = "Use at at-least-once semantics in BigQuery Storage Write API", helpText = - "This parameter takes effect only if \"Use BigQuery Storage Write API\" is enabled. If" + "This parameter takes effect only if `Use BigQuery Storage Write API` is enabled. If" + " enabled the at-least-once semantics will be used for Storage Write API, otherwise" + " exactly-once semantics will be used.", hiddenUi = true) diff --git a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/pubsubtotext/PubsubToText.java b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/pubsubtotext/PubsubToText.java index 0b66afd011..6144929fd8 100644 --- a/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/pubsubtotext/PubsubToText.java +++ b/v2/googlecloud-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/templates/pubsubtotext/PubsubToText.java @@ -81,10 +81,9 @@ public interface Options optional = true, description = "Pub/Sub input topic", helpText = - "The Pub/Sub topic to read the input from. 
The topic name should be in the format " - + "`projects//topics/`. If this parameter is provided " + "The Pub/Sub topic to read the input from. If this parameter is provided " + "don't use `inputSubscription`.", - example = "projects/your-project-id/topics/your-topic-name") + example = "projects//topics/") String getInputTopic(); void setInputTopic(String value); @@ -95,10 +94,9 @@ public interface Options optional = true, description = "Pub/Sub input subscription", helpText = - "The Pub/Sub subscription to read the input from. The subscription name uses the format " - + "`projects//subscription/`. If this parameter is " + "The Pub/Sub subscription to read the input from. If this parameter is " + "provided, don't use `inputTopic`.", - example = "projects/your-project-id/subscriptions/your-subscription-name") + example = "projects//subscription/") String getInputSubscription(); void setInputSubscription(String value); @@ -110,7 +108,7 @@ public interface Options helpText = "The path and filename prefix to write write output files to. " + "This value must end in a slash.", - example = "gs://your-bucket/your-path") + example = "gs://your-bucket/your-path/") @Required String getOutputDirectory(); diff --git a/v2/googlecloud-to-mongodb/README_BigQuery_to_MongoDB.md b/v2/googlecloud-to-mongodb/README_BigQuery_to_MongoDB.md index 6ff2d00015..c57cc31c07 100644 --- a/v2/googlecloud-to-mongodb/README_BigQuery_to_MongoDB.md +++ b/v2/googlecloud-to-mongodb/README_BigQuery_to_MongoDB.md @@ -18,10 +18,10 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **mongoDbUri** : The MongoDB connection URI in the format mongodb+srv://:@. -* **database** : Database in MongoDB to store the collection. (Example: my-db). -* **collection** : The name of the collection in the MongoDB database. (Example: my-collection). -* **inputTableSpec** : The BigQuery table to read from. (Example: bigquery-project:dataset.input_table). 
+* **mongoDbUri**: The MongoDB connection URI in the format `mongodb+srv://:@`. +* **database**: Database in MongoDB to store the collection. For example, `my-db`. +* **collection**: The name of the collection in the MongoDB database. For example, `my-collection`. +* **inputTableSpec**: The BigQuery table to read from. For example, `bigquery-project:dataset.input_table`. ### Optional parameters @@ -196,9 +196,9 @@ resource "google_dataflow_flex_template_job" "bigquery_to_mongodb" { region = var.region parameters = { mongoDbUri = "" - database = "my-db" - collection = "my-collection" - inputTableSpec = "bigquery-project:dataset.input_table" + database = "" + collection = "" + inputTableSpec = "" } } ``` diff --git a/v2/googlecloud-to-mongodb/src/main/java/com/google/cloud/teleport/v2/mongodb/options/BigQueryToMongoDbOptions.java b/v2/googlecloud-to-mongodb/src/main/java/com/google/cloud/teleport/v2/mongodb/options/BigQueryToMongoDbOptions.java index 54fe4eddfa..d85130e296 100644 --- a/v2/googlecloud-to-mongodb/src/main/java/com/google/cloud/teleport/v2/mongodb/options/BigQueryToMongoDbOptions.java +++ b/v2/googlecloud-to-mongodb/src/main/java/com/google/cloud/teleport/v2/mongodb/options/BigQueryToMongoDbOptions.java @@ -31,7 +31,7 @@ public interface MongoDbOptions extends PipelineOptions, DataflowPipelineOptions order = 1, groupName = "Target", description = "MongoDB Connection URI", - helpText = "The MongoDB connection URI in the format mongodb+srv://:@.") + helpText = "The MongoDB connection URI in the format `mongodb+srv://:@`.") String getMongoDbUri(); void setMongoDbUri(String getMongoDbUri); diff --git a/v2/googlecloud-to-neo4j/README_Google_Cloud_to_Neo4j.md b/v2/googlecloud-to-neo4j/README_Google_Cloud_to_Neo4j.md index 34a618fd00..4d2d293361 100644 --- a/v2/googlecloud-to-neo4j/README_Google_Cloud_to_Neo4j.md +++ b/v2/googlecloud-to-neo4j/README_Google_Cloud_to_Neo4j.md @@ -20,17 +20,17 @@ on [Metadata 
Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **jobSpecUri** : The path to the job specification file, which contains the configuration for source and target metadata. +* **jobSpecUri**: The path to the job specification file, which contains the JSON description of data sources, Neo4j targets and actions. ### Optional parameters -* **neo4jConnectionUri** : The path to the Neo4j connection metadata JSON file. -* **neo4jConnectionSecretId** : The secret ID for the Neo4j connection metadata. This is an alternative to the GCS path option. -* **optionsJson** : Options JSON. Use runtime tokens. (Example: {token1:value1,token2:value2}). Defaults to empty. -* **readQuery** : Override SQL query. Defaults to empty. -* **inputFilePattern** : Override text file pattern (Example: gs://your-bucket/path/*.json). Defaults to empty. -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **neo4jConnectionUri**: The path to the Neo4j connection JSON file. +* **neo4jConnectionSecretId**: The secret ID for the Neo4j connection metadata. You can use this value as an alternative to the `neo4jConnectionUri`. +* **optionsJson**: A JSON object that is also called runtime tokens For example, `{token1:value1,token2:value2}. Spec can refer to $token1 and $token2.`. Defaults to empty. +* **readQuery**: SQL query override. Defaults to empty. +* **inputFilePattern**: The text file path override For example, `gs://your-bucket/path/*.json`. Defaults to empty. 
+* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. @@ -216,11 +216,11 @@ resource "google_dataflow_flex_template_job" "google_cloud_to_neo4j" { jobSpecUri = "" # neo4jConnectionUri = "" # neo4jConnectionSecretId = "" - # optionsJson = "{token1:value1,token2:value2}" + # optionsJson = "" # readQuery = "" - # inputFilePattern = "gs://your-bucket/path/*.json" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # inputFilePattern = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/helpers/SqlQuerySpec.java b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/helpers/BigQuerySpec.java similarity index 54% rename from v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/helpers/SqlQuerySpec.java rename to v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/helpers/BigQuerySpec.java index f11b4d1357..d1fcf8c4d7 100644 --- a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/helpers/SqlQuerySpec.java +++ b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/helpers/BigQuerySpec.java @@ -19,16 +19,25 @@ * Convenience object for invoking SQL query as well as providing descriptions for read and cast * phase of transform. 
*/ -public class SqlQuerySpec { +public class BigQuerySpec { private final String readDescription; private final String castDescription; private final String sql; + private final String queryTempProject; + private final String queryTempDataset; - public SqlQuerySpec(String readDescription, String castDescription, String sql) { + public BigQuerySpec( + String readDescription, + String castDescription, + String sql, + String queryTempProject, + String queryTempDataset) { this.readDescription = readDescription; this.castDescription = castDescription; this.sql = sql; + this.queryTempProject = queryTempProject; + this.queryTempDataset = queryTempDataset; } public String getReadDescription() { @@ -43,29 +52,50 @@ public String getSql() { return sql; } - public static class SqlQuerySpecBuilder { + public String getQueryTempProject() { + return queryTempProject; + } + + public String getQueryTempDataset() { + return queryTempDataset; + } + + public static class BigQuerySpecBuilder { private String readDescription; private String castDescription; private String sql; + private String queryTempProject; + private String queryTempDataset; - public SqlQuerySpecBuilder readDescription(String readDescription) { + public BigQuerySpecBuilder readDescription(String readDescription) { this.readDescription = readDescription; return this; } - public SqlQuerySpecBuilder castDescription(String castDescription) { + public BigQuerySpecBuilder castDescription(String castDescription) { this.castDescription = castDescription; return this; } - public SqlQuerySpecBuilder sql(String sql) { + public BigQuerySpecBuilder sql(String sql) { this.sql = sql; return this; } - public SqlQuerySpec build() { - return new SqlQuerySpec(readDescription, castDescription, sql); + public BigQuerySpecBuilder queryTempProject(String queryTempProject) { + this.queryTempProject = queryTempProject; + return this; + } + + public BigQuerySpecBuilder queryTempDataset(String queryTempDataset) { + this.queryTempDataset = 
queryTempDataset; + return this; + } + + public BigQuerySpec build() { + return new BigQuerySpec( + readDescription, castDescription, sql, queryTempProject, queryTempDataset); } } } diff --git a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySource.java b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySource.java index 829b25d46d..f0257d0239 100644 --- a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySource.java +++ b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySource.java @@ -22,11 +22,19 @@ public class BigQuerySource implements Source { private final String name; private final String query; + private final String queryTempProject; + private final String queryTempDataset; public BigQuerySource(String name, String query) { + this(name, query, null, null); + } + public BigQuerySource( + String name, String query, String queryTempProject, String queryTempDataset) { this.name = name; this.query = query; + this.queryTempProject = queryTempProject; + this.queryTempDataset = queryTempDataset; } @Override @@ -43,6 +51,14 @@ public String getQuery() { return query; } + public String getQueryTempProject() { + return queryTempProject; + } + + public String getQueryTempDataset() { + return queryTempDataset; + } + @Override public boolean equals(Object o) { if (this == o) { @@ -62,6 +78,19 @@ public int hashCode() { @Override public String toString() { - return "BigQuerySource{" + "name='" + name + '\'' + ", query='" + query + '\'' + '}'; + return "BigQuerySource{" + + "name='" + + name + + '\'' + + ", query='" + + query + + '\'' + + ", queryTempProject='" + + queryTempProject + + '\'' + + ", queryTempDataset='" + + queryTempDataset + + '\'' + + '}'; } } diff --git a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySourceProvider.java 
b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySourceProvider.java index 0cd31c12c8..23a7abb478 100644 --- a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySourceProvider.java +++ b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/sources/BigQuerySourceProvider.java @@ -15,11 +15,12 @@ */ package com.google.cloud.teleport.v2.neo4j.model.sources; +import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; +import java.util.Optional; import org.neo4j.importer.v1.sources.SourceProvider; public class BigQuerySourceProvider implements SourceProvider { - @Override public String supportedType() { return "bigquery"; @@ -27,6 +28,10 @@ public String supportedType() { @Override public BigQuerySource provide(ObjectNode node) { - return new BigQuerySource(node.get("name").textValue(), node.get("query").textValue()); + return new BigQuerySource( + node.get("name").textValue(), + node.get("query").textValue(), + Optional.ofNullable(node.get("query_temp_project")).map(JsonNode::textValue).orElse(null), + Optional.ofNullable(node.get("query_temp_dataset")).map(JsonNode::textValue).orElse(null)); } } diff --git a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/validation/BigQuerySourceProjectDatasetValidator.java b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/validation/BigQuerySourceProjectDatasetValidator.java new file mode 100644 index 0000000000..a293308768 --- /dev/null +++ b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/model/validation/BigQuerySourceProjectDatasetValidator.java @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.neo4j.model.validation; + +import com.google.cloud.teleport.v2.neo4j.model.sources.BigQuerySource; +import java.util.LinkedHashSet; +import java.util.Set; +import org.neo4j.importer.v1.sources.Source; +import org.neo4j.importer.v1.validation.SpecificationValidationResult; +import org.neo4j.importer.v1.validation.SpecificationValidator; + +public class BigQuerySourceProjectDatasetValidator implements SpecificationValidator { + + private static final String ERROR_CODE = "DFBQ-001"; + private final Set paths = new LinkedHashSet<>(); + + @Override + public void visitSource(int index, Source source) { + if (!(source instanceof BigQuerySource)) { + return; + } + + var queryTempProject = ((BigQuerySource) source).getQueryTempProject(); + var queryTempDataset = ((BigQuerySource) source).getQueryTempDataset(); + + if (queryTempProject != null && queryTempDataset == null) { + paths.add(String.format("$.sources[%d]", index)); + } + } + + @Override + public boolean report(SpecificationValidationResult.Builder builder) { + paths.forEach( + path -> + builder.addError( + path, + ERROR_CODE, + String.format( + "%s query_temp_project is provided, but query_temp_dataset is missing", path))); + return paths.isEmpty(); + } +} diff --git a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BigQueryImpl.java b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BigQueryImpl.java index e390302555..2c5dd64db2 100644 --- 
a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BigQueryImpl.java +++ b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BigQueryImpl.java @@ -15,8 +15,8 @@ */ package com.google.cloud.teleport.v2.neo4j.providers.bigquery; -import com.google.cloud.teleport.v2.neo4j.model.helpers.SqlQuerySpec; -import com.google.cloud.teleport.v2.neo4j.model.helpers.SqlQuerySpec.SqlQuerySpecBuilder; +import com.google.cloud.teleport.v2.neo4j.model.helpers.BigQuerySpec; +import com.google.cloud.teleport.v2.neo4j.model.helpers.BigQuerySpec.BigQuerySpecBuilder; import com.google.cloud.teleport.v2.neo4j.model.helpers.TargetQuerySpec; import com.google.cloud.teleport.v2.neo4j.model.helpers.TargetSequence; import com.google.cloud.teleport.v2.neo4j.model.job.OptionsParams; @@ -33,11 +33,9 @@ /** Provider implementation for reading and writing BigQuery. */ public class BigQueryImpl implements Provider { - private static final Logger LOG = LoggerFactory.getLogger(BigQueryImpl.class); private final BigQuerySource source; private final TargetSequence targetSequence; - private OptionsParams optionsParams; public BigQueryImpl(BigQuerySource source, TargetSequence targetSequence) { @@ -67,7 +65,7 @@ public PTransform> queryTargetBeamRows(TargetQuerySpec @Override public PTransform> queryMetadata() { - return new BqQueryToRow(getMetadataQueryBeamSpec(source)); + return new BqQueryToRow(getMetadataQueryBeamSpec()); } /** @@ -75,7 +73,7 @@ public PTransform> queryMetadata() { * * @return helper object includes metadata and SQL */ - public SqlQuerySpec getMetadataQueryBeamSpec(BigQuerySource source) { + public BigQuerySpec getMetadataQueryBeamSpec() { String baseQuery = source.getQuery(); @@ -85,10 +83,12 @@ public SqlQuerySpec getMetadataQueryBeamSpec(BigQuerySource source) { String zeroRowSql = "SELECT * FROM (" + baseQuery + ") LIMIT 0"; LOG.info("Reading BQ metadata with query: {}", zeroRowSql); - return new 
SqlQuerySpecBuilder() + return new BigQuerySpecBuilder() .readDescription("Read from BQ " + source.getName()) .castDescription("Cast to BeamRow " + source.getName()) .sql(zeroRowSql) + .queryTempProject(source.getQueryTempProject()) + .queryTempDataset(source.getQueryTempDataset()) .build(); } @@ -97,11 +97,13 @@ public SqlQuerySpec getMetadataQueryBeamSpec(BigQuerySource source) { * * @return helper object includes metadata and SQL */ - private SqlQuerySpec getSourceQueryBeamSpec() { - return new SqlQuerySpecBuilder() + private BigQuerySpec getSourceQueryBeamSpec() { + return new BigQuerySpecBuilder() .castDescription("Cast to BeamRow " + source.getName()) .readDescription("Read from BQ " + source.getName()) .sql(source.getQuery()) + .queryTempProject(source.getQueryTempProject()) + .queryTempDataset(source.getQueryTempDataset()) .build(); } @@ -110,7 +112,7 @@ private SqlQuerySpec getSourceQueryBeamSpec() { * * @return helper object includes metadata and SQL */ - private SqlQuerySpec getTargetQueryBeamSpec(TargetQuerySpec spec) { + private BigQuerySpec getTargetQueryBeamSpec(TargetQuerySpec spec) { var sourceFields = ModelUtils.getBeamFieldSet(spec.getSourceBeamSchema()); var target = spec.getTarget(); var startNodeTarget = spec.getStartNodeTarget(); @@ -118,12 +120,14 @@ private SqlQuerySpec getTargetQueryBeamSpec(TargetQuerySpec spec) { String sql = ModelUtils.getTargetSql( target, startNodeTarget, endNodeTarget, sourceFields, true, source.getQuery()); - return new SqlQuerySpecBuilder() + return new BigQuerySpecBuilder() .readDescription( targetSequence.getSequenceNumber(target) + ": Read from BQ " + target.getName()) .castDescription( targetSequence.getSequenceNumber(target) + ": Cast to BeamRow " + target.getName()) .sql(sql) + .queryTempProject(source.getQueryTempProject()) + .queryTempDataset(source.getQueryTempDataset()) .build(); } } diff --git a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BqQueryToRow.java 
b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BqQueryToRow.java index 1bf1ae6783..2b21b9db78 100644 --- a/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BqQueryToRow.java +++ b/v2/googlecloud-to-neo4j/src/main/java/com/google/cloud/teleport/v2/neo4j/providers/bigquery/BqQueryToRow.java @@ -16,7 +16,7 @@ package com.google.cloud.teleport.v2.neo4j.providers.bigquery; import com.google.api.services.bigquery.model.TableRow; -import com.google.cloud.teleport.v2.neo4j.model.helpers.SqlQuerySpec; +import com.google.cloud.teleport.v2.neo4j.model.helpers.BigQuerySpec; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.schemas.Schema; @@ -34,25 +34,33 @@ public class BqQueryToRow extends PTransform> { private static final Logger LOG = LoggerFactory.getLogger(BqQueryToRow.class); - private final SqlQuerySpec bqQuerySpec; + private final BigQuerySpec bqQuerySpec; - public BqQueryToRow(SqlQuerySpec bqQuerySpec) { + public BqQueryToRow(BigQuerySpec bqQuerySpec) { this.bqQuerySpec = bqQuerySpec; } @Override public PCollection expand(PBegin input) { - String rewrittenSql = this.bqQuerySpec.getSql(); LOG.info("Reading BQ with query: {}", rewrittenSql); - PCollection sourceRows = - input.apply( - bqQuerySpec.getReadDescription(), - BigQueryIO.readTableRowsWithSchema() - .fromQuery(rewrittenSql) - .usingStandardSql() - .withTemplateCompatibility()); + var read = + BigQueryIO.readTableRowsWithSchema() + .fromQuery(rewrittenSql) + .usingStandardSql() + .withTemplateCompatibility(); + + var queryTempProject = this.bqQuerySpec.getQueryTempProject(); + var queryTempDataset = this.bqQuerySpec.getQueryTempDataset(); + + if (queryTempProject != null && queryTempDataset != null) { + read = read.withQueryTempProjectAndDataset(queryTempProject, queryTempDataset); + } else if (queryTempDataset != null) { + read = 
read.withQueryTempDataset(queryTempDataset); + } + + PCollection sourceRows = input.apply(bqQuerySpec.getReadDescription(), read); Schema beamSchema = sourceRows.getSchema(); Coder rowCoder = SchemaCoder.of(beamSchema); diff --git a/v2/googlecloud-to-neo4j/src/main/resources/META-INF/services/org.neo4j.importer.v1.validation.SpecificationValidator b/v2/googlecloud-to-neo4j/src/main/resources/META-INF/services/org.neo4j.importer.v1.validation.SpecificationValidator index aa73128b88..8ccfbd6008 100644 --- a/v2/googlecloud-to-neo4j/src/main/resources/META-INF/services/org.neo4j.importer.v1.validation.SpecificationValidator +++ b/v2/googlecloud-to-neo4j/src/main/resources/META-INF/services/org.neo4j.importer.v1.validation.SpecificationValidator @@ -1,4 +1,5 @@ # keep this sorted +com.google.cloud.teleport.v2.neo4j.model.validation.BigQuerySourceProjectDatasetValidator com.google.cloud.teleport.v2.neo4j.model.validation.DuplicateAggregateFieldNameValidator com.google.cloud.teleport.v2.neo4j.model.validation.DuplicateTextHeaderValidator com.google.cloud.teleport.v2.neo4j.model.validation.InlineSourceDataValidator diff --git a/v2/googlecloud-to-neo4j/src/test/java/com/google/cloud/teleport/v2/neo4j/model/validation/BigQuerySourceProjectDatasetValidatorTest.java b/v2/googlecloud-to-neo4j/src/test/java/com/google/cloud/teleport/v2/neo4j/model/validation/BigQuerySourceProjectDatasetValidatorTest.java new file mode 100644 index 0000000000..9dfb88845c --- /dev/null +++ b/v2/googlecloud-to-neo4j/src/test/java/com/google/cloud/teleport/v2/neo4j/model/validation/BigQuerySourceProjectDatasetValidatorTest.java @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.neo4j.model.validation; + +import static com.google.common.truth.Truth.assertThat; +import static org.neo4j.importer.v1.ImportSpecificationDeserializer.deserialize; + +import java.io.StringReader; +import org.junit.Assert; +import org.junit.Test; +import org.neo4j.importer.v1.validation.InvalidSpecificationException; + +public class BigQuerySourceProjectDatasetValidatorTest { + @Test + public void fails_if_bigquery_source_only_has_temp_project_id_but_not_temp_dataset_id() { + var spec = + "{\n" + + " \"version\": \"1\",\n" + + " \"sources\": [{\n" + + " \"type\": \"bigquery\",\n" + + " \"name\": \"a-source\",\n" + + " \"query\": \"SELECT field_1 FROM project.dataset.table\",\n" + + " \"query_temp_project\": \"project\"\n" + + " }],\n" + + " \"targets\": {\n" + + " \"nodes\": [{\n" + + " \"name\": \"a-target\",\n" + + " \"source\": \"a-source\",\n" + + " \"write_mode\": \"merge\",\n" + + " \"labels\": [\"Placeholder\"],\n" + + " \"properties\": [\n" + + " {\"source_field\": \"field_1\", \"target_property\": \"property\"}\n" + + " ],\n" + + " \"schema\": {\n" + + " \"key_constraints\": [\n" + + " {\"name\": \"key property\", \"label\": \"Placeholder\", \"properties\": [\"property\"]}\n" + + " ]\n" + + " }\n" + + " }]\n" + + " }\n" + + "}"; + + var exception = + Assert.assertThrows( + InvalidSpecificationException.class, () -> deserialize(new StringReader(spec))); + + assertThat(exception).hasMessageThat().contains("1 error(s)"); + assertThat(exception).hasMessageThat().contains("0 warning(s)"); + 
assertThat(exception) + .hasMessageThat() + .contains("$.sources[0] query_temp_project is provided, but query_temp_dataset is missing"); + } +} diff --git a/v2/googlecloud-to-splunk/README_GCS_To_Splunk.md b/v2/googlecloud-to-splunk/README_GCS_To_Splunk.md index 368074a3e7..6b59be32f6 100644 --- a/v2/googlecloud-to-splunk/README_GCS_To_Splunk.md +++ b/v2/googlecloud-to-splunk/README_GCS_To_Splunk.md @@ -18,32 +18,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **invalidOutputPath** : Cloud Storage path where to write objects that could not be converted to Splunk objects or pushed to Splunk. (Example: gs://your-bucket/your-path). -* **inputFileSpec** : The Cloud Storage file pattern to search for CSV files. Example: gs://mybucket/test-*.csv. -* **deadletterTable** : Messages failed to reach the target for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. (Example: your-project:your-dataset.your-table-name). -* **url** : Splunk Http Event Collector (HEC) url. This should be routable from the VPC in which the pipeline runs. (Example: https://splunk-hec-host:8088). -* **tokenSource** : Source of the token. One of PLAINTEXT, KMS or SECRET_MANAGER. If tokenSource is set to KMS, tokenKMSEncryptionKey and encrypted token must be provided. If tokenSource is set to SECRET_MANAGER, tokenSecretId must be provided. If tokenSource is set to PLAINTEXT, token must be provided. +* **invalidOutputPath**: Cloud Storage path where to write objects that could not be converted to Splunk objects or pushed to Splunk. For example, `gs://your-bucket/your-path`. +* **inputFileSpec**: The Cloud Storage file pattern to search for CSV files. For example, `gs://mybucket/test-*.csv`. +* **deadletterTable**: Messages failed to reach the target for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. For example, `your-project:your-dataset.your-table-name`. 
+* **url**: Splunk Http Event Collector (HEC) url. This should be routable from the VPC in which the pipeline runs. For example, `https://splunk-hec-host:8088`. +* **tokenSource**: Source of the token. One of PLAINTEXT, KMS or SECRET_MANAGER. If tokenSource is set to KMS, tokenKMSEncryptionKey and encrypted token must be provided. If tokenSource is set to SECRET_MANAGER, tokenSecretId must be provided. If tokenSource is set to PLAINTEXT, token must be provided. ### Optional parameters -* **containsHeaders** : Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. -* **delimiter** : The column delimiter of the input text files. Default: use delimiter provided in csvFormat (Example: ,). -* **csvFormat** : CSV format specification to use for parsing records. Default is: Default. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. -* **jsonSchemaPath** : The path to the JSON schema. Defaults to: null. (Example: gs://path/to/schema). -* **largeNumFiles** : Set to true if number of files is in the tens of thousands. Defaults to: false. -* **csvFileEncoding** : The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16. Defaults to: UTF-8. -* **logDetailedCsvConversionErrors** : Set to true to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: false. -* **token** : Splunk Http Event Collector (HEC) authentication token. Must be provided if the tokenSource is set to PLAINTEXT or KMS. -* **batchCount** : Batch size for sending multiple events to Splunk HEC. Default 1 (no batching). -* **disableCertificateValidation** : Disable SSL certificate validation (true/false). 
Default false (validation enabled). If true, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored. -* **parallelism** : Maximum number of parallel requests. Default: 1 (no parallelism). -* **tokenKMSEncryptionKey** : The Cloud KMS key to decrypt the HEC token string. This parameter must be provided if the tokenSource is set to KMS. If this parameter is provided, token string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **tokenSecretId** : Secret Manager secret ID for the token. This parameter should be provided if the tokenSource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **rootCaCertificatePath** : The full URL to root CA certificate in Cloud Storage. The certificate provided in Cloud Storage must be DER-encoded and may be supplied in binary or printable (Base64) encoding. If the certificate is provided in Base64 encoding, it must be bounded at the beginning by -----BEGIN CERTIFICATE-----, and must be bounded at the end by -----END CERTIFICATE-----. If this parameter is provided, this private CA certificate file will be fetched and added to Dataflow worker's trust store in order to verify Splunk HEC endpoint's SSL certificate which is signed by that private CA. If this parameter is not provided, the default trust store is used. (Example: gs://mybucket/mycerts/privateCA.crt). -* **enableBatchLogs** : Parameter which specifies if logs should be enabled for batches written to Splunk. 
Defaults to: true. -* **enableGzipHttpCompression** : Parameter which specifies if HTTP requests sent to Splunk HEC should be GZIP encoded. Defaults to: true. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **containsHeaders**: Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. +* **delimiter**: The column delimiter of the input text files. Default: `,`. For example, `,`. +* **csvFormat**: CSV format specification to use for parsing records. Default is: `Default`. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. +* **jsonSchemaPath**: The path to the JSON schema. Defaults to `null`. For example, `gs://path/to/schema`. +* **largeNumFiles**: Set to true if number of files is in the tens of thousands. Defaults to `false`. +* **csvFileEncoding**: The CSV file character encoding format. Allowed values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`. Defaults to: UTF-8. +* **logDetailedCsvConversionErrors**: Set to `true` to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: `false`. +* **token**: Splunk Http Event Collector (HEC) authentication token.
Must be provided if the tokenSource is set to PLAINTEXT or KMS. +* **batchCount**: Batch size for sending multiple events to Splunk HEC. Default 1 (no batching). +* **disableCertificateValidation**: Disable SSL certificate validation (true/false). Default false (validation enabled). If true, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored. +* **parallelism**: Maximum number of parallel requests. Default: 1 (no parallelism). +* **tokenKMSEncryptionKey**: The Cloud KMS key to decrypt the HEC token string. This parameter must be provided if the tokenSource is set to KMS. If this parameter is provided, token string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **tokenSecretId**: Secret Manager secret ID for the token. This parameter should be provided if the tokenSource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **rootCaCertificatePath**: The full URL to root CA certificate in Cloud Storage. The certificate provided in Cloud Storage must be DER-encoded and may be supplied in binary or printable (Base64) encoding. If the certificate is provided in Base64 encoding, it must be bounded at the beginning by -----BEGIN CERTIFICATE-----, and must be bounded at the end by -----END CERTIFICATE-----. 
If this parameter is provided, this private CA certificate file will be fetched and added to Dataflow worker's trust store in order to verify Splunk HEC endpoint's SSL certificate which is signed by that private CA. If this parameter is not provided, the default trust store is used. For example, `gs://mybucket/mycerts/privateCA.crt`. +* **enableBatchLogs**: Parameter which specifies if logs should be enabled for batches written to Splunk. Defaults to: true. +* **enableGzipHttpCompression**: Parameter which specifies if HTTP requests sent to Splunk HEC should be GZIP encoded. Defaults to: true. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). 
## User-Defined functions (UDFs) @@ -281,15 +281,15 @@ resource "google_dataflow_flex_template_job" "gcs_to_splunk" { name = "gcs-to-splunk" region = var.region parameters = { - invalidOutputPath = "gs://your-bucket/your-path" + invalidOutputPath = "" inputFileSpec = "" - deadletterTable = "your-project:your-dataset.your-table-name" - url = "https://splunk-hec-host:8088" + deadletterTable = "" + url = "" tokenSource = "" # containsHeaders = "false" - # delimiter = "," + # delimiter = "" # csvFormat = "Default" - # jsonSchemaPath = "gs://path/to/schema" + # jsonSchemaPath = "" # largeNumFiles = "false" # csvFileEncoding = "UTF-8" # logDetailedCsvConversionErrors = "false" @@ -297,12 +297,12 @@ resource "google_dataflow_flex_template_job" "gcs_to_splunk" { # batchCount = "" # disableCertificateValidation = "" # parallelism = "" - # tokenKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # tokenSecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" - # rootCaCertificatePath = "gs://mybucket/mycerts/privateCA.crt" + # tokenKMSEncryptionKey = "" + # tokenSecretId = "" + # rootCaCertificatePath = "" # enableBatchLogs = "true" # enableGzipHttpCompression = "true" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" } } diff --git a/v2/googlecloud-to-splunk/README_GCS_To_Splunk_Xlang.md b/v2/googlecloud-to-splunk/README_GCS_To_Splunk_Xlang.md index 6810a88c9d..062d49e1b2 100644 --- a/v2/googlecloud-to-splunk/README_GCS_To_Splunk_Xlang.md +++ b/v2/googlecloud-to-splunk/README_GCS_To_Splunk_Xlang.md @@ -18,32 +18,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **invalidOutputPath** : Cloud Storage path where to write objects that could not be converted to Splunk objects or pushed to Splunk. 
(Example: gs://your-bucket/your-path). -* **inputFileSpec** : The Cloud Storage file pattern to search for CSV files. Example: gs://mybucket/test-*.csv. -* **deadletterTable** : Messages failed to reach the target for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. (Example: your-project:your-dataset.your-table-name). -* **url** : Splunk Http Event Collector (HEC) url. This should be routable from the VPC in which the pipeline runs. (Example: https://splunk-hec-host:8088). -* **tokenSource** : Source of the token. One of PLAINTEXT, KMS or SECRET_MANAGER. If tokenSource is set to KMS, tokenKMSEncryptionKey and encrypted token must be provided. If tokenSource is set to SECRET_MANAGER, tokenSecretId must be provided. If tokenSource is set to PLAINTEXT, token must be provided. +* **invalidOutputPath**: Cloud Storage path where to write objects that could not be converted to Splunk objects or pushed to Splunk. For example, `gs://your-bucket/your-path`. +* **inputFileSpec**: The Cloud Storage file pattern to search for CSV files. For example, `gs://mybucket/test-*.csv`. +* **deadletterTable**: Messages failed to reach the target for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. For example, `your-project:your-dataset.your-table-name`. +* **url**: Splunk Http Event Collector (HEC) url. This should be routable from the VPC in which the pipeline runs. For example, `https://splunk-hec-host:8088`. +* **tokenSource**: Source of the token. One of PLAINTEXT, KMS or SECRET_MANAGER. If tokenSource is set to KMS, tokenKMSEncryptionKey and encrypted token must be provided. If tokenSource is set to SECRET_MANAGER, tokenSecretId must be provided. If tokenSource is set to PLAINTEXT, token must be provided. ### Optional parameters -* **containsHeaders** : Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. 
-* **delimiter** : The column delimiter of the input text files. Default: use delimiter provided in csvFormat (Example: ,). -* **csvFormat** : CSV format specification to use for parsing records. Default is: Default. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. -* **jsonSchemaPath** : The path to the JSON schema. Defaults to: null. (Example: gs://path/to/schema). -* **largeNumFiles** : Set to true if number of files is in the tens of thousands. Defaults to: false. -* **csvFileEncoding** : The CSV file character encoding format. Allowed Values are US-ASCII, ISO-8859-1, UTF-8, and UTF-16. Defaults to: UTF-8. -* **logDetailedCsvConversionErrors** : Set to true to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: false. -* **token** : Splunk Http Event Collector (HEC) authentication token. Must be provided if the tokenSource is set to PLAINTEXT or KMS. -* **batchCount** : Batch size for sending multiple events to Splunk HEC. Default 1 (no batching). -* **disableCertificateValidation** : Disable SSL certificate validation (true/false). Default false (validation enabled). If true, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored. -* **parallelism** : Maximum number of parallel requests. Default: 1 (no parallelism). -* **tokenKMSEncryptionKey** : The Cloud KMS key to decrypt the HEC token string. This parameter must be provided if the tokenSource is set to KMS. If this parameter is provided, token string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. 
The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name). -* **tokenSecretId** : Secret Manager secret ID for the token. This parameter should be provided if the tokenSource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: projects/your-project-id/secrets/your-secret/versions/your-secret-version). -* **rootCaCertificatePath** : The full URL to root CA certificate in Cloud Storage. The certificate provided in Cloud Storage must be DER-encoded and may be supplied in binary or printable (Base64) encoding. If the certificate is provided in Base64 encoding, it must be bounded at the beginning by -----BEGIN CERTIFICATE-----, and must be bounded at the end by -----END CERTIFICATE-----. If this parameter is provided, this private CA certificate file will be fetched and added to Dataflow worker's trust store in order to verify Splunk HEC endpoint's SSL certificate which is signed by that private CA. If this parameter is not provided, the default trust store is used. (Example: gs://mybucket/mycerts/privateCA.crt). -* **enableBatchLogs** : Parameter which specifies if logs should be enabled for batches written to Splunk. Defaults to: true. -* **enableGzipHttpCompression** : Parameter which specifies if HTTP requests sent to Splunk HEC should be GZIP encoded. Defaults to: true. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. 
(Example: 'transform' or 'transform_udf1'). +* **containsHeaders**: Input CSV files contain a header record (true/false). Only required if reading CSV files. Defaults to: false. +* **delimiter**: The column delimiter of the input text files. Default: `,`. For example, `,`. +* **csvFormat**: CSV format specification to use for parsing records. Default is: `Default`. See https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.html for more details. Must match format names exactly found at: https://commons.apache.org/proper/commons-csv/apidocs/org/apache/commons/csv/CSVFormat.Predefined.html. +* **jsonSchemaPath**: The path to the JSON schema. Defaults to `null`. For example, `gs://path/to/schema`. +* **largeNumFiles**: Set to true if number of files is in the tens of thousands. Defaults to `false`. +* **csvFileEncoding**: The CSV file character encoding format. Allowed values are `US-ASCII`, `ISO-8859-1`, `UTF-8`, and `UTF-16`. Defaults to: UTF-8. +* **logDetailedCsvConversionErrors**: Set to `true` to enable detailed error logging when CSV parsing fails. Note that this may expose sensitive data in the logs (e.g., if the CSV file contains passwords). Default: `false`. +* **token**: Splunk Http Event Collector (HEC) authentication token. Must be provided if the tokenSource is set to PLAINTEXT or KMS. +* **batchCount**: Batch size for sending multiple events to Splunk HEC. Default 1 (no batching). +* **disableCertificateValidation**: Disable SSL certificate validation (true/false). Default false (validation enabled). If true, the certificates are not validated (all certificates are trusted) and `rootCaCertificatePath` parameter is ignored. +* **parallelism**: Maximum number of parallel requests. Default: 1 (no parallelism). +* **tokenKMSEncryptionKey**: The Cloud KMS key to decrypt the HEC token string. This parameter must be provided if the tokenSource is set to KMS.
If this parameter is provided, token string should be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. The Key should be in the format projects/{gcp_project}/locations/{key_region}/keyRings/{key_ring}/cryptoKeys/{kms_key_name}. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name`. +* **tokenSecretId**: Secret Manager secret ID for the token. This parameter should be provided if the tokenSource is set to SECRET_MANAGER. Should be in the format projects/{project}/secrets/{secret}/versions/{secret_version}. For example, `projects/your-project-id/secrets/your-secret/versions/your-secret-version`. +* **rootCaCertificatePath**: The full URL to root CA certificate in Cloud Storage. The certificate provided in Cloud Storage must be DER-encoded and may be supplied in binary or printable (Base64) encoding. If the certificate is provided in Base64 encoding, it must be bounded at the beginning by -----BEGIN CERTIFICATE-----, and must be bounded at the end by -----END CERTIFICATE-----. If this parameter is provided, this private CA certificate file will be fetched and added to Dataflow worker's trust store in order to verify Splunk HEC endpoint's SSL certificate which is signed by that private CA. If this parameter is not provided, the default trust store is used. For example, `gs://mybucket/mycerts/privateCA.crt`. +* **enableBatchLogs**: Parameter which specifies if logs should be enabled for batches written to Splunk. Defaults to: true. +* **enableGzipHttpCompression**: Parameter which specifies if HTTP requests sent to Splunk HEC should be GZIP encoded. Defaults to: true. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. 
+* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. @@ -271,15 +271,15 @@ resource "google_dataflow_flex_template_job" "gcs_to_splunk_xlang" { name = "gcs-to-splunk-xlang" region = var.region parameters = { - invalidOutputPath = "gs://your-bucket/your-path" + invalidOutputPath = "" inputFileSpec = "" - deadletterTable = "your-project:your-dataset.your-table-name" - url = "https://splunk-hec-host:8088" + deadletterTable = "" + url = "" tokenSource = "" # containsHeaders = "false" - # delimiter = "," + # delimiter = "" # csvFormat = "Default" - # jsonSchemaPath = "gs://path/to/schema" + # jsonSchemaPath = "" # largeNumFiles = "false" # csvFileEncoding = "UTF-8" # logDetailedCsvConversionErrors = "false" @@ -287,13 +287,13 @@ resource "google_dataflow_flex_template_job" "gcs_to_splunk_xlang" { # batchCount = "" # disableCertificateValidation = "" # parallelism = "" - # tokenKMSEncryptionKey = "projects/your-project-id/locations/global/keyRings/your-keyring/cryptoKeys/your-key-name" - # tokenSecretId = "projects/your-project-id/secrets/your-secret/versions/your-secret-version" - # rootCaCertificatePath = "gs://mybucket/mycerts/privateCA.crt" + # tokenKMSEncryptionKey = "" + # tokenSecretId = "" + # rootCaCertificatePath = "" # enableBatchLogs = "true" # enableGzipHttpCompression = "true" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/jdbc-to-googlecloud/README_Jdbc_to_BigQuery_Flex.md b/v2/jdbc-to-googlecloud/README_Jdbc_to_BigQuery_Flex.md index 34c940e11f..2db53b91c0 100644 --- a/v2/jdbc-to-googlecloud/README_Jdbc_to_BigQuery_Flex.md +++ b/v2/jdbc-to-googlecloud/README_Jdbc_to_BigQuery_Flex.md @@ -26,36 
+26,34 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverJars** : The comma-separated list of driver JAR files. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionURL** : The JDBC connection URL string. For example, `jdbc:mysql://some-host:3306/sampledb`. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. Remove whitespace characters from the Base64-encoded string. Note the difference between an Oracle non-RAC database connection string (`jdbc:oracle:thin:@some-host::`) and an Oracle RAC database connection string (`jdbc:oracle:thin:@//some-host[:]/`). (Example: jdbc:mysql://some-host:3306/sampledb). -* **outputTable** : The BigQuery output table location. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : The temporary directory for the BigQuery loading process. (Example: gs://your-bucket/your-files/temp_dir). +* **driverJars**: The comma-separated list of driver JAR files. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionURL**: The JDBC connection URL string. For example, `jdbc:mysql://some-host:3306/sampledb`. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. Remove whitespace characters from the Base64-encoded string. Note the difference between an Oracle non-RAC database connection string (`jdbc:oracle:thin:@some-host::`) and an Oracle RAC database connection string (`jdbc:oracle:thin:@//some-host[:]/`). For example, `jdbc:mysql://some-host:3306/sampledb`. +* **outputTable**: The BigQuery output table location. For example, `:.`. 
+* **bigQueryLoadingTemporaryDirectory**: The temporary directory for the BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`.For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. (Example: unicode=true;characterEncoding=UTF-8). -* **username** : The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **password** : The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **query** : The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are: -DATETIME --> TIMESTAMP - -Type casting may be required if your schemas do not match. This parameter can be set to a gs:// path pointing to a file in Cloud Storage to load the query from. The file encoding should be UTF-8. (Example: select * from sampledb.sample_table). -* **KMSEncryptionKey** : The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **useColumnAlias** : If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. 
Defaults to `false`. -* **isTruncate** : If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. -* **partitionColumn** : If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. -* **table** : The table to read from when using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. -* **lowerBound** : The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **upperBound** : The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **fetchSize** : The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. -* **createDisposition** : The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). 
-* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **password**: The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **query**: The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences.
Some important SQL -> BigQuery type mappings to keep in mind are `DATETIME --> TIMESTAMP`. Type casting may be required if your schemas do not match. For example, `select * from sampledb.sample_table`. +* **KMSEncryptionKey**: The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **useColumnAlias**: If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. +* **isTruncate**: If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. +* **partitionColumn**: If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. +* **table**: The table to read from when using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. +* **lowerBound**: The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **upperBound**: The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **fetchSize**: The number of rows to be fetched from database at a time. Not used for partitioned reads. 
Defaults to: 50000. +* **createDisposition**: The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to `CREATE_IF_NEEDED`, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. +* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the pipeline will fail on write errors. This parameter can only be specified if `useStorageWriteApi` or `useStorageWriteApiAtLeastOnce` is set to true. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`.
This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. @@ -156,6 +154,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -185,6 +184,7 @@ gcloud dataflow flex-template run "jdbc-to-bigquery-flex-job" \ --parameters "fetchSize=$FETCH_SIZE" \ --parameters "createDisposition=$CREATE_DISPOSITION" \ --parameters "bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH" \ + --parameters "outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE" \ --parameters "disabledAlgorithms=$DISABLED_ALGORITHMS" \ --parameters "extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ --parameters "useStorageWriteApi=$USE_STORAGE_WRITE_API" \ @@ -229,6 +229,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -241,7 +242,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="jdbc-to-bigquery-flex-job" \ -DtemplateName="Jdbc_to_BigQuery_Flex" \ 
--Dparameters="driverJars=$DRIVER_JARS,driverClassName=$DRIVER_CLASS_NAME,connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ +-Dparameters="driverJars=$DRIVER_JARS,driverClassName=$DRIVER_CLASS_NAME,connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ -f v2/jdbc-to-googlecloud ``` @@ -286,28 +287,29 @@ resource "google_dataflow_flex_template_job" "jdbc_to_bigquery_flex" { name = "jdbc-to-bigquery-flex" region = var.region parameters = { - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - driverClassName = "com.mysql.jdbc.Driver" - connectionURL 
= "jdbc:mysql://some-host:3306/sampledb" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + driverJars = "" + driverClassName = "" + connectionURL = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" + # connectionProperties = "" # username = "" # password = "" - # query = "select * from sampledb.sample_table" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # query = "" + # KMSEncryptionKey = "" # useColumnAlias = "false" # isTruncate = "false" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "" # numPartitions = "" # lowerBound = "" # upperBound = "" # fetchSize = "50000" # createDisposition = "CREATE_NEVER" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # bigQuerySchemaPath = "" + # outputDeadletterTable = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" } diff --git a/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub.md b/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub.md index 933422b961..2fba2e3cde 100644 --- a/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub.md +++ b/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub.md @@ -18,20 +18,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. 
For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma-separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : The query to run on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The Pub/Sub topic to publish to, in the format projects//topics/. (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: The query to run on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The Pub/Sub topic to publish to. For example, `projects//topics/`. ### Optional parameters -* **username** : The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **password** : The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. 
For example, `echo -n 'some_password' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **username**: The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **password**: The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **connectionProperties**: The properties string to use for the JDBC connection.
The format of the string must be `[propertyName=property;]*`. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. @@ -223,17 +223,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub" { name = "jdbc-to-pubsub" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # connectionProperties = "" + # KMSEncryptionKey = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git 
a/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub_Auto.md b/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub_Auto.md index ed9e24018d..959bd0103e 100644 --- a/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub_Auto.md +++ b/v2/jdbc-to-googlecloud/README_Jdbc_to_PubSub_Auto.md @@ -15,23 +15,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma separate Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : Query to be executed on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: Query to be executed on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The name of the topic to publish data to. For example, `projects//topics/`. ### Optional parameters -* **username** : User name to be used for the JDBC connection.
User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **partitionColumn** : If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. -* **table** : Table to read from using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. -* **lowerBound** : Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). -* **upperBound** : Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. 
+* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **partitionColumn**: If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. +* **table**: Table to read from using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. +* **lowerBound**: Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **upperBound**: Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). 
@@ -232,17 +232,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub_auto" { name = "jdbc-to-pubsub-auto" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # connectionProperties = "" + # KMSEncryptionKey = "" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" diff --git a/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToBigQueryOptions.java b/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToBigQueryOptions.java index 8f2769b483..2afa227c67 100644 --- a/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToBigQueryOptions.java +++ b/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToBigQueryOptions.java @@ -103,11 +103,8 @@ public interface JdbcToBigQueryOptions description = "JDBC source SQL query", helpText = "The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. " - + "Some important SQL -> BigQuery type mappings to keep in mind are:\n" - + "DATETIME --> TIMESTAMP\n" - + "\nType casting may be required if your schemas do not match. " - + "This parameter can be set to a gs:// path pointing to a file in Cloud Storage to load the query from. " - + "The file encoding should be UTF-8.", + + "Some important SQL -> BigQuery type mappings to keep in mind are `DATETIME --> TIMESTAMP`." + + " Type casting may be required if your schemas do not match.", example = "select * from sampledb.sample_table") String getQuery(); @@ -251,7 +248,7 @@ public interface JdbcToBigQueryOptions optional = true, description = "Cloud Storage path to BigQuery JSON schema", helpText = - "The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to CREATE_IF_NEEDED, this parameter must be specified.", + "The Cloud Storage path for the BigQuery JSON schema. 
If `createDisposition` is set to `CREATE_IF_NEEDED`, this parameter must be specified.", example = "gs://your-bucket/your-schema.json") String getBigQuerySchemaPath(); diff --git a/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToPubsubOptions.java b/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToPubsubOptions.java index bdd3c21d96..64a6b97406 100644 --- a/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToPubsubOptions.java +++ b/v2/jdbc-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/options/JdbcToPubsubOptions.java @@ -109,9 +109,8 @@ public interface JdbcToPubsubOptions extends CommonTemplateOptions { order = 8, groupName = "Target", description = "Output Pub/Sub topic", - helpText = - "The Pub/Sub topic to publish to, in the format projects//topics/.", - example = "projects/your-project-id/topics/your-topic-name") + helpText = "The Pub/Sub topic to publish to.", + example = "projects//topics/") @Validation.Required String getOutputTopic(); diff --git a/v2/jms-to-pubsub/README_Jms_to_PubSub.md b/v2/jms-to-pubsub/README_Jms_to_PubSub.md index 62410db107..813d400df4 100644 --- a/v2/jms-to-pubsub/README_Jms_to_PubSub.md +++ b/v2/jms-to-pubsub/README_Jms_to_PubSub.md @@ -17,15 +17,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputName** : The name of the JMS topic or queue that data is read from. (Example: queue). -* **inputType** : The JMS destination type to read data from. Can be a queue or a topic. (Example: queue). -* **outputTopic** : The name of the Pub/Sub topic to publish data to, in the format `projects//topics/`. (Example: projects/your-project-id/topics/your-topic-name). -* **username** : The username to use for authentication on the JMS server. (Example: sampleusername). -* **password** : The password associated with the provided username. (Example: samplepassword). 
+* **inputName**: The name of the JMS topic or queue that data is read from. For example, `queue`. +* **inputType**: The JMS destination type to read data from. Can be a queue or a topic. For example, `queue`. +* **outputTopic**: The name of the Pub/Sub topic to publish data to. For example, `projects//topics/`. +* **username**: The username to use for authentication on the JMS server. For example, `sampleusername`. +* **password**: The password associated with the provided username. For example, `samplepassword`. ### Optional parameters -* **jmsServer** : The JMS (ActiveMQ) Server IP. (Example: tcp://10.0.0.1:61616). +* **jmsServer**: The JMS (ActiveMQ) Server IP. For example, `tcp://10.0.0.1:61616`. @@ -202,12 +202,12 @@ resource "google_dataflow_flex_template_job" "jms_to_pubsub" { name = "jms-to-pubsub" region = var.region parameters = { - inputName = "queue" - inputType = "queue" - outputTopic = "projects/your-project-id/topics/your-topic-name" - username = "sampleusername" - password = "samplepassword" - # jmsServer = "tcp://10.0.0.1:61616" + inputName = "" + inputType = "" + outputTopic = "" + username = "" + password = "" + # jmsServer = "" } } ``` diff --git a/v2/jms-to-pubsub/src/main/java/com/google/cloud/teleport/v2/templates/JmsToPubsub.java b/v2/jms-to-pubsub/src/main/java/com/google/cloud/teleport/v2/templates/JmsToPubsub.java index 0c83a8292f..2c416dc9fd 100644 --- a/v2/jms-to-pubsub/src/main/java/com/google/cloud/teleport/v2/templates/JmsToPubsub.java +++ b/v2/jms-to-pubsub/src/main/java/com/google/cloud/teleport/v2/templates/JmsToPubsub.java @@ -178,9 +178,8 @@ public interface JmsToPubsubOptions extends PipelineOptions { order = 4, groupName = "Target", description = "Output Pub/Sub topic", - helpText = - "The name of the Pub/Sub topic to publish data to, in the format `projects//topics/`.", - example = "projects/your-project-id/topics/your-topic-name") + helpText = "The name of the Pub/Sub topic to publish data to.", + example = 
"projects//topics/") @Validation.Required String getOutputTopic(); diff --git a/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/KafkaReadOptions.java b/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/KafkaReadOptions.java index 6d42248293..91717c2bf2 100644 --- a/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/KafkaReadOptions.java +++ b/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/KafkaReadOptions.java @@ -102,18 +102,11 @@ final class Offset { }, description = "Kafka Source Authentication Mode", helpText = - ("The mode of authentication to use with the Kafka cluster. " - + "Use " - + KafkaAuthenticationMethod.NONE - + " for no authentication, " - + KafkaAuthenticationMethod.SASL_PLAIN - + " for SASL/PLAIN username and password, " - + KafkaAuthenticationMethod.TLS - + "for certificate-based authentication. " - + KafkaAuthenticationMethod.APPLICATION_DEFAULT_CREDENTIALS - + " should be used only for Google Cloud Apache Kafka for BigQuery cluster since " - + "This allow you to authenticate with Google Cloud Apache Kafka for BigQuery using application default credentials")) - @Default.String(KafkaAuthenticationMethod.APPLICATION_DEFAULT_CREDENTIALS) + "The mode of authentication to use with the Kafka cluster. " + + "Use `KafkaAuthenticationMethod.NONE` for no authentication, `KafkaAuthenticationMethod.SASL_PLAIN` for SASL/PLAIN username and password, " + + "and `KafkaAuthenticationMethod.TLS` for certificate-based authentication. 
`KafkaAuthenticationMethod.APPLICATION_DEFAULT_CREDENTIALS` " + + "should be used only for Google Cloud Apache Kafka for BigQuery cluster, it allows to authenticate using application default credentials.") + @Default.String(KafkaAuthenticationMethod.SASL_PLAIN) String getKafkaReadAuthenticationMode(); void setKafkaReadAuthenticationMode(String value); @@ -127,7 +120,7 @@ final class Offset { description = "Secret Version ID For Kafka SASL/PLAIN Username", helpText = "The Google Cloud Secret Manager secret ID that contains the Kafka username " - + "to use with SASL_PLAIN authentication.", + + "to use with `SASL_PLAIN` authentication.", example = "projects//secrets//versions/") @Default.String("") String getKafkaReadUsernameSecretId(); @@ -142,7 +135,7 @@ final class Offset { optional = true, description = "Secret Version ID For Kafka SASL/PLAIN Password", helpText = - "The Google Cloud Secret Manager secret ID that contains the Kafka password to use with SASL_PLAIN authentication.", + "The Google Cloud Secret Manager secret ID that contains the Kafka password to use with `SASL_PLAIN` authentication.", example = "projects//secrets//versions/") @Default.String("") String getKafkaReadPasswordSecretId(); diff --git a/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/SchemaRegistryOptions.java b/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/SchemaRegistryOptions.java index 36c93b5f28..932c3420c7 100644 --- a/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/SchemaRegistryOptions.java +++ b/v2/kafka-common/src/main/java/com/google/cloud/teleport/v2/kafka/options/SchemaRegistryOptions.java @@ -34,7 +34,7 @@ public interface SchemaRegistryOptions extends PipelineOptions { }, description = "Kafka Message Format", helpText = - "The format of the Kafka messages to read. 
The supported values are AVRO_CONFLUENT_WIRE_FORMAT (Confluent Schema Registry encoded Avro), AVRO_BINARY_ENCODING (Plain binary Avro), and JSON.") + "The format of the Kafka messages to read. The supported values are `AVRO_CONFLUENT_WIRE_FORMAT` (Confluent Schema Registry encoded Avro), `AVRO_BINARY_ENCODING` (Plain binary Avro), and `JSON`.") @Default.String(MessageFormatConstants.AVRO_CONFLUENT_WIRE_FORMAT) String getMessageFormat(); @@ -53,9 +53,9 @@ public interface SchemaRegistryOptions extends PipelineOptions { description = "Schema Source", optional = true, helpText = - "The Kafka schema format. Can be provided as SINGLE_SCHEMA_FILE or SCHEMA_REGISTRY. " - + "If SINGLE_SCHEMA_FILE is specified, all messages should have the schema mentioned in the avro schema file. " - + "If SCHEMA_REGISTRY is specified, the messages can have either a single schema or multiple schemas.") + "The Kafka schema format. Can be provided as `SINGLE_SCHEMA_FILE` or `SCHEMA_REGISTRY`. " + + "If `SINGLE_SCHEMA_FILE` is specified, use the schema mentioned in the avro schema file for all messages. " + + "If `SCHEMA_REGISTRY` is specified, the messages can have either a single schema or multiple schemas.") @Default.String(SchemaFormat.SINGLE_SCHEMA_FILE) String getSchemaFormat(); diff --git a/v2/kafka-to-bigquery/README_Kafka_to_BigQuery.md b/v2/kafka-to-bigquery/README_Kafka_to_BigQuery.md index 3bd8ec7c75..c5ab89ace9 100644 --- a/v2/kafka-to-bigquery/README_Kafka_to_BigQuery.md +++ b/v2/kafka-to-bigquery/README_Kafka_to_BigQuery.md @@ -21,26 +21,26 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. +* **outputTableSpec**: The BigQuery output table location to write the output to. 
For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. ### Optional parameters -* **readBootstrapServers** : Kafka Bootstrap Server list, separated by commas. (Example: localhost:9092,127.0.0.1:9093). -* **bootstrapServers** : The host address of the running Apache Kafka broker servers in a comma-separated list. Each host address must be in the format `35.70.252.199:9092`. (Example: localhost:9092,127.0.0.1:9093). -* **kafkaReadTopics** : Kafka topic(s) to read input from. (Example: topic1,topic2). -* **inputTopics** : The Apache Kafka input topics to read from in a comma-separated list. (Example: topic1,topic2). -* **outputDeadletterTable** : BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. (Example: your-project-id:your-dataset.your-table-name). -* **messageFormat** : The message format. Can be AVRO or JSON. Defaults to: JSON. -* **avroSchemaPath** : Cloud Storage path to Avro schema file. For example, gs://MyBucket/file.avsc. -* **useStorageWriteApiAtLeastOnce** : This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **readBootstrapServers**: Kafka Bootstrap Server list, separated by commas. For example, `localhost:9092,127.0.0.1:9093`. 
+* **bootstrapServers**: The host address of the running Apache Kafka broker servers in a comma-separated list. Each host address must be in the format `35.70.252.199:9092`. For example, `localhost:9092,127.0.0.1:9093`. +* **kafkaReadTopics**: Kafka topic(s) to read input from. For example, `topic1,topic2`. +* **inputTopics**: The Apache Kafka input topics to read from in a comma-separated list. For example, `topic1,topic2`. +* **outputDeadletterTable**: BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. For example, `your-project-id:your-dataset.your-table-name`. +* **messageFormat**: The message format. Can be AVRO or JSON. Defaults to: JSON. +* **avroSchemaPath**: Cloud Storage path to Avro schema file. For example, gs://MyBucket/file.avsc. +* **useStorageWriteApiAtLeastOnce**: This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. 
If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. 
## User-Defined functions (UDFs) @@ -261,15 +261,15 @@ resource "google_dataflow_flex_template_job" "kafka_to_bigquery" { region = var.region parameters = { outputTableSpec = "" - # readBootstrapServers = "localhost:9092,127.0.0.1:9093" - # bootstrapServers = "localhost:9092,127.0.0.1:9093" - # kafkaReadTopics = "topic1,topic2" - # inputTopics = "topic1,topic2" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" + # readBootstrapServers = "" + # bootstrapServers = "" + # kafkaReadTopics = "" + # inputTopics = "" + # outputDeadletterTable = "" # messageFormat = "JSON" # avroSchemaPath = "" # useStorageWriteApiAtLeastOnce = "false" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" # writeDisposition = "WRITE_APPEND" diff --git a/v2/kafka-to-bigquery/README_Kafka_to_BigQuery_Flex.md b/v2/kafka-to-bigquery/README_Kafka_to_BigQuery_Flex.md index 7400bca7b7..eae6292efe 100644 --- a/v2/kafka-to-bigquery/README_Kafka_to_BigQuery_Flex.md +++ b/v2/kafka-to-bigquery/README_Kafka_to_BigQuery_Flex.md @@ -23,53 +23,53 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **readBootstrapServerAndTopic** : Kafka Topic to read the input from. -* **writeMode** : Write Mode: write records to one table or multiple tables (based on schema). The DYNAMIC_TABLE_NAMES mode is supported only for AVRO_CONFLUENT_WIRE_FORMAT Source Message Format and SCHEMA_REGISTRY Schema Source. The target table name will be auto-generated based on the Avro schema name of each message, it could either be a single schema (creating a single table) or multiple schemas (creating multiple tables). The SINGLE_TABLE_NAME mode writes to a single table (single schema) specified by the user. Defaults to SINGLE_TABLE_NAME. 
-* **kafkaReadAuthenticationMode** : The mode of authentication to use with the Kafka cluster. Use NONE for no authentication, SASL_PLAIN for SASL/PLAIN username and password, TLSfor certificate-based authentication. APPLICATION_DEFAULT_CREDENTIALS should be used only for Google Cloud Apache Kafka for BigQuery cluster since This allow you to authenticate with Google Cloud Apache Kafka for BigQuery using application default credentials. -* **messageFormat** : The format of the Kafka messages to read. The supported values are AVRO_CONFLUENT_WIRE_FORMAT (Confluent Schema Registry encoded Avro), AVRO_BINARY_ENCODING (Plain binary Avro), and JSON. Defaults to: AVRO_CONFLUENT_WIRE_FORMAT. -* **useBigQueryDLQ** : If true, failed messages will be written to BigQuery with extra error information. Defaults to: false. +* **readBootstrapServerAndTopic**: Kafka Topic to read the input from. +* **writeMode**: Write records to one table or multiple tables (based on schema). The `DYNAMIC_TABLE_NAMES` mode is supported only for `AVRO_CONFLUENT_WIRE_FORMAT` Source Message Format and `SCHEMA_REGISTRY` Schema Source. The target table name is auto-generated based on the Avro schema name of each message, it could either be a single schema (creating a single table) or multiple schemas (creating multiple tables). The `SINGLE_TABLE_NAME` mode writes to a single table (single schema) specified by the user. Defaults to `SINGLE_TABLE_NAME`. +* **kafkaReadAuthenticationMode**: The mode of authentication to use with the Kafka cluster. Use `KafkaAuthenticationMethod.NONE` for no authentication, `KafkaAuthenticationMethod.SASL_PLAIN` for SASL/PLAIN username and password, and `KafkaAuthenticationMethod.TLS` for certificate-based authentication. `KafkaAuthenticationMethod.APPLICATION_DEFAULT_CREDENTIALS` should be used only for Google Cloud Apache Kafka for BigQuery cluster, it allows to authenticate using application default credentials. 
+* **messageFormat**: The format of the Kafka messages to read. The supported values are `AVRO_CONFLUENT_WIRE_FORMAT` (Confluent Schema Registry encoded Avro), `AVRO_BINARY_ENCODING` (Plain binary Avro), and `JSON`. Defaults to: AVRO_CONFLUENT_WIRE_FORMAT. +* **useBigQueryDLQ**: If true, failed messages will be written to BigQuery with extra error information. Defaults to: false. ### Optional parameters -* **outputTableSpec** : BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. -* **persistKafkaKey** : If true, the pipeline will persist the Kafka message key in the BigQuery table, in a `_key` field of type `BYTES`. Default is false (Key is ignored). -* **outputProject** : BigQuery output project in wehich the dataset resides. Tables will be created dynamically in the dataset. Defaults to empty. -* **outputDataset** : BigQuery output dataset to write the output to. Tables will be created dynamically in the dataset. If the tables are created beforehand, the table names should follow the specified naming convention. The name should be `bqTableNamePrefix + Avro Schema FullName` , each word will be separated by a hyphen '-'. Defaults to empty. -* **bqTableNamePrefix** : Naming prefix to be used while creating BigQuery output tables. Only applicable when using schema registry. Defaults to empty. -* **createDisposition** : BigQuery CreateDisposition. For example, CREATE_IF_NEEDED, CREATE_NEVER. Defaults to: CREATE_IF_NEEDED. -* **writeDisposition** : BigQuery WriteDisposition. For example, WRITE_APPEND, WRITE_EMPTY or WRITE_TRUNCATE. Defaults to: WRITE_APPEND. -* **useAutoSharding** : If true, the pipeline uses auto-sharding when writng to BigQueryThe default value is `true`. -* **numStorageWriteApiStreams** : Specifies the number of write streams, this parameter must be set. Default is 0. 
-* **storageWriteApiTriggeringFrequencySec** : Specifies the triggering frequency in seconds, this parameter must be set. Default is 5 seconds. -* **useStorageWriteApiAtLeastOnce** : This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. -* **enableCommitOffsets** : Commit offsets of processed messages to Kafka. If enabled, this will minimize the gaps or duplicate processing of messages when restarting the pipeline. Requires specifying the Consumer Group ID. Defaults to: false. -* **consumerGroupId** : The unique identifier for the consumer group that this pipeline belongs to. Required if Commit Offsets to Kafka is enabled. Defaults to empty. -* **kafkaReadOffset** : The starting point for reading messages when no committed offsets exist. The earliest starts from the beginning, the latest from the newest message. Defaults to: latest. -* **kafkaReadUsernameSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka username to use with SASL_PLAIN authentication. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaReadPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka password to use with SASL_PLAIN authentication. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaReadKeystoreLocation** : The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key to use when authenticating with the Kafka cluster. (Example: gs://your-bucket/keystore.jks). -* **kafkaReadTruststoreLocation** : The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the Kafka broker. 
-* **kafkaReadTruststorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for Kafka TLS authentication (Example: projects//secrets//versions/). -* **kafkaReadKeystorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java KeyStore (JKS) file for Kafka TLS authentication. (Example: projects//secrets//versions/). -* **kafkaReadKeyPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for Kafka TLS authentication. (Example: projects//secrets//versions/). -* **schemaFormat** : The Kafka schema format. Can be provided as SINGLE_SCHEMA_FILE or SCHEMA_REGISTRY. If SINGLE_SCHEMA_FILE is specified, all messages should have the schema mentioned in the avro schema file. If SCHEMA_REGISTRY is specified, the messages can have either a single schema or multiple schemas. Defaults to: SINGLE_SCHEMA_FILE. -* **confluentAvroSchemaPath** : The Google Cloud Storage path to the single Avro schema file used to decode all of the messages in a topic. Defaults to empty. -* **schemaRegistryConnectionUrl** : The URL for the Confluent Schema Registry instance used to manage Avro schemas for message decoding. Defaults to empty. -* **binaryAvroSchemaPath** : The Google Cloud Storage path to the Avro schema file used to decode binary-encoded Avro messages. Defaults to empty. -* **schemaRegistryAuthenticationMode** : Schema Registry authentication mode. Can be NONE, TLS or OAUTH. Defaults to: NONE. -* **schemaRegistryTruststoreLocation** : Location of the SSL certificate where the trust store for authentication to Schema Registry are stored. (Example: /your-bucket/truststore.jks). -* **schemaRegistryTruststorePasswordSecretId** : SecretId in secret manager where the password to access secret in truststore is stored. 
(Example: projects/your-project-number/secrets/your-secret-name/versions/your-secret-version). -* **schemaRegistryKeystoreLocation** : Keystore location that contains the SSL certificate and private key. (Example: /your-bucket/keystore.jks). -* **schemaRegistryKeystorePasswordSecretId** : SecretId in secret manager where the password to access the keystore file (Example: projects/your-project-number/secrets/your-secret-name/versions/your-secret-version). -* **schemaRegistryKeyPasswordSecretId** : SecretId of password required to access the client's private key stored within the keystore (Example: projects/your-project-number/secrets/your-secret-name/versions/your-secret-version). -* **schemaRegistryOauthClientId** : Client ID used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. -* **schemaRegistryOauthClientSecretId** : The Google Cloud Secret Manager secret ID that contains the Client Secret to use to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. (Example: projects//secrets//versions/). -* **schemaRegistryOauthScope** : The access token scope used to authenticate the Schema Registry client in OAUTH mode. This field is optional, as the request can be made without a scope parameter passed. (Example: openid). -* **schemaRegistryOauthTokenEndpointUrl** : The HTTP(S)-based URL for the OAuth/OIDC identity provider used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. -* **outputDeadletterTable** : Fully Qualified BigQuery table name for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table.The table will be created by the template. (Example: your-project-id:your-dataset.your-table-name). 
-* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. +* **outputTableSpec**: BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. +* **persistKafkaKey**: If true, the pipeline will persist the Kafka message key in the BigQuery table, in a `_key` field of type `BYTES`. Default is `false` (Key is ignored). +* **outputProject**: BigQuery output project in which the dataset resides. Tables will be created dynamically in the dataset. Defaults to empty. +* **outputDataset**: BigQuery output dataset to write the output to. Tables will be created dynamically in the dataset. If the tables are created beforehand, the table names should follow the specified naming convention. The name should be `bqTableNamePrefix + Avro Schema FullName` , each word will be separated by a hyphen `-`. Defaults to empty. +* **bqTableNamePrefix**: Naming prefix to be used while creating BigQuery output tables. Only applicable when using schema registry. Defaults to empty. 
+* **createDisposition**: BigQuery CreateDisposition. For example: `CREATE_IF_NEEDED`, `CREATE_NEVER`. Defaults to: CREATE_IF_NEEDED. +* **writeDisposition**: BigQuery WriteDisposition. For example: `WRITE_APPEND`, `WRITE_EMPTY` or `WRITE_TRUNCATE`. Defaults to: WRITE_APPEND. +* **useAutoSharding**: If true, the pipeline uses auto-sharding when writing to BigQuery. The default value is `true`. +* **numStorageWriteApiStreams**: Specifies the number of write streams, this parameter must be set. Default is `0`. +* **storageWriteApiTriggeringFrequencySec**: Specifies the triggering frequency in seconds, this parameter must be set. Default is 5 seconds. +* **useStorageWriteApiAtLeastOnce**: This parameter takes effect only if "Use BigQuery Storage Write API" is enabled. If enabled the at-least-once semantics will be used for Storage Write API, otherwise exactly-once semantics will be used. Defaults to: false. +* **enableCommitOffsets**: Commit offsets of processed messages to Kafka. If enabled, this will minimize the gaps or duplicate processing of messages when restarting the pipeline. Requires specifying the Consumer Group ID. Defaults to: false. +* **consumerGroupId**: The unique identifier for the consumer group that this pipeline belongs to. Required if Commit Offsets to Kafka is enabled. Defaults to empty. +* **kafkaReadOffset**: The starting point for reading messages when no committed offsets exist. The earliest starts from the beginning, the latest from the newest message. Defaults to: latest. +* **kafkaReadUsernameSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka username to use with `SASL_PLAIN` authentication. For example, `projects//secrets//versions/`. Defaults to empty. +* **kafkaReadPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka password to use with `SASL_PLAIN` authentication. For example, `projects//secrets//versions/`. Defaults to empty. 
+* **kafkaReadKeystoreLocation**: The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key to use when authenticating with the Kafka cluster. For example, `gs://your-bucket/keystore.jks`. +* **kafkaReadTruststoreLocation**: The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the Kafka broker. +* **kafkaReadTruststorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for Kafka TLS authentication For example, `projects//secrets//versions/`. +* **kafkaReadKeystorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java KeyStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. +* **kafkaReadKeyPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. +* **schemaFormat**: The Kafka schema format. Can be provided as `SINGLE_SCHEMA_FILE` or `SCHEMA_REGISTRY`. If `SINGLE_SCHEMA_FILE` is specified, use the schema mentioned in the avro schema file for all messages. If `SCHEMA_REGISTRY` is specified, the messages can have either a single schema or multiple schemas. Defaults to: SINGLE_SCHEMA_FILE. +* **confluentAvroSchemaPath**: The Google Cloud Storage path to the single Avro schema file used to decode all of the messages in a topic. Defaults to empty. +* **schemaRegistryConnectionUrl**: The URL for the Confluent Schema Registry instance used to manage Avro schemas for message decoding. Defaults to empty. +* **binaryAvroSchemaPath**: The Google Cloud Storage path to the Avro schema file used to decode binary-encoded Avro messages. Defaults to empty. 
+* **schemaRegistryAuthenticationMode**: Schema Registry authentication mode. Can be NONE, TLS or OAUTH. Defaults to: NONE. +* **schemaRegistryTruststoreLocation**: Location of the SSL certificate where the trust store for authentication to Schema Registry are stored. For example, `/your-bucket/truststore.jks`. +* **schemaRegistryTruststorePasswordSecretId**: SecretId in secret manager where the password to access secret in truststore is stored. For example, `projects/your-project-number/secrets/your-secret-name/versions/your-secret-version`. +* **schemaRegistryKeystoreLocation**: Keystore location that contains the SSL certificate and private key. For example, `/your-bucket/keystore.jks`. +* **schemaRegistryKeystorePasswordSecretId**: SecretId in secret manager where the password to access the keystore file For example, `projects/your-project-number/secrets/your-secret-name/versions/your-secret-version`. +* **schemaRegistryKeyPasswordSecretId**: SecretId of password required to access the client's private key stored within the keystore For example, `projects/your-project-number/secrets/your-secret-name/versions/your-secret-version`. +* **schemaRegistryOauthClientId**: Client ID used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. +* **schemaRegistryOauthClientSecretId**: The Google Cloud Secret Manager secret ID that contains the Client Secret to use to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. For example, `projects//secrets//versions/`. +* **schemaRegistryOauthScope**: The access token scope used to authenticate the Schema Registry client in OAUTH mode. This field is optional, as the request can be made without a scope parameter passed. For example, `openid`. +* **schemaRegistryOauthTokenEndpointUrl**: The HTTP(S)-based URL for the OAuth/OIDC identity provider used to authenticate the Schema Registry client in OAUTH mode. 
Required for AVRO_CONFLUENT_WIRE_FORMAT message format. +* **outputDeadletterTable**: Fully Qualified BigQuery table name for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. The table will be created by the template. For example, `your-project-id:your-dataset.your-table-name`. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. 
## User-Defined functions (UDFs) @@ -160,7 +160,7 @@ export TEMPLATE_SPEC_GCSPATH="gs://$BUCKET_NAME/templates/flex/Kafka_to_BigQuery ### Required export READ_BOOTSTRAP_SERVER_AND_TOPIC= export WRITE_MODE=SINGLE_TABLE_NAME -export KAFKA_READ_AUTHENTICATION_MODE=APPLICATION_DEFAULT_CREDENTIALS +export KAFKA_READ_AUTHENTICATION_MODE=SASL_PLAIN export MESSAGE_FORMAT=AVRO_CONFLUENT_WIRE_FORMAT export USE_BIG_QUERY_DLQ=false @@ -273,7 +273,7 @@ export REGION=us-central1 ### Required export READ_BOOTSTRAP_SERVER_AND_TOPIC= export WRITE_MODE=SINGLE_TABLE_NAME -export KAFKA_READ_AUTHENTICATION_MODE=APPLICATION_DEFAULT_CREDENTIALS +export KAFKA_READ_AUTHENTICATION_MODE=SASL_PLAIN export MESSAGE_FORMAT=AVRO_CONFLUENT_WIRE_FORMAT export USE_BIG_QUERY_DLQ=false @@ -372,7 +372,7 @@ resource "google_dataflow_flex_template_job" "kafka_to_bigquery_flex" { parameters = { readBootstrapServerAndTopic = "" writeMode = "SINGLE_TABLE_NAME" - kafkaReadAuthenticationMode = "APPLICATION_DEFAULT_CREDENTIALS" + kafkaReadAuthenticationMode = "SASL_PLAIN" messageFormat = "AVRO_CONFLUENT_WIRE_FORMAT" useBigQueryDLQ = "false" # outputTableSpec = "" @@ -389,29 +389,29 @@ resource "google_dataflow_flex_template_job" "kafka_to_bigquery_flex" { # enableCommitOffsets = "false" # consumerGroupId = "" # kafkaReadOffset = "latest" - # kafkaReadUsernameSecretId = "projects//secrets//versions/" - # kafkaReadPasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeystoreLocation = "gs://your-bucket/keystore.jks" + # kafkaReadUsernameSecretId = "" + # kafkaReadPasswordSecretId = "" + # kafkaReadKeystoreLocation = "" # kafkaReadTruststoreLocation = "" - # kafkaReadTruststorePasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeystorePasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeyPasswordSecretId = "projects//secrets//versions/" + # kafkaReadTruststorePasswordSecretId = "" + # kafkaReadKeystorePasswordSecretId = "" + # kafkaReadKeyPasswordSecretId = "" # schemaFormat = 
"SINGLE_SCHEMA_FILE" # confluentAvroSchemaPath = "" # schemaRegistryConnectionUrl = "" # binaryAvroSchemaPath = "" # schemaRegistryAuthenticationMode = "NONE" - # schemaRegistryTruststoreLocation = "/your-bucket/truststore.jks" - # schemaRegistryTruststorePasswordSecretId = "projects/your-project-number/secrets/your-secret-name/versions/your-secret-version" - # schemaRegistryKeystoreLocation = "/your-bucket/keystore.jks" - # schemaRegistryKeystorePasswordSecretId = "projects/your-project-number/secrets/your-secret-name/versions/your-secret-version" - # schemaRegistryKeyPasswordSecretId = "projects/your-project-number/secrets/your-secret-name/versions/your-secret-version" + # schemaRegistryTruststoreLocation = "" + # schemaRegistryTruststorePasswordSecretId = "" + # schemaRegistryKeystoreLocation = "" + # schemaRegistryKeystorePasswordSecretId = "" + # schemaRegistryKeyPasswordSecretId = "" # schemaRegistryOauthClientId = "" - # schemaRegistryOauthClientSecretId = "projects//secrets//versions/" - # schemaRegistryOauthScope = "openid" + # schemaRegistryOauthClientSecretId = "" + # schemaRegistryOauthScope = "" # schemaRegistryOauthTokenEndpointUrl = "" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # outputDeadletterTable = "" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" } diff --git a/v2/kafka-to-bigquery/src/main/java/com/google/cloud/teleport/v2/options/KafkaToBigQueryFlexOptions.java b/v2/kafka-to-bigquery/src/main/java/com/google/cloud/teleport/v2/options/KafkaToBigQueryFlexOptions.java index 4ad4a0bf4f..5d06d494db 100644 --- a/v2/kafka-to-bigquery/src/main/java/com/google/cloud/teleport/v2/options/KafkaToBigQueryFlexOptions.java +++ b/v2/kafka-to-bigquery/src/main/java/com/google/cloud/teleport/v2/options/KafkaToBigQueryFlexOptions.java @@ -52,7 +52,7 @@ public interface 
KafkaToBigQueryFlexOptions optional = true, description = "Persist the Kafka Message Key to the BigQuery table", helpText = - "If true, the pipeline will persist the Kafka message key in the BigQuery table, in a `_key` field of type `BYTES`. Default is false (Key is ignored).") + "If true, the pipeline will persist the Kafka message key in the BigQuery table, in a `_key` field of type `BYTES`. Default is `false` (Key is ignored).") @Default.Boolean(false) Boolean getPersistKafkaKey(); @@ -69,12 +69,12 @@ public interface KafkaToBigQueryFlexOptions optional = false, description = "Table Name Strategy", helpText = - "Write Mode: write records to one table or multiple tables (based on schema)." - + " The DYNAMIC_TABLE_NAMES mode is supported only for AVRO_CONFLUENT_WIRE_FORMAT Source Message Format" - + " and SCHEMA_REGISTRY Schema Source. The target table name will be auto-generated based on the Avro" + "Write records to one table or multiple tables (based on schema)." + + " The `DYNAMIC_TABLE_NAMES` mode is supported only for `AVRO_CONFLUENT_WIRE_FORMAT` Source Message Format" + + " and `SCHEMA_REGISTRY` Schema Source. The target table name is auto-generated based on the Avro" + " schema name of each message, it could either be a single schema (creating a single table) or" - + " multiple schemas (creating multiple tables). The SINGLE_TABLE_NAME mode writes to a single" - + " table (single schema) specified by the user. Defaults to SINGLE_TABLE_NAME.") + + " multiple schemas (creating multiple tables). The `SINGLE_TABLE_NAME` mode writes to a single" + + " table (single schema) specified by the user. Defaults to `SINGLE_TABLE_NAME`.") @Default.String("SINGLE_TABLE_NAME") String getWriteMode(); @@ -119,7 +119,7 @@ public interface KafkaToBigQueryFlexOptions "BigQuery output dataset to write the output to. Tables will be created dynamically in the dataset." + " If the tables are created beforehand, the table names should follow the specified naming convention." 
+ " The name should be `bqTableNamePrefix + Avro Schema FullName` ," - + " each word will be separated by a hyphen '-'.") + + " each word will be separated by a hyphen `-`.") @Default.String("") String getOutputDataset(); @@ -149,7 +149,7 @@ public interface KafkaToBigQueryFlexOptions optional = true, description = "Write Disposition to use for BigQuery", helpText = - "BigQuery WriteDisposition. For example, WRITE_APPEND, WRITE_EMPTY or WRITE_TRUNCATE.", + "BigQuery WriteDisposition. For example: `WRITE_APPEND`, `WRITE_EMPTY` or `WRITE_TRUNCATE`.", hiddenUi = true) @Default.String("WRITE_APPEND") String getWriteDisposition(); @@ -165,7 +165,7 @@ public interface KafkaToBigQueryFlexOptions }, optional = true, description = "Create Disposition to use for BigQuery", - helpText = "BigQuery CreateDisposition. For example, CREATE_IF_NEEDED, CREATE_NEVER.", + helpText = "BigQuery CreateDisposition. For example: `CREATE_IF_NEEDED`, `CREATE_NEVER`.", hiddenUi = true) @Default.String("CREATE_IF_NEEDED") String getCreateDisposition(); @@ -204,7 +204,8 @@ public interface KafkaToBigQueryFlexOptions groupName = "Destination", optional = true, description = "Number of streams for BigQuery Storage Write API", - helpText = "Specifies the number of write streams, this parameter must be set. Default is 0.") + helpText = + "Specifies the number of write streams, this parameter must be set. Default is `0`.") @Override @Default.Integer(0) Integer getNumStorageWriteApiStreams(); diff --git a/v2/kafka-to-gcs/README_Kafka_to_GCS.md b/v2/kafka-to-gcs/README_Kafka_to_GCS.md index c6dd004458..fdec9d1f6f 100644 --- a/v2/kafka-to-gcs/README_Kafka_to_GCS.md +++ b/v2/kafka-to-gcs/README_Kafka_to_GCS.md @@ -14,16 +14,16 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **bootstrapServers** : Kafka Bootstrap Server list, separated by commas. (Example: localhost:9092,127.0.0.1:9093). 
-* **inputTopics** : Kafka topic(s) to read the input from. (Example: topic1,topic2). -* **outputFileFormat** : The file format of the desired output files. Can be TEXT, AVRO or PARQUET. Defaults to TEXT. -* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. (Example: gs://your-bucket/your-path). -* **numShards** : The maximum number of output shards produced when writing. Default number is runner-dependent. +* **bootstrapServers**: Kafka Bootstrap Server list, separated by commas. For example, `localhost:9092,127.0.0.1:9093`. +* **inputTopics**: Kafka topic(s) to read the input from. For example, `topic1,topic2`. +* **outputFileFormat**: The file format of the desired output files. Can be TEXT, AVRO or PARQUET. Defaults to TEXT. +* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. For example, `gs://your-bucket/your-path`. +* **numShards**: The maximum number of output shards produced when writing. Default number is runner-dependent. ### Optional parameters -* **windowDuration** : The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 5m). Defaults to: 5m. -* **outputFilenamePrefix** : The prefix to place on each windowed file. (Example: output-). Defaults to: output. +* **windowDuration**: The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `5m`. Defaults to: 5m. +* **outputFilenamePrefix**: The prefix to place on each windowed file. For example, `output-`. Defaults to: output. 
@@ -203,13 +203,13 @@ resource "google_dataflow_flex_template_job" "kafka_to_gcs" { name = "kafka-to-gcs" region = var.region parameters = { - bootstrapServers = "localhost:9092,127.0.0.1:9093" - inputTopics = "topic1,topic2" + bootstrapServers = "" + inputTopics = "" outputFileFormat = "TEXT" - outputDirectory = "gs://your-bucket/your-path" + outputDirectory = "" numShards = "0" # windowDuration = "5m" - # outputFilenamePrefix = "output-" + # outputFilenamePrefix = "output" } } ``` diff --git a/v2/kafka-to-gcs/README_Kafka_to_Gcs_Flex.md b/v2/kafka-to-gcs/README_Kafka_to_Gcs_Flex.md index 7e612c0cc6..b000d2df1e 100644 --- a/v2/kafka-to-gcs/README_Kafka_to_Gcs_Flex.md +++ b/v2/kafka-to-gcs/README_Kafka_to_Gcs_Flex.md @@ -14,42 +14,42 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **readBootstrapServerAndTopic** : Kafka Topic to read the input from. -* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. (Example: gs://your-bucket/your-path/). -* **kafkaReadAuthenticationMode** : The mode of authentication to use with the Kafka cluster. Use NONE for no authentication, SASL_PLAIN for SASL/PLAIN username and password, TLSfor certificate-based authentication. APPLICATION_DEFAULT_CREDENTIALS should be used only for Google Cloud Apache Kafka for BigQuery cluster since This allow you to authenticate with Google Cloud Apache Kafka for BigQuery using application default credentials. -* **messageFormat** : The format of the Kafka messages to read. The supported values are AVRO_CONFLUENT_WIRE_FORMAT (Confluent Schema Registry encoded Avro), AVRO_BINARY_ENCODING (Plain binary Avro), and JSON. Defaults to: AVRO_CONFLUENT_WIRE_FORMAT. -* **useBigQueryDLQ** : If true, failed messages will be written to BigQuery with extra error information. Defaults to: false. +* **readBootstrapServerAndTopic**: Kafka Topic to read the input from. 
+* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. For example, `gs://your-bucket/your-path/`. +* **kafkaReadAuthenticationMode**: The mode of authentication to use with the Kafka cluster. Use `KafkaAuthenticationMethod.NONE` for no authentication, `KafkaAuthenticationMethod.SASL_PLAIN` for SASL/PLAIN username and password, and `KafkaAuthenticationMethod.TLS` for certificate-based authentication. `KafkaAuthenticationMethod.APPLICATION_DEFAULT_CREDENTIALS` should be used only for Google Cloud Apache Kafka for BigQuery cluster, it allows to authenticate using application default credentials. +* **messageFormat**: The format of the Kafka messages to read. The supported values are `AVRO_CONFLUENT_WIRE_FORMAT` (Confluent Schema Registry encoded Avro), `AVRO_BINARY_ENCODING` (Plain binary Avro), and `JSON`. Defaults to: AVRO_CONFLUENT_WIRE_FORMAT. +* **useBigQueryDLQ**: If true, failed messages will be written to BigQuery with extra error information. Defaults to: false. ### Optional parameters -* **windowDuration** : The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 5m). Defaults to: 5m. -* **outputFilenamePrefix** : The prefix to place on each windowed file. (Example: output-). Defaults to: output. -* **numShards** : The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Default value is decided by Dataflow. -* **enableCommitOffsets** : Commit offsets of processed messages to Kafka. If enabled, this will minimize the gaps or duplicate processing of messages when restarting the pipeline. Requires specifying the Consumer Group ID. Defaults to: false. 
-* **consumerGroupId** : The unique identifier for the consumer group that this pipeline belongs to. Required if Commit Offsets to Kafka is enabled. Defaults to empty. -* **kafkaReadOffset** : The starting point for reading messages when no committed offsets exist. The earliest starts from the beginning, the latest from the newest message. Defaults to: latest. -* **kafkaReadUsernameSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka username to use with SASL_PLAIN authentication. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaReadPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka password to use with SASL_PLAIN authentication. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaReadKeystoreLocation** : The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key to use when authenticating with the Kafka cluster. (Example: gs://your-bucket/keystore.jks). -* **kafkaReadTruststoreLocation** : The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the Kafka broker. -* **kafkaReadTruststorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for Kafka TLS authentication (Example: projects//secrets//versions/). -* **kafkaReadKeystorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java KeyStore (JKS) file for Kafka TLS authentication. (Example: projects//secrets//versions/). -* **kafkaReadKeyPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for Kafka TLS authentication. (Example: projects//secrets//versions/). -* **schemaFormat** : The Kafka schema format. 
Can be provided as SINGLE_SCHEMA_FILE or SCHEMA_REGISTRY. If SINGLE_SCHEMA_FILE is specified, all messages should have the schema mentioned in the avro schema file. If SCHEMA_REGISTRY is specified, the messages can have either a single schema or multiple schemas. Defaults to: SINGLE_SCHEMA_FILE. -* **confluentAvroSchemaPath** : The Google Cloud Storage path to the single Avro schema file used to decode all of the messages in a topic. Defaults to empty. -* **schemaRegistryConnectionUrl** : The URL for the Confluent Schema Registry instance used to manage Avro schemas for message decoding. Defaults to empty. -* **binaryAvroSchemaPath** : The Google Cloud Storage path to the Avro schema file used to decode binary-encoded Avro messages. Defaults to empty. -* **schemaRegistryAuthenticationMode** : Schema Registry authentication mode. Can be NONE, TLS or OAUTH. Defaults to: NONE. -* **schemaRegistryTruststoreLocation** : Location of the SSL certificate where the trust store for authentication to Schema Registry are stored. (Example: /your-bucket/truststore.jks). -* **schemaRegistryTruststorePasswordSecretId** : SecretId in secret manager where the password to access secret in truststore is stored. (Example: projects/your-project-number/secrets/your-secret-name/versions/your-secret-version). -* **schemaRegistryKeystoreLocation** : Keystore location that contains the SSL certificate and private key. (Example: /your-bucket/keystore.jks). -* **schemaRegistryKeystorePasswordSecretId** : SecretId in secret manager where the password to access the keystore file (Example: projects/your-project-number/secrets/your-secret-name/versions/your-secret-version). -* **schemaRegistryKeyPasswordSecretId** : SecretId of password required to access the client's private key stored within the keystore (Example: projects/your-project-number/secrets/your-secret-name/versions/your-secret-version). 
-* **schemaRegistryOauthClientId** : Client ID used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. -* **schemaRegistryOauthClientSecretId** : The Google Cloud Secret Manager secret ID that contains the Client Secret to use to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. (Example: projects//secrets//versions/). -* **schemaRegistryOauthScope** : The access token scope used to authenticate the Schema Registry client in OAUTH mode. This field is optional, as the request can be made without a scope parameter passed. (Example: openid). -* **schemaRegistryOauthTokenEndpointUrl** : The HTTP(S)-based URL for the OAuth/OIDC identity provider used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. -* **outputDeadletterTable** : Fully Qualified BigQuery table name for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table.The table will be created by the template. (Example: your-project-id:your-dataset.your-table-name). +* **windowDuration**: The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `5m`. Defaults to: 5m. +* **outputFilenamePrefix**: The prefix to place on each windowed file. For example, `output-`. Defaults to: output. +* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Default value is decided by Dataflow. +* **enableCommitOffsets**: Commit offsets of processed messages to Kafka. 
If enabled, this will minimize the gaps or duplicate processing of messages when restarting the pipeline. Requires specifying the Consumer Group ID. Defaults to: false. +* **consumerGroupId**: The unique identifier for the consumer group that this pipeline belongs to. Required if Commit Offsets to Kafka is enabled. Defaults to empty. +* **kafkaReadOffset**: The starting point for reading messages when no committed offsets exist. The earliest starts from the beginning, the latest from the newest message. Defaults to: latest. +* **kafkaReadUsernameSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka username to use with `SASL_PLAIN` authentication. For example, `projects//secrets//versions/`. Defaults to empty. +* **kafkaReadPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka password to use with `SASL_PLAIN` authentication. For example, `projects//secrets//versions/`. Defaults to empty. +* **kafkaReadKeystoreLocation**: The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key to use when authenticating with the Kafka cluster. For example, `gs://your-bucket/keystore.jks`. +* **kafkaReadTruststoreLocation**: The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the Kafka broker. +* **kafkaReadTruststorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. +* **kafkaReadKeystorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java KeyStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. 
+* **kafkaReadKeyPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. +* **schemaFormat**: The Kafka schema format. Can be provided as `SINGLE_SCHEMA_FILE` or `SCHEMA_REGISTRY`. If `SINGLE_SCHEMA_FILE` is specified, use the schema mentioned in the avro schema file for all messages. If `SCHEMA_REGISTRY` is specified, the messages can have either a single schema or multiple schemas. Defaults to: SINGLE_SCHEMA_FILE. +* **confluentAvroSchemaPath**: The Google Cloud Storage path to the single Avro schema file used to decode all of the messages in a topic. Defaults to empty. +* **schemaRegistryConnectionUrl**: The URL for the Confluent Schema Registry instance used to manage Avro schemas for message decoding. Defaults to empty. +* **binaryAvroSchemaPath**: The Google Cloud Storage path to the Avro schema file used to decode binary-encoded Avro messages. Defaults to empty. +* **schemaRegistryAuthenticationMode**: Schema Registry authentication mode. Can be NONE, TLS or OAUTH. Defaults to: NONE. +* **schemaRegistryTruststoreLocation**: Location of the SSL certificate where the trust store for authentication to Schema Registry is stored. For example, `/your-bucket/truststore.jks`. +* **schemaRegistryTruststorePasswordSecretId**: SecretId in secret manager where the password to access secret in truststore is stored. For example, `projects/your-project-number/secrets/your-secret-name/versions/your-secret-version`. +* **schemaRegistryKeystoreLocation**: Keystore location that contains the SSL certificate and private key. For example, `/your-bucket/keystore.jks`. +* **schemaRegistryKeystorePasswordSecretId**: SecretId in secret manager where the password to access the keystore file is stored. For example, `projects/your-project-number/secrets/your-secret-name/versions/your-secret-version`. 
+* **schemaRegistryKeyPasswordSecretId**: SecretId of password required to access the client's private key stored within the keystore. For example, `projects/your-project-number/secrets/your-secret-name/versions/your-secret-version`. +* **schemaRegistryOauthClientId**: Client ID used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. +* **schemaRegistryOauthClientSecretId**: The Google Cloud Secret Manager secret ID that contains the Client Secret to use to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. For example, `projects//secrets//versions/`. +* **schemaRegistryOauthScope**: The access token scope used to authenticate the Schema Registry client in OAUTH mode. This field is optional, as the request can be made without a scope parameter passed. For example, `openid`. +* **schemaRegistryOauthTokenEndpointUrl**: The HTTP(S)-based URL for the OAuth/OIDC identity provider used to authenticate the Schema Registry client in OAUTH mode. Required for AVRO_CONFLUENT_WIRE_FORMAT message format. +* **outputDeadletterTable**: Fully Qualified BigQuery table name for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. The table will be created by the template. For example, `your-project-id:your-dataset.your-table-name`. 
@@ -130,7 +130,7 @@ export TEMPLATE_SPEC_GCSPATH="gs://$BUCKET_NAME/templates/flex/Kafka_to_Gcs_Flex ### Required export READ_BOOTSTRAP_SERVER_AND_TOPIC= export OUTPUT_DIRECTORY= -export KAFKA_READ_AUTHENTICATION_MODE=APPLICATION_DEFAULT_CREDENTIALS +export KAFKA_READ_AUTHENTICATION_MODE=SASL_PLAIN export MESSAGE_FORMAT=AVRO_CONFLUENT_WIRE_FORMAT export USE_BIG_QUERY_DLQ=false @@ -221,7 +221,7 @@ export REGION=us-central1 ### Required export READ_BOOTSTRAP_SERVER_AND_TOPIC= export OUTPUT_DIRECTORY= -export KAFKA_READ_AUTHENTICATION_MODE=APPLICATION_DEFAULT_CREDENTIALS +export KAFKA_READ_AUTHENTICATION_MODE=SASL_PLAIN export MESSAGE_FORMAT=AVRO_CONFLUENT_WIRE_FORMAT export USE_BIG_QUERY_DLQ=false @@ -308,38 +308,38 @@ resource "google_dataflow_flex_template_job" "kafka_to_gcs_flex" { region = var.region parameters = { readBootstrapServerAndTopic = "" - outputDirectory = "gs://your-bucket/your-path/" - kafkaReadAuthenticationMode = "APPLICATION_DEFAULT_CREDENTIALS" + outputDirectory = "" + kafkaReadAuthenticationMode = "SASL_PLAIN" messageFormat = "AVRO_CONFLUENT_WIRE_FORMAT" useBigQueryDLQ = "false" # windowDuration = "5m" - # outputFilenamePrefix = "output-" + # outputFilenamePrefix = "output" # numShards = "0" # enableCommitOffsets = "false" # consumerGroupId = "" # kafkaReadOffset = "latest" - # kafkaReadUsernameSecretId = "projects//secrets//versions/" - # kafkaReadPasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeystoreLocation = "gs://your-bucket/keystore.jks" + # kafkaReadUsernameSecretId = "" + # kafkaReadPasswordSecretId = "" + # kafkaReadKeystoreLocation = "" # kafkaReadTruststoreLocation = "" - # kafkaReadTruststorePasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeystorePasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeyPasswordSecretId = "projects//secrets//versions/" + # kafkaReadTruststorePasswordSecretId = "" + # kafkaReadKeystorePasswordSecretId = "" + # kafkaReadKeyPasswordSecretId = "" # schemaFormat = 
"SINGLE_SCHEMA_FILE" # confluentAvroSchemaPath = "" # schemaRegistryConnectionUrl = "" # binaryAvroSchemaPath = "" # schemaRegistryAuthenticationMode = "NONE" - # schemaRegistryTruststoreLocation = "/your-bucket/truststore.jks" - # schemaRegistryTruststorePasswordSecretId = "projects/your-project-number/secrets/your-secret-name/versions/your-secret-version" - # schemaRegistryKeystoreLocation = "/your-bucket/keystore.jks" - # schemaRegistryKeystorePasswordSecretId = "projects/your-project-number/secrets/your-secret-name/versions/your-secret-version" - # schemaRegistryKeyPasswordSecretId = "projects/your-project-number/secrets/your-secret-name/versions/your-secret-version" + # schemaRegistryTruststoreLocation = "" + # schemaRegistryTruststorePasswordSecretId = "" + # schemaRegistryKeystoreLocation = "" + # schemaRegistryKeystorePasswordSecretId = "" + # schemaRegistryKeyPasswordSecretId = "" # schemaRegistryOauthClientId = "" - # schemaRegistryOauthClientSecretId = "projects//secrets//versions/" - # schemaRegistryOauthScope = "openid" + # schemaRegistryOauthClientSecretId = "" + # schemaRegistryOauthScope = "" # schemaRegistryOauthTokenEndpointUrl = "" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" + # outputDeadletterTable = "" } } ``` diff --git a/v2/kafka-to-kafka/README_Kafka_to_Kafka.md b/v2/kafka-to-kafka/README_Kafka_to_Kafka.md index 7439b745e8..57621c6bdc 100644 --- a/v2/kafka-to-kafka/README_Kafka_to_Kafka.md +++ b/v2/kafka-to-kafka/README_Kafka_to_Kafka.md @@ -13,30 +13,30 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **readBootstrapServerAndTopic** : Kafka Bootstrap server and topic to read the input from. (Example: localhost:9092;topic1,topic2). -* **kafkaReadAuthenticationMode** : The mode of authentication to use with the Kafka cluster. Use NONE for no authentication, SASL_PLAIN for SASL/PLAIN username and password, TLSfor certificate-based authentication. 
APPLICATION_DEFAULT_CREDENTIALS should be used only for Google Cloud Apache Kafka for BigQuery cluster since This allow you to authenticate with Google Cloud Apache Kafka for BigQuery using application default credentials. -* **writeBootstrapServerAndTopic** : Kafka topic to write the output to. -* **kafkaWriteAuthenticationMethod** : The mode of authentication to use with the Kafka cluster. Use NONE for no authentication, SASL_PLAIN for SASL/PLAIN username and password, and TLS for certificate-based authentication. Defaults to: APPLICATION_DEFAULT_CREDENTIALS. +* **readBootstrapServerAndTopic**: Kafka Bootstrap server and topic to read the input from. For example, `localhost:9092;topic1,topic2`. +* **kafkaReadAuthenticationMode**: The mode of authentication to use with the Kafka cluster. Use `KafkaAuthenticationMethod.NONE` for no authentication, `KafkaAuthenticationMethod.SASL_PLAIN` for SASL/PLAIN username and password, and `KafkaAuthenticationMethod.TLS` for certificate-based authentication. `KafkaAuthenticationMethod.APPLICATION_DEFAULT_CREDENTIALS` should be used only for Google Cloud Apache Kafka for BigQuery cluster, it allows to authenticate using application default credentials. +* **writeBootstrapServerAndTopic**: Kafka topic to write the output to. +* **kafkaWriteAuthenticationMethod**: The mode of authentication to use with the Kafka cluster. Use NONE for no authentication, SASL_PLAIN for SASL/PLAIN username and password, and TLS for certificate-based authentication. Defaults to: APPLICATION_DEFAULT_CREDENTIALS. ### Optional parameters -* **enableCommitOffsets** : Commit offsets of processed messages to Kafka. If enabled, this will minimize the gaps or duplicate processing of messages when restarting the pipeline. Requires specifying the Consumer Group ID. Defaults to: false. -* **consumerGroupId** : The unique identifier for the consumer group that this pipeline belongs to. Required if Commit Offsets to Kafka is enabled. Defaults to empty. 
-* **kafkaReadOffset** : The starting point for reading messages when no committed offsets exist. The earliest starts from the beginning, the latest from the newest message. Defaults to: latest. -* **kafkaReadUsernameSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka username to use with SASL_PLAIN authentication. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaReadPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka password to use with SASL_PLAIN authentication. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaReadKeystoreLocation** : The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key to use when authenticating with the Kafka cluster. (Example: gs://your-bucket/keystore.jks). -* **kafkaReadTruststoreLocation** : The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the Kafka broker. -* **kafkaReadTruststorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for Kafka TLS authentication (Example: projects//secrets//versions/). -* **kafkaReadKeystorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java KeyStore (JKS) file for Kafka TLS authentication. (Example: projects//secrets//versions/). -* **kafkaReadKeyPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for Kafka TLS authentication. (Example: projects//secrets//versions/). -* **kafkaWriteUsernameSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka username for SASL_PLAIN authentication with the destination Kafka cluster. (Example: projects//secrets//versions/). Defaults to empty. 
-* **kafkaWritePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the Kafka password to use for SASL_PLAIN authentication with the destination Kafka cluster. (Example: projects//secrets//versions/). Defaults to empty. -* **kafkaWriteKeystoreLocation** : The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key for authenticating with the destination Kafka cluster. (Example: gs:///.jks). -* **kafkaWriteTruststoreLocation** : The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the destination Kafka broker. -* **kafkaWriteTruststorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for TLS authentication with the destination Kafka cluster. (Example: projects//secrets//versions/). -* **kafkaWriteKeystorePasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to access the Java KeyStore (JKS) file to use for TLS authentication with the destination Kafka cluster. (Example: projects//secrets//versions/). -* **kafkaWriteKeyPasswordSecretId** : The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for TLS authentication with the destination Kafka cluster. (Example: projects//secrets//versions/). +* **enableCommitOffsets**: Commit offsets of processed messages to Kafka. If enabled, this will minimize the gaps or duplicate processing of messages when restarting the pipeline. Requires specifying the Consumer Group ID. Defaults to: false. +* **consumerGroupId**: The unique identifier for the consumer group that this pipeline belongs to. Required if Commit Offsets to Kafka is enabled. Defaults to empty. +* **kafkaReadOffset**: The starting point for reading messages when no committed offsets exist. 
The earliest starts from the beginning, the latest from the newest message. Defaults to: latest. +* **kafkaReadUsernameSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka username to use with `SASL_PLAIN` authentication. For example, `projects//secrets//versions/`. Defaults to empty. +* **kafkaReadPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka password to use with `SASL_PLAIN` authentication. For example, `projects//secrets//versions/`. Defaults to empty. +* **kafkaReadKeystoreLocation**: The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key to use when authenticating with the Kafka cluster. For example, `gs://your-bucket/keystore.jks`. +* **kafkaReadTruststoreLocation**: The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the Kafka broker. +* **kafkaReadTruststorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for Kafka TLS authentication For example, `projects//secrets//versions/`. +* **kafkaReadKeystorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java KeyStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. +* **kafkaReadKeyPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for Kafka TLS authentication. For example, `projects//secrets//versions/`. +* **kafkaWriteUsernameSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka username for SASL_PLAIN authentication with the destination Kafka cluster. For example, `projects//secrets//versions/`. Defaults to empty. 
+* **kafkaWritePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the Kafka password to use for SASL_PLAIN authentication with the destination Kafka cluster. For example, `projects//secrets//versions/`. Defaults to empty. +* **kafkaWriteKeystoreLocation**: The Google Cloud Storage path to the Java KeyStore (JKS) file that contains the TLS certificate and private key for authenticating with the destination Kafka cluster. For example, `gs:///.jks`. +* **kafkaWriteTruststoreLocation**: The Google Cloud Storage path to the Java TrustStore (JKS) file that contains the trusted certificates to use to verify the identity of the destination Kafka broker. +* **kafkaWriteTruststorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the Java TrustStore (JKS) file for TLS authentication with the destination Kafka cluster. For example, `projects//secrets//versions/`. +* **kafkaWriteKeystorePasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to access the Java KeyStore (JKS) file to use for TLS authentication with the destination Kafka cluster. For example, `projects//secrets//versions/`. +* **kafkaWriteKeyPasswordSecretId**: The Google Cloud Secret Manager secret ID that contains the password to use to access the private key within the Java KeyStore (JKS) file for TLS authentication with the destination Kafka cluster. For example, `projects//secrets//versions/`. 
@@ -116,7 +116,7 @@ export TEMPLATE_SPEC_GCSPATH="gs://$BUCKET_NAME/templates/flex/Kafka_to_Kafka" ### Required export READ_BOOTSTRAP_SERVER_AND_TOPIC= -export KAFKA_READ_AUTHENTICATION_MODE=APPLICATION_DEFAULT_CREDENTIALS +export KAFKA_READ_AUTHENTICATION_MODE=SASL_PLAIN export WRITE_BOOTSTRAP_SERVER_AND_TOPIC= export KAFKA_WRITE_AUTHENTICATION_METHOD=APPLICATION_DEFAULT_CREDENTIALS @@ -183,7 +183,7 @@ export REGION=us-central1 ### Required export READ_BOOTSTRAP_SERVER_AND_TOPIC= -export KAFKA_READ_AUTHENTICATION_MODE=APPLICATION_DEFAULT_CREDENTIALS +export KAFKA_READ_AUTHENTICATION_MODE=SASL_PLAIN export WRITE_BOOTSTRAP_SERVER_AND_TOPIC= export KAFKA_WRITE_AUTHENTICATION_METHOD=APPLICATION_DEFAULT_CREDENTIALS @@ -258,27 +258,27 @@ resource "google_dataflow_flex_template_job" "kafka_to_kafka" { name = "kafka-to-kafka" region = var.region parameters = { - readBootstrapServerAndTopic = "localhost:9092;topic1,topic2" - kafkaReadAuthenticationMode = "APPLICATION_DEFAULT_CREDENTIALS" + readBootstrapServerAndTopic = "" + kafkaReadAuthenticationMode = "SASL_PLAIN" writeBootstrapServerAndTopic = "" kafkaWriteAuthenticationMethod = "APPLICATION_DEFAULT_CREDENTIALS" # enableCommitOffsets = "false" # consumerGroupId = "" # kafkaReadOffset = "latest" - # kafkaReadUsernameSecretId = "projects//secrets//versions/" - # kafkaReadPasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeystoreLocation = "gs://your-bucket/keystore.jks" + # kafkaReadUsernameSecretId = "" + # kafkaReadPasswordSecretId = "" + # kafkaReadKeystoreLocation = "" # kafkaReadTruststoreLocation = "" - # kafkaReadTruststorePasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeystorePasswordSecretId = "projects//secrets//versions/" - # kafkaReadKeyPasswordSecretId = "projects//secrets//versions/" - # kafkaWriteUsernameSecretId = "projects//secrets//versions/" - # kafkaWritePasswordSecretId = "projects//secrets//versions/" - # kafkaWriteKeystoreLocation = "gs:///.jks" + # 
kafkaReadTruststorePasswordSecretId = "" + # kafkaReadKeystorePasswordSecretId = "" + # kafkaReadKeyPasswordSecretId = "" + # kafkaWriteUsernameSecretId = "" + # kafkaWritePasswordSecretId = "" + # kafkaWriteKeystoreLocation = "" # kafkaWriteTruststoreLocation = "" - # kafkaWriteTruststorePasswordSecretId = "projects//secrets//versions/" - # kafkaWriteKeystorePasswordSecretId = "projects//secrets//versions/" - # kafkaWriteKeyPasswordSecretId = "projects//secrets//versions/" + # kafkaWriteTruststorePasswordSecretId = "" + # kafkaWriteKeystorePasswordSecretId = "" + # kafkaWriteKeyPasswordSecretId = "" } } ``` diff --git a/v2/kafka-to-pubsub/src/main/java/com/google/cloud/teleport/v2/options/KafkaToPubsubOptions.java b/v2/kafka-to-pubsub/src/main/java/com/google/cloud/teleport/v2/options/KafkaToPubsubOptions.java index 71602eca71..d2ce1897af 100644 --- a/v2/kafka-to-pubsub/src/main/java/com/google/cloud/teleport/v2/options/KafkaToPubsubOptions.java +++ b/v2/kafka-to-pubsub/src/main/java/com/google/cloud/teleport/v2/options/KafkaToPubsubOptions.java @@ -57,9 +57,8 @@ public interface KafkaToPubsubOptions order = 3, groupName = "Target", description = "Output Pub/Sub topic", - helpText = - "The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name'", - example = "projects/your-project-id/topics/your-topic-name") + helpText = "The name of the topic to publish data to.", + example = "projects//topics/") @Validation.Required String getOutputTopic(); diff --git a/v2/kinesis-to-pubsub/README_Kinesis_To_Pubsub.md b/v2/kinesis-to-pubsub/README_Kinesis_To_Pubsub.md index dc8cf7a4c6..6857f688ea 100644 --- a/v2/kinesis-to-pubsub/README_Kinesis_To_Pubsub.md +++ b/v2/kinesis-to-pubsub/README_Kinesis_To_Pubsub.md @@ -13,15 +13,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **secretId1** : First Secret ID containing aws key id. 
-* **secretId2** : Second Secret ID containing aws key id. -* **awsRegion** : AWS Region. -* **kinesisDataStream** : Name of the Kinesis Data stream to read from. Enter the full name of the Kinesis Data stream. -* **outputPubsubTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). +* **secretId1**: First Secret ID containing aws key id. +* **secretId2**: Second Secret ID containing aws key id. +* **awsRegion**: AWS Region. +* **kinesisDataStream**: Name of the Kinesis Data stream to read from. Enter the full name of the Kinesis Data stream. +* **outputPubsubTopic**: The name of the topic to which data should be published, in the format of 'projects/your-project-id/topics/your-topic-name'. For example, `projects/your-project-id/topics/your-topic-name`. ### Optional parameters -* **awsDataFormat** : Data format of input. +* **awsDataFormat**: Data format of input. @@ -202,7 +202,7 @@ resource "google_dataflow_flex_template_job" "kinesis_to_pubsub" { secretId2 = "" awsRegion = "" kinesisDataStream = "" - outputPubsubTopic = "projects/your-project-id/topics/your-topic-name" + outputPubsubTopic = "" # awsDataFormat = "" } } diff --git a/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery.md b/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery.md index 25c58a869d..038f2a77f8 100644 --- a/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery.md +++ b/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery.md @@ -18,21 +18,21 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **mongoDbUri** : The MongoDB connection URI in the format `mongodb+srv://:@.`. -* **database** : Database in MongoDB to read the collection from. (Example: my-db). -* **collection** : Name of the collection inside MongoDB database. (Example: my-collection). -* **userOption** : `FLATTEN`, `JSON`, or `NONE`. 
`FLATTEN` flattens the documents to the single level. `JSON` stores document in BigQuery JSON format. `NONE` stores the whole document as a JSON-formatted STRING. Defaults to: NONE. -* **outputTableSpec** : The BigQuery table to write to. For example, `bigquery-project:dataset.output_table`. +* **mongoDbUri**: The MongoDB connection URI in the format `mongodb+srv://:@.`. +* **database**: Database in MongoDB to read the collection from. For example, `my-db`. +* **collection**: Name of the collection inside MongoDB database. For example, `my-collection`. +* **userOption**: `FLATTEN`, `JSON`, or `NONE`. `FLATTEN` flattens the documents to the single level. `JSON` stores document in BigQuery JSON format. `NONE` stores the whole document as a JSON-formatted STRING. Defaults to: NONE. +* **outputTableSpec**: The BigQuery table to write to. For example, `bigquery-project:dataset.output_table`. ### Optional parameters -* **KMSEncryptionKey** : Cloud KMS Encryption Key to decrypt the mongodb uri connection string. If Cloud KMS key is passed in, the mongodb uri connection string must all be passed in encrypted. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **filter** : Bson filter in json format. (Example: { "val": { $gt: 0, $lt: 9 }}). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. 
The default value is `false`. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. (Example: gs://your-bucket/your-schema.json). -* **javascriptDocumentTransformGcsPath** : The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://your-bucket/your-transforms/*.js). -* **javascriptDocumentTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is myTransform. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). (Example: transform). +* **KMSEncryptionKey**: Cloud KMS Encryption Key to decrypt the mongodb uri connection string. If Cloud KMS key is passed in, the mongodb uri connection string must all be passed in encrypted. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **filter**: Bson filter in json format. For example, `{ "val": { $gt: 0, $lt: 9 }}`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. For example, `gs://your-bucket/your-schema.json`. 
+* **javascriptDocumentTransformGcsPath**: The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://your-bucket/your-transforms/*.js`. +* **javascriptDocumentTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is myTransform. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). For example, `transform`. @@ -228,17 +228,17 @@ resource "google_dataflow_flex_template_job" "mongodb_to_bigquery" { region = var.region parameters = { mongoDbUri = "" - database = "my-db" - collection = "my-collection" + database = "" + collection = "" userOption = "NONE" outputTableSpec = "" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # filter = "{ "val": { $gt: 0, $lt: 9 }}" + # KMSEncryptionKey = "" + # filter = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # javascriptDocumentTransformGcsPath = "gs://your-bucket/your-transforms/*.js" - # javascriptDocumentTransformFunctionName = "transform" + # bigQuerySchemaPath = "" + # javascriptDocumentTransformGcsPath = "" + # javascriptDocumentTransformFunctionName = "" } } ``` diff --git a/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery_CDC.md b/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery_CDC.md index 5dbf51e129..172a7a8769 100644 --- a/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery_CDC.md +++ b/v2/mongodb-to-googlecloud/README_MongoDB_to_BigQuery_CDC.md @@ -19,24 +19,24 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **mongoDbUri** : The MongoDB connection URI in the format `mongodb+srv://:@.`. 
-* **database** : Database in MongoDB to read the collection from. (Example: my-db). -* **collection** : Name of the collection inside MongoDB database. (Example: my-collection). -* **userOption** : `FLATTEN`, `JSON`, or `NONE`. `FLATTEN` flattens the documents to the single level. `JSON` stores document in BigQuery JSON format. `NONE` stores the whole document as a JSON-formatted STRING. Defaults to: NONE. -* **inputTopic** : The Pub/Sub input topic to read from, in the format of projects//topics/. -* **outputTableSpec** : The BigQuery table to write to. For example, `bigquery-project:dataset.output_table`. +* **mongoDbUri**: The MongoDB connection URI in the format `mongodb+srv://:@.`. +* **database**: Database in MongoDB to read the collection from. For example, `my-db`. +* **collection**: Name of the collection inside MongoDB database. For example, `my-collection`. +* **userOption**: `FLATTEN`, `JSON`, or `NONE`. `FLATTEN` flattens the documents to the single level. `JSON` stores document in BigQuery JSON format. `NONE` stores the whole document as a JSON-formatted STRING. Defaults to: NONE. +* **inputTopic**: The Pub/Sub input topic to read from, in the format of `projects//topics/`. +* **outputTableSpec**: The BigQuery table to write to. For example, `bigquery-project:dataset.output_table`. ### Optional parameters -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly- once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **KMSEncryptionKey** : Cloud KMS Encryption Key to decrypt the mongodb uri connection string. If Cloud KMS key is passed in, the mongodb uri connection string must all be passed in encrypted. 
(Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **filter** : Bson filter in json format. (Example: { "val": { $gt: 0, $lt: 9 }}). -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. (Example: gs://your-bucket/your-schema.json). -* **javascriptDocumentTransformGcsPath** : The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://your-bucket/your-transforms/*.js). -* **javascriptDocumentTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is myTransform. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). (Example: transform). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. 
To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **KMSEncryptionKey**: Cloud KMS Encryption Key to decrypt the mongodb uri connection string. If Cloud KMS key is passed in, the mongodb uri connection string must all be passed in encrypted. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **filter**: Bson filter in json format. For example, `{ "val": { $gt: 0, $lt: 9 }}`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. For example, `gs://your-bucket/your-schema.json`. +* **javascriptDocumentTransformGcsPath**: The Cloud Storage URI of the `.js` file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://your-bucket/your-transforms/*.js`. +* **javascriptDocumentTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is myTransform. 
For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). For example, `transform`. @@ -241,20 +241,20 @@ resource "google_dataflow_flex_template_job" "mongodb_to_bigquery_cdc" { region = var.region parameters = { mongoDbUri = "" - database = "my-db" - collection = "my-collection" + database = "" + collection = "" userOption = "NONE" inputTopic = "" outputTableSpec = "" # useStorageWriteApiAtLeastOnce = "false" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # filter = "{ "val": { $gt: 0, $lt: 9 }}" + # KMSEncryptionKey = "" + # filter = "" # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" # storageWriteApiTriggeringFrequencySec = "" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # javascriptDocumentTransformGcsPath = "gs://your-bucket/your-transforms/*.js" - # javascriptDocumentTransformFunctionName = "transform" + # bigQuerySchemaPath = "" + # javascriptDocumentTransformGcsPath = "" + # javascriptDocumentTransformFunctionName = "" } } ``` diff --git a/v2/mongodb-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/mongodb/options/MongoDbToBigQueryOptions.java b/v2/mongodb-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/mongodb/options/MongoDbToBigQueryOptions.java index 7730361a81..c0b7d01bef 100644 --- a/v2/mongodb-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/mongodb/options/MongoDbToBigQueryOptions.java +++ b/v2/mongodb-to-googlecloud/src/main/java/com/google/cloud/teleport/v2/mongodb/options/MongoDbToBigQueryOptions.java @@ -105,7 +105,7 @@ public interface PubSubOptions extends PipelineOptions, DataflowPipelineOptions groupName = "Source", description = "Pub/Sub input topic", helpText = - "The Pub/Sub input topic to read from, in the format of projects//topics/.") + "The Pub/Sub input topic to read from, in the format of `projects//topics/`.") String getInputTopic(); void 
setInputTopic(String inputTopic); diff --git a/v2/mqtt-to-pubsub/README_Mqtt_to_PubSub.md b/v2/mqtt-to-pubsub/README_Mqtt_to_PubSub.md index 7f6ff8fd2e..41137b1187 100644 --- a/v2/mqtt-to-pubsub/README_Mqtt_to_PubSub.md +++ b/v2/mqtt-to-pubsub/README_Mqtt_to_PubSub.md @@ -19,14 +19,14 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputTopic** : The name of the MQTT topic that data is read from. (Example: topic). -* **outputTopic** : The name of the output Pub/Sub topic that data is written to. (Example: projects/your-project-id/topics/your-topic-name). -* **username** : The username to use for authentication on the MQTT server. (Example: sampleusername). -* **password** : The password associated with the provided username. (Example: samplepassword). +* **inputTopic**: The name of the MQTT topic that data is read from. For example, `topic`. +* **outputTopic**: The name of the output Pub/Sub topic that data is written to. For example, `projects/your-project-id/topics/your-topic-name`. +* **username**: The username to use for authentication on the MQTT server. For example, `sampleusername`. +* **password**: The password associated with the provided username. For example, `samplepassword`. ### Optional parameters -* **brokerServer** : The MQTT broker server IP or host. (Example: tcp://host:1883). +* **brokerServer**: The MQTT broker server IP or host. For example, `tcp://host:1883`. 
@@ -200,11 +200,11 @@ resource "google_dataflow_flex_template_job" "mqtt_to_pubsub" { name = "mqtt-to-pubsub" region = var.region parameters = { - inputTopic = "topic" - outputTopic = "projects/your-project-id/topics/your-topic-name" - username = "sampleusername" - password = "samplepassword" - # brokerServer = "tcp://host:1883" + inputTopic = "" + outputTopic = "" + username = "" + password = "" + # brokerServer = "" } } ``` diff --git a/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub.md b/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub.md index 67ea295e9b..0a81250a78 100644 --- a/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub.md +++ b/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub.md @@ -18,20 +18,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma-separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : The query to run on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The Pub/Sub topic to publish to, in the format projects//topics/. (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. 
For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: The query to run on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The Pub/Sub topic to publish to. For example, `projects//topics/`. ### Optional parameters -* **username** : The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **password** : The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. 
Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **username**: The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **password**: The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. 
These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. @@ -223,17 +223,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub" { name = "jdbc-to-pubsub" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # connectionProperties = "" + # KMSEncryptionKey = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md b/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md index ccdb7a6f23..de73de30df 100644 --- a/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md +++ b/v2/mysql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md @@ -15,23 +15,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma separate Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). 
-* **query** : Query to be executed on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: Query to be executed on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The name of the topic to publish data to. For example, `projects//topics/`. ### Optional parameters -* **username** : User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). 
-* **partitionColumn** : If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. -* **table** : Table to read from using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. -* **lowerBound** : Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). -* **upperBound** : Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. 
+* **partitionColumn**: If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. +* **table**: Table to read from using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. +* **lowerBound**: Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **upperBound**: Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). @@ -232,17 +232,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub_auto" { name = "jdbc-to-pubsub-auto" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # connectionProperties = "" + # KMSEncryptionKey = "" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" diff --git a/v2/mysql-to-googlecloud/README_MySQL_to_BigQuery.md b/v2/mysql-to-googlecloud/README_MySQL_to_BigQuery.md index 55f077b121..2c2b183ba0 100644 --- a/v2/mysql-to-googlecloud/README_MySQL_to_BigQuery.md +++ b/v2/mysql-to-googlecloud/README_MySQL_to_BigQuery.md @@ -23,34 +23,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionURL** : The JDBC connection URL string. For example, `jdbc:mysql://some-host:3306/sampledb`. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key. (Example: jdbc:mysql://some-host:3306/sampledb). -* **outputTable** : The BigQuery output table location. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : The temporary directory for the BigQuery loading process. (Example: gs://your-bucket/your-files/temp_dir). +* **connectionURL**: The JDBC connection URL string. For example, `jdbc:mysql://some-host:3306/sampledb`. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **outputTable**: The BigQuery output table location. For example, `:.`. +* **bigQueryLoadingTemporaryDirectory**: The temporary directory for the BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`.For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. (Example: unicode=true;characterEncoding=UTF-8). -* **username** : The username to use for the JDBC connection. 
Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **password** : The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **query** : The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are: -DATETIME --> TIMESTAMP - -Type casting may be required if your schemas do not match. This parameter can be set to a gs:// path pointing to a file in Cloud Storage to load the query from. The file encoding should be UTF-8. (Example: select * from sampledb.sample_table). -* **KMSEncryptionKey** : The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **useColumnAlias** : If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. -* **isTruncate** : If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. -* **partitionColumn** : If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. -* **table** : The table to read from when using partitions. This parameter also accepts a subquery in parentheses. 
(Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. -* **lowerBound** : The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **upperBound** : The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **fetchSize** : The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. -* **createDisposition** : The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). 
-* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **password**: The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **query**: The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are `DATETIME --> TIMESTAMP`. Type casting may be required if your schemas do not match. For example, `select * from sampledb.sample_table`. +* **KMSEncryptionKey**: The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. 
+* **useColumnAlias**: If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. +* **isTruncate**: If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. +* **partitionColumn**: If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. +* **table**: The table to read from when using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. +* **lowerBound**: The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **upperBound**: The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **fetchSize**: The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. +* **createDisposition**: The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to `CREATE_IF_NEEDED`, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. 
+* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the pipeline will fail on write errors. This parameter can only be specified if `useStorageWriteApi` or `useStorageWriteApiAtLeastOnce` is set to true. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. 
@@ -149,6 +147,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -176,6 +175,7 @@ gcloud dataflow flex-template run "mysql-to-bigquery-job" \ --parameters "fetchSize=$FETCH_SIZE" \ --parameters "createDisposition=$CREATE_DISPOSITION" \ --parameters "bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH" \ + --parameters "outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE" \ --parameters "disabledAlgorithms=$DISABLED_ALGORITHMS" \ --parameters "extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ --parameters "useStorageWriteApi=$USE_STORAGE_WRITE_API" \ @@ -218,6 +218,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -230,7 +231,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="mysql-to-bigquery-job" \ -DtemplateName="MySQL_to_BigQuery" \ --Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ 
+-Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ -f v2/mysql-to-googlecloud ``` @@ -275,26 +276,27 @@ resource "google_dataflow_flex_template_job" "mysql_to_bigquery" { name = "mysql-to-bigquery" region = var.region parameters = { - connectionURL = "jdbc:mysql://some-host:3306/sampledb" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + connectionURL = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" + # connectionProperties = "" # username = "" # password = "" - # query = "select * from sampledb.sample_table" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # query = "" + # KMSEncryptionKey = "" # useColumnAlias = "false" # isTruncate = "false" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" # fetchSize = "50000" # createDisposition = "CREATE_NEVER" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # bigQuerySchemaPath = "" + # outputDeadletterTable = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" } diff --git a/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub.md b/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub.md index e7f38605fa..f00889584a 100644 --- a/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub.md +++ b/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub.md @@ -18,20 +18,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma-separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : The query to run on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The Pub/Sub topic to publish to, in the format projects//topics/. (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. 
For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: The query to run on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The Pub/Sub topic to publish to. For example, `projects//topics/`. ### Optional parameters -* **username** : The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **password** : The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. 
Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **username**: The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **password**: The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. 
These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. @@ -223,17 +223,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub" { name = "jdbc-to-pubsub" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # connectionProperties = "" + # KMSEncryptionKey = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub_Auto.md b/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub_Auto.md index 4d77c043e0..58e16a9142 100644 --- a/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub_Auto.md +++ b/v2/oracle-to-googlecloud/README_Jdbc_to_PubSub_Auto.md @@ -15,23 +15,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma separate Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). 
-* **query** : Query to be executed on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma separate Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: Query to be executed on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The name of the topic to publish data to. For example, `projects//topics/`. ### Optional parameters -* **username** : User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). 
-* **partitionColumn** : If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. -* **table** : Table to read from using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. -* **lowerBound** : Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). -* **upperBound** : Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. 
+* **partitionColumn**: If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. +* **table**: Table to read from using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. +* **lowerBound**: Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **upperBound**: Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). @@ -232,17 +232,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub_auto" { name = "jdbc-to-pubsub-auto" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # connectionProperties = "" + # KMSEncryptionKey = "" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" diff --git a/v2/oracle-to-googlecloud/README_Oracle_to_BigQuery.md b/v2/oracle-to-googlecloud/README_Oracle_to_BigQuery.md index ed7c592aea..b96dc5ddec 100644 --- a/v2/oracle-to-googlecloud/README_Oracle_to_BigQuery.md +++ b/v2/oracle-to-googlecloud/README_Oracle_to_BigQuery.md @@ -23,34 +23,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionURL** : The JDBC connection URL string. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key. Note the difference between an Oracle non-RAC database connection string (`jdbc:oracle:thin:@some-host::`) and an Oracle RAC database connection string (`jdbc:oracle:thin:@//some-host[:]/`). (Example: jdbc:oracle:thin:@some-host::). -* **outputTable** : The BigQuery output table location. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : The temporary directory for the BigQuery loading process. (Example: gs://your-bucket/your-files/temp_dir). +* **connectionURL**: The JDBC connection URL string. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key. Note the difference between an Oracle non-RAC database connection string (`jdbc:oracle:thin:@some-host::`) and an Oracle RAC database connection string (`jdbc:oracle:thin:@//some-host[:]/`). For example, `jdbc:oracle:thin:@some-host::`. +* **outputTable**: The BigQuery output table location. For example, `:.`. +* **bigQueryLoadingTemporaryDirectory**: The temporary directory for the BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **connectionProperties** : The properties string to use for the JDBC connection. 
The format of the string must be `[propertyName=property;]*`.For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. (Example: unicode=true;characterEncoding=UTF-8). -* **username** : The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **password** : The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **query** : The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are: -DATETIME --> TIMESTAMP - -Type casting may be required if your schemas do not match. This parameter can be set to a gs:// path pointing to a file in Cloud Storage to load the query from. The file encoding should be UTF-8. (Example: select * from sampledb.sample_table). -* **KMSEncryptionKey** : The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **useColumnAlias** : If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. -* **isTruncate** : If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. 
-* **partitionColumn** : If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. -* **table** : The table to read from when using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. -* **lowerBound** : The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **upperBound** : The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **fetchSize** : The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. -* **createDisposition** : The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. 
(Example: gs:///file.txt,projects//secrets//versions/). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **password**: The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **query**: The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are `DATETIME --> TIMESTAMP`. Type casting may be required if your schemas do not match. For example, `select * from sampledb.sample_table`.
+* **KMSEncryptionKey**: The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **useColumnAlias**: If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. +* **isTruncate**: If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. +* **partitionColumn**: If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. +* **table**: The table to read from when using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. +* **lowerBound**: The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **upperBound**: The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **fetchSize**: The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. +* **createDisposition**: The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. 
+* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to `CREATE_IF_NEEDED`, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. +* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the pipeline will fail on write errors. This parameter can only be specified if `useStorageWriteApi` or `useStorageWriteApiAtLeastOnce` is set to true. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`.
@@ -149,6 +147,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -176,6 +175,7 @@ gcloud dataflow flex-template run "oracle-to-bigquery-job" \ --parameters "fetchSize=$FETCH_SIZE" \ --parameters "createDisposition=$CREATE_DISPOSITION" \ --parameters "bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH" \ + --parameters "outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE" \ --parameters "disabledAlgorithms=$DISABLED_ALGORITHMS" \ --parameters "extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ --parameters "useStorageWriteApi=$USE_STORAGE_WRITE_API" \ @@ -218,6 +218,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -230,7 +231,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="oracle-to-bigquery-job" \ -DtemplateName="Oracle_to_BigQuery" \ --Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ 
+-Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ -f v2/oracle-to-googlecloud ``` @@ -275,26 +276,27 @@ resource "google_dataflow_flex_template_job" "oracle_to_bigquery" { name = "oracle-to-bigquery" region = var.region parameters = { - connectionURL = "jdbc:oracle:thin:@some-host::" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + connectionURL = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" + # connectionProperties = "" # username = "" # password = "" - # query = "select * from sampledb.sample_table" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # query = "" + # KMSEncryptionKey = "" # useColumnAlias = "false" # isTruncate = "false" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" # fetchSize = "50000" # createDisposition = "CREATE_NEVER" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # bigQuerySchemaPath = "" + # outputDeadletterTable = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" } diff --git a/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub.md b/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub.md index 7f93601630..395fcba04a 100644 --- a/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub.md +++ b/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub.md @@ -18,20 +18,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma-separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : The query to run on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The Pub/Sub topic to publish to, in the format projects//topics/. (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. 
For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64'. An example of an unencrypted URL is `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: The query to run on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The Pub/Sub topic to publish to. For example, `projects//topics/`. ### Optional parameters -* **username** : The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **password** : The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled.
Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **username**: The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **password**: The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker.
These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. @@ -223,17 +223,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub" { name = "jdbc-to-pubsub" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # connectionProperties = "" + # KMSEncryptionKey = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md b/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md index 66f3d6cf7e..412fd6d0cb 100644 --- a/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md +++ b/v2/postgresql-to-googlecloud/README_Jdbc_to_PubSub_Auto.md @@ -15,23 +15,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma separate Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). 
-* **query** : Query to be executed on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: URL connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: Query to be executed on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The name of the topic to publish data to. For example, `projects//topics/`. ### Optional parameters -* **username** : User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key).
-* **partitionColumn** : If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. -* **table** : Table to read from using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. -* **lowerBound** : Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). -* **upperBound** : Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint (see https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt). For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`.
+* **partitionColumn**: If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. +* **table**: Table to read from using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. +* **lowerBound**: Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **upperBound**: Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). @@ -232,17 +232,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub_auto" { name = "jdbc-to-pubsub-auto" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # connectionProperties = "" + # KMSEncryptionKey = "" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" diff --git a/v2/postgresql-to-googlecloud/README_PostgreSQL_to_BigQuery.md b/v2/postgresql-to-googlecloud/README_PostgreSQL_to_BigQuery.md index 3c69cc5bcc..6d556f65bc 100644 --- a/v2/postgresql-to-googlecloud/README_PostgreSQL_to_BigQuery.md +++ b/v2/postgresql-to-googlecloud/README_PostgreSQL_to_BigQuery.md @@ -23,34 +23,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionURL** : The JDBC connection URL string. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key. (Example: jdbc:postgresql://some-host:5432/sampledb). -* **outputTable** : The BigQuery output table location. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : The temporary directory for the BigQuery loading process. (Example: gs://your-bucket/your-files/temp_dir). +* **connectionURL**: The JDBC connection URL string. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key. For example, `jdbc:postgresql://some-host:5432/sampledb`. +* **outputTable**: The BigQuery output table location. For example, `:.`. +* **bigQueryLoadingTemporaryDirectory**: The temporary directory for the BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`.For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. (Example: unicode=true;characterEncoding=UTF-8). -* **username** : The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. 
-* **password** : The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **query** : The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are: -DATETIME --> TIMESTAMP - -Type casting may be required if your schemas do not match. This parameter can be set to a gs:// path pointing to a file in Cloud Storage to load the query from. The file encoding should be UTF-8. (Example: select * from sampledb.sample_table). -* **KMSEncryptionKey** : The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **useColumnAlias** : If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. -* **isTruncate** : If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. -* **partitionColumn** : If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. -* **table** : The table to read from when using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. 
With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. -* **lowerBound** : The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **upperBound** : The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **fetchSize** : The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. -* **createDisposition** : The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. 
To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **password**: The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **query**: The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are `DATETIME --> TIMESTAMP`. Type casting may be required if your schemas do not match. For example, `select * from sampledb.sample_table`. +* **KMSEncryptionKey**: The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **useColumnAlias**: If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery.
Defaults to `false`. +* **isTruncate**: If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. +* **partitionColumn**: If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. +* **table**: The table to read from when using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. +* **lowerBound**: The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **upperBound**: The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **fetchSize**: The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. +* **createDisposition**: The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to `CREATE_IF_NEEDED`, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. +* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. 
If this parameter is not specified, the pipeline will fail on write errors. This parameter can only be specified if `useStorageWriteApi` or `useStorageWriteApiAtLeastOnce` is set to true. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. 
@@ -149,6 +147,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -176,6 +175,7 @@ gcloud dataflow flex-template run "postgresql-to-bigquery-job" \ --parameters "fetchSize=$FETCH_SIZE" \ --parameters "createDisposition=$CREATE_DISPOSITION" \ --parameters "bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH" \ + --parameters "outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE" \ --parameters "disabledAlgorithms=$DISABLED_ALGORITHMS" \ --parameters "extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ --parameters "useStorageWriteApi=$USE_STORAGE_WRITE_API" \ @@ -218,6 +218,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -230,7 +231,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="postgresql-to-bigquery-job" \ -DtemplateName="PostgreSQL_to_BigQuery" \ --Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ 
+-Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ -f v2/postgresql-to-googlecloud ``` @@ -275,26 +276,27 @@ resource "google_dataflow_flex_template_job" "postgresql_to_bigquery" { name = "postgresql-to-bigquery" region = var.region parameters = { - connectionURL = "jdbc:postgresql://some-host:5432/sampledb" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + connectionURL = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" + # connectionProperties = "" # username = "" # password = "" - # query = "select * from sampledb.sample_table" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # query = "" + # KMSEncryptionKey = "" # useColumnAlias = "false" # isTruncate = "false" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" # fetchSize = "50000" # createDisposition = "CREATE_NEVER" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # bigQuerySchemaPath = "" + # outputDeadletterTable = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" } diff --git a/v2/pubsub-binary-to-bigquery/README_PubSub_Avro_to_BigQuery.md b/v2/pubsub-binary-to-bigquery/README_PubSub_Avro_to_BigQuery.md index 5552b01456..c32974299d 100644 --- a/v2/pubsub-binary-to-bigquery/README_PubSub_Avro_to_BigQuery.md +++ b/v2/pubsub-binary-to-bigquery/README_PubSub_Avro_to_BigQuery.md @@ -19,19 +19,19 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **schemaPath** : The Cloud Storage location of the Avro schema file. For example, `gs://path/to/my/schema.avsc`. -* **inputSubscription** : The Pub/Sub input subscription to read from. (Example: projects//subscription/). -* **outputTableSpec** : The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. -* **outputTopic** : The Pub/Sub topic to use for unprocessed records. (Example: projects//topics/). +* **schemaPath**: The Cloud Storage location of the Avro schema file. For example, `gs://path/to/my/schema.avsc`. +* **inputSubscription**: The Pub/Sub input subscription to read from. For example, `projects//subscription/`. +* **outputTableSpec**: The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. 
+* **outputTopic**: The Pub/Sub topic to use for unprocessed records. For example, `projects//topics/`. ### Optional parameters -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. 
To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. 
@@ -221,9 +221,9 @@ resource "google_dataflow_flex_template_job" "pubsub_avro_to_bigquery" { region = var.region parameters = { schemaPath = "" - inputSubscription = "projects//subscription/" + inputSubscription = "" outputTableSpec = "" - outputTopic = "projects//topics/" + outputTopic = "" # useStorageWriteApiAtLeastOnce = "false" # writeDisposition = "WRITE_APPEND" # createDisposition = "CREATE_IF_NEEDED" diff --git a/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Flex.md b/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Flex.md index 01d0802b7d..0768dcf896 100644 --- a/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Flex.md +++ b/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Flex.md @@ -23,26 +23,26 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **protoSchemaPath** : The Cloud Storage location of the self-contained proto schema file. For example, gs://path/to/my/file.pb. You can generate this file with the `--descriptor_set_out` flag of the protoc command. The `--include_imports` flag guarantees that the file is self-contained. -* **fullMessageName** : The full proto message name. For example, `package.name`. `MessageName`, where `package.name` is the value provided for the `package` statement and not the `java_package` statement. -* **inputSubscription** : The Pub/Sub input subscription to read from. (Example: projects//subscription/). -* **outputTableSpec** : The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. -* **outputTopic** : The Pub/Sub topic to use for unprocessed records. (Example: projects//topics/). +* **protoSchemaPath**: The Cloud Storage location of the self-contained proto schema file. For example, `gs://path/to/my/file.pb`. 
You can generate this file with the `--descriptor_set_out` flag of the protoc command. The `--include_imports` flag guarantees that the file is self-contained. +* **fullMessageName**: The full proto message name. For example, `package.name`. `MessageName`, where `package.name` is the value provided for the `package` statement and not the `java_package` statement. +* **inputSubscription**: The Pub/Sub input subscription to read from. For example, `projects//subscription/`. +* **outputTableSpec**: The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. +* **outputTopic**: The Pub/Sub topic to use for unprocessed records. For example, `projects//topics/`. ### Optional parameters -* **preserveProtoFieldNames** : To preserve the original proto field name in JSON, set this property to true. To use more standard JSON names, set to false. For example, `false` would change `field_name` to `fieldName`. Defaults to: false. -* **bigQueryTableSchemaPath** : The Cloud Storage path to the BigQuery schema path. If this value isn't provided, then the schema is inferred from the Proto schema. (Example: gs://MyBucket/bq_schema.json). -* **udfOutputTopic** : The Pub/Sub topic storing the UDF errors. If this value isn't provided, UDF errors are sent to the same topic as `outputTopic`. (Example: projects/your-project-id/topics/your-topic-name). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. 
-* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. 
-* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **preserveProtoFieldNames**: To preserve the original proto field name in JSON, set this property to `true`. To use more standard JSON names, set to `false`. For example, `false` would change `field_name` to `fieldName`. Defaults to: `false`. +* **bigQueryTableSchemaPath**: The Cloud Storage path to the BigQuery schema path. If this value isn't provided, then the schema is inferred from the Proto schema. For example, `gs://MyBucket/bq_schema.json`. +* **udfOutputTopic**: The Pub/Sub topic storing the UDF errors. If this value isn't provided, UDF errors are sent to the same topic as `outputTopic`. For example, `projects/your-project-id/topics/your-topic-name`. +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. 
+* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. 
## User-Defined functions (UDFs) @@ -264,16 +264,16 @@ resource "google_dataflow_flex_template_job" "pubsub_proto_to_bigquery_flex" { parameters = { protoSchemaPath = "" fullMessageName = "" - inputSubscription = "projects//subscription/" + inputSubscription = "" outputTableSpec = "" - outputTopic = "projects//topics/" + outputTopic = "" # preserveProtoFieldNames = "false" - # bigQueryTableSchemaPath = "gs://MyBucket/bq_schema.json" - # udfOutputTopic = "projects/your-project-id/topics/your-topic-name" + # bigQueryTableSchemaPath = "" + # udfOutputTopic = "" # useStorageWriteApiAtLeastOnce = "false" # writeDisposition = "WRITE_APPEND" # createDisposition = "CREATE_IF_NEEDED" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" # useStorageWriteApi = "false" diff --git a/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Xlang.md b/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Xlang.md index 7987ba66bd..c0bde36166 100644 --- a/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Xlang.md +++ b/v2/pubsub-binary-to-bigquery/README_PubSub_Proto_to_BigQuery_Xlang.md @@ -23,25 +23,25 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **protoSchemaPath** : The Cloud Storage location of the self-contained proto schema file. For example, gs://path/to/my/file.pb. You can generate this file with the `--descriptor_set_out` flag of the protoc command. The `--include_imports` flag guarantees that the file is self-contained. -* **fullMessageName** : The full proto message name. For example, `package.name`. `MessageName`, where `package.name` is the value provided for the `package` statement and not the `java_package` statement. -* **inputSubscription** : The Pub/Sub input subscription to read from. (Example: projects//subscription/). 
-* **outputTableSpec** : The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. -* **outputTopic** : The Pub/Sub topic to use for unprocessed records. (Example: projects//topics/). +* **protoSchemaPath**: The Cloud Storage location of the self-contained proto schema file. For example, `gs://path/to/my/file.pb`. You can generate this file with the `--descriptor_set_out` flag of the protoc command. The `--include_imports` flag guarantees that the file is self-contained. +* **fullMessageName**: The full proto message name. For example, `package.name`. `MessageName`, where `package.name` is the value provided for the `package` statement and not the `java_package` statement. +* **inputSubscription**: The Pub/Sub input subscription to read from. For example, `projects//subscription/`. +* **outputTableSpec**: The BigQuery output table location to write the output to. For example, `:.`.Depending on the `createDisposition` specified, the output table might be created automatically using the user provided Avro schema. +* **outputTopic**: The Pub/Sub topic to use for unprocessed records. For example, `projects//topics/`. ### Optional parameters -* **preserveProtoFieldNames** : To preserve the original proto field name in JSON, set this property to true. To use more standard JSON names, set to false. For example, `false` would change `field_name` to `fieldName`. Defaults to: false. -* **bigQueryTableSchemaPath** : The Cloud Storage path to the BigQuery schema path. If this value isn't provided, then the schema is inferred from the Proto schema. (Example: gs://MyBucket/bq_schema.json). -* **udfOutputTopic** : The Pub/Sub topic storing the UDF errors. If this value isn't provided, UDF errors are sent to the same topic as `outputTopic`. (Example: projects/your-project-id/topics/your-topic-name). 
-* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **writeDisposition** : The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. -* **createDisposition** : The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. 
If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **preserveProtoFieldNames**: To preserve the original proto field name in JSON, set this property to `true`. To use more standard JSON names, set to `false`. For example, `false` would change `field_name` to `fieldName`. Defaults to: `false`. +* **bigQueryTableSchemaPath**: The Cloud Storage path to the BigQuery schema path. If this value isn't provided, then the schema is inferred from the Proto schema. For example, `gs://MyBucket/bq_schema.json`. +* **udfOutputTopic**: The Pub/Sub topic storing the UDF errors. If this value isn't provided, UDF errors are sent to the same topic as `outputTopic`. For example, `projects/your-project-id/topics/your-topic-name`. +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **writeDisposition**: The BigQuery WriteDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload) value. For example, `WRITE_APPEND`, `WRITE_EMPTY`, or `WRITE_TRUNCATE`. Defaults to `WRITE_APPEND`. +* **createDisposition**: The BigQuery CreateDisposition (https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#jobconfigurationload). For example, `CREATE_IF_NEEDED` and `CREATE_NEVER`. Defaults to `CREATE_IF_NEEDED`. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. 
For example, `'transform' or 'transform_udf1'`. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. @@ -250,17 +250,17 @@ resource "google_dataflow_flex_template_job" "pubsub_proto_to_bigquery_xlang" { parameters = { protoSchemaPath = "" fullMessageName = "" - inputSubscription = "projects//subscription/" + inputSubscription = "" outputTableSpec = "" - outputTopic = "projects//topics/" + outputTopic = "" # preserveProtoFieldNames = "false" - # bigQueryTableSchemaPath = "gs://MyBucket/bq_schema.json" - # udfOutputTopic = "projects/your-project-id/topics/your-topic-name" + # bigQueryTableSchemaPath = "" + # udfOutputTopic = "" # useStorageWriteApiAtLeastOnce = "false" # writeDisposition = "WRITE_APPEND" # createDisposition = "CREATE_IF_NEEDED" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" # useStorageWriteApi = "false" # numStorageWriteApiStreams = "0" # storageWriteApiTriggeringFrequencySec = "" diff --git a/v2/pubsub-binary-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/PubsubProtoToBigQuery.java 
b/v2/pubsub-binary-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/PubsubProtoToBigQuery.java index a5bcd521b5..8ef6f65738 100644 --- a/v2/pubsub-binary-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/PubsubProtoToBigQuery.java +++ b/v2/pubsub-binary-to-bigquery/src/main/java/com/google/cloud/teleport/v2/templates/PubsubProtoToBigQuery.java @@ -164,7 +164,7 @@ public interface PubSubProtoToBigQueryOptions description = "Cloud Storage Path to the Proto Schema File", helpText = "The Cloud Storage location of the self-contained proto schema file. For example," - + " gs://path/to/my/file.pb. You can generate this file with" + + " `gs://path/to/my/file.pb`. You can generate this file with" + " the `--descriptor_set_out` flag of the protoc command." + " The `--include_imports` flag guarantees that the file is self-contained.") @Required @@ -190,9 +190,9 @@ public interface PubSubProtoToBigQueryOptions optional = true, description = "Preserve Proto Field Names", helpText = - "To preserve the original proto field name in JSON, set this property to true. " - + "To use more standard JSON names, set to false." - + " For example, `false` would change `field_name` to `fieldName`. Defaults to: false.") + "To preserve the original proto field name in JSON, set this property to `true`. " + + "To use more standard JSON names, set to `false`." + + " For example, `false` would change `field_name` to `fieldName`. 
Defaults to: `false`.") @Default.Boolean(false) Boolean getPreserveProtoFieldNames(); diff --git a/v2/pubsub-cdc-to-bigquery/README_PubSub_CDC_to_BigQuery.md b/v2/pubsub-cdc-to-bigquery/README_PubSub_CDC_to_BigQuery.md index 483a0b6f89..1783b80970 100644 --- a/v2/pubsub-cdc-to-bigquery/README_PubSub_CDC_to_BigQuery.md +++ b/v2/pubsub-cdc-to-bigquery/README_PubSub_CDC_to_BigQuery.md @@ -15,30 +15,30 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name' (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **outputTableNameTemplate** : The location of the BigQuery table to write the output to. If a table does not already exist one will be created automatically. Defaults to: _metadata_table. +* **inputSubscription**: Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name' For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **outputTableNameTemplate**: The location of the BigQuery table to write the output to. If a table does not already exist one will be created automatically. Defaults to: _metadata_table. ### Optional parameters -* **autoMapTables** : Determines if new columns and tables should be automatically created in BigQuery. Defaults to: true. -* **schemaFilePath** : This is the file location that contains the table definition to be used when creating the table in BigQuery. If left blank the table will get created with generic string typing. -* **outputDatasetTemplate** : The name for the dataset to contain the replica table. Defaults to: {_metadata_dataset}. -* **outputTableSpec** : BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. 
-* **outputDeadletterTable** : BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. (Example: your-project-id:your-dataset.your-table-name). -* **deadLetterQueueDirectory** : The name of the directory on Cloud Storage you want to write dead letters messages to. -* **windowDuration** : The window duration/size in which DLQ data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 5m). Defaults to: 5s. -* **threadCount** : The number of parallel threads you want to split your data into. Defaults to: 100. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. -* **pythonTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. 
(Example: gs://your-bucket/your-transforms/*.py). -* **pythonRuntimeVersion** : The runtime version to use for this Python UDF. -* **pythonTextTransformFunctionName** : The name of the function to call from your JavaScript file. Use only letters, digits, and underscores. (Example: transform_udf1). -* **runtimeRetries** : The number of times a runtime will be retried before failing. Defaults to: 5. -* **useStorageWriteApi** : If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. -* **numStorageWriteApiStreams** : When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. -* **storageWriteApiTriggeringFrequencySec** : When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. +* **autoMapTables**: Determines if new columns and tables should be automatically created in BigQuery. Defaults to: true. +* **schemaFilePath**: This is the file location that contains the table definition to be used when creating the table in BigQuery. If left blank the table will get created with generic string typing. 
+* **outputDatasetTemplate**: The name for the dataset to contain the replica table. Defaults to: {_metadata_dataset}. +* **outputTableSpec**: BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. +* **outputDeadletterTable**: BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. For example, `your-project-id:your-dataset.your-table-name`. +* **deadLetterQueueDirectory**: The name of the directory on Cloud Storage you want to write dead letters messages to. +* **windowDuration**: The window duration/size in which DLQ data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `5m`. Defaults to: 5s. +* **threadCount**: The number of parallel threads you want to split your data into. Defaults to: 100. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. 
This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. +* **pythonTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-transforms/*.py`. +* **pythonRuntimeVersion**: The runtime version to use for this Python UDF. +* **pythonTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `transform_udf1`. +* **runtimeRetries**: The number of times a runtime will be retried before failing. Defaults to: 5. +* **useStorageWriteApi**: If true, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **numStorageWriteApiStreams**: When using the Storage Write API, specifies the number of write streams. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter. Defaults to: 0. +* **storageWriteApiTriggeringFrequencySec**: When using the Storage Write API, specifies the triggering frequency, in seconds. If `useStorageWriteApi` is `true` and `useStorageWriteApiAtLeastOnce` is `false`, then you must set this parameter.
## User-Defined functions (UDFs) @@ -270,22 +270,22 @@ resource "google_dataflow_flex_template_job" "pubsub_cdc_to_bigquery" { name = "pubsub-cdc-to-bigquery" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" + inputSubscription = "" outputTableNameTemplate = "_metadata_table" # autoMapTables = "true" # schemaFilePath = "" # outputDatasetTemplate = "{_metadata_dataset}" # outputTableSpec = "" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" + # outputDeadletterTable = "" # deadLetterQueueDirectory = "" - # windowDuration = "5m" + # windowDuration = "5s" # threadCount = "100" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" - # pythonTextTransformGcsPath = "gs://your-bucket/your-transforms/*.py" + # pythonTextTransformGcsPath = "" # pythonRuntimeVersion = "" - # pythonTextTransformFunctionName = "transform_udf1" + # pythonTextTransformFunctionName = "" # runtimeRetries = "5" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" diff --git a/v2/pubsub-to-jms/README_Pubsub_to_Jms.md b/v2/pubsub-to-jms/README_Pubsub_to_Jms.md index 3f6c19ffcb..765d1431b6 100644 --- a/v2/pubsub-to-jms/README_Pubsub_to_Jms.md +++ b/v2/pubsub-to-jms/README_Pubsub_to_Jms.md @@ -14,15 +14,15 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name' (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **outputName** : JMS Queue/Topic Name to write the input to. (Example: queue). -* **outputType** : JMS Destination Type to Write the input to. (Example: queue). 
-* **username** : JMS username for authentication with JMS server (Example: sampleusername). -* **password** : Password for username provided for authentication with JMS server (Example: samplepassword). +* **inputSubscription**: Pub/Sub subscription to read the input from, in the format of 'projects/your-project-id/subscriptions/your-subscription-name'. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **outputName**: JMS Queue/Topic Name to write the input to. For example, `queue`. +* **outputType**: JMS Destination Type to Write the input to. For example, `queue`. +* **username**: JMS username for authentication with JMS server. For example, `sampleusername`. +* **password**: Password for username provided for authentication with JMS server. For example, `samplepassword`. ### Optional parameters -* **jmsServer** : Server IP for JMS Host (Example: host:5672). +* **jmsServer**: Server IP for JMS Host. For example, `host:5672`. @@ -199,12 +199,12 @@ resource "google_dataflow_flex_template_job" "pubsub_to_jms" { name = "pubsub-to-jms" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - outputName = "queue" - outputType = "queue" - username = "sampleusername" - password = "samplepassword" - # jmsServer = "host:5672" + inputSubscription = "" + outputName = "" + outputType = "" + username = "" + password = "" + # jmsServer = "" } } ``` diff --git a/v2/pubsub-to-kafka/README_PubSub_to_Kafka.md b/v2/pubsub-to-kafka/README_PubSub_to_Kafka.md index d86eaa04c8..6d4ca3f500 100644 --- a/v2/pubsub-to-kafka/README_PubSub_to_Kafka.md +++ b/v2/pubsub-to-kafka/README_PubSub_to_Kafka.md @@ -14,18 +14,18 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputTopic** : The name of the topic from which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example:
projects/your-project-id/topics/your-topic-name). -* **outputTopic** : Kafka topic to write the input from pubsub. (Example: topic). -* **outputDeadLetterTopic** : The Pub/Sub topic to publish deadletter records to. The name should be in the format of projects/your-project-id/topics/your-topic-name. +* **inputTopic**: The name of the topic from which data should be published, in the format of 'projects/your-project-id/topics/your-topic-name'. For example, `projects/your-project-id/topics/your-topic-name`. +* **outputTopic**: Kafka topic to write the input from Pub/Sub. For example, `topic`. +* **outputDeadLetterTopic**: The Pub/Sub topic to publish deadletter records to. The name should be in the format of projects/your-project-id/topics/your-topic-name. ### Optional parameters -* **bootstrapServer** : Kafka Bootstrap Server (Example: localhost:9092). -* **secretStoreUrl** : URL to credentials in Vault. -* **vaultToken** : Token to use for Vault. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0.
+* **bootstrapServer**: Kafka Bootstrap Server. For example, `localhost:9092`. +* **secretStoreUrl**: URL to credentials in Vault. +* **vaultToken**: Token to use for Vault. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`.
## User-Defined functions (UDFs) @@ -221,13 +221,13 @@ resource "google_dataflow_flex_template_job" "pubsub_to_kafka" { name = "pubsub-to-kafka" region = var.region parameters = { - inputTopic = "projects/your-project-id/topics/your-topic-name" - outputTopic = "topic" + inputTopic = "" + outputTopic = "" outputDeadLetterTopic = "" - # bootstrapServer = "localhost:9092" + # bootstrapServer = "" # secretStoreUrl = "" # vaultToken = "" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" } diff --git a/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB.md b/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB.md index 5e13f69c30..1eeb98c5fb 100644 --- a/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB.md +++ b/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB.md @@ -23,24 +23,24 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Name of the Pub/Sub subscription. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **mongoDBUri** : Comma separated list of MongoDB servers. (Example: host1:port,host2:port,host3:port). -* **database** : Database in MongoDB to store the collection. (Example: my-db). -* **collection** : Name of the collection in the MongoDB database. (Example: my-collection). -* **deadletterTable** : The BigQuery table that stores messages caused by failures, such as mismatched schema, malformed JSON, and so on. (Example: your-project-id:your-dataset.your-table-name). +* **inputSubscription**: Name of the Pub/Sub subscription. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **mongoDBUri**: Comma separated list of MongoDB servers. For example, `host1:port,host2:port,host3:port`. +* **database**: Database in MongoDB to store the collection. For example, `my-db`. 
+* **collection**: Name of the collection in the MongoDB database. For example, `my-collection`. +* **deadletterTable**: The BigQuery table that stores messages caused by failures, such as mismatched schema, malformed JSON, and so on. For example, `your-project-id:your-dataset.your-table-name`. ### Optional parameters -* **batchSize** : Batch size used for batch insertion of documents into MongoDB. Defaults to: 1000. -* **batchSizeBytes** : Batch size in bytes. Defaults to: 5242880. -* **maxConnectionIdleTime** : Maximum idle time allowed in seconds before connection timeout occurs. Defaults to: 60000. -* **sslEnabled** : Boolean value indicating whether the connection to MongoDB is SSL enabled. Defaults to: true. -* **ignoreSSLCertificate** : Boolean value indicating whether to ignore the SSL certificate. Defaults to: true. -* **withOrdered** : Boolean value enabling ordered bulk insertions into MongoDB. Defaults to: true. -* **withSSLInvalidHostNameAllowed** : Boolean value indicating whether an invalid hostname is allowed for the SSL connection. Defaults to: true. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. 
If the value is 0, UDF reloading is disabled. The default value is 0. +* **batchSize**: Batch size used for batch insertion of documents into MongoDB. Defaults to: 1000. +* **batchSizeBytes**: Batch size in bytes. Defaults to: 5242880. +* **maxConnectionIdleTime**: Maximum idle time allowed in seconds before connection timeout occurs. Defaults to: 60000. +* **sslEnabled**: Boolean value indicating whether the connection to MongoDB is SSL enabled. Defaults to: true. +* **ignoreSSLCertificate**: Boolean value indicating whether to ignore the SSL certificate. Defaults to: true. +* **withOrdered**: Boolean value enabling ordered bulk insertions into MongoDB. Defaults to: true. +* **withSSLInvalidHostNameAllowed**: Boolean value indicating whether an invalid hostname is allowed for the SSL connection. Defaults to: true. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. 
## User-Defined functions (UDFs) @@ -254,11 +254,11 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_mongodb" { name = "cloud-pubsub-to-mongodb" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - mongoDBUri = "host1:port,host2:port,host3:port" - database = "my-db" - collection = "my-collection" - deadletterTable = "your-project-id:your-dataset.your-table-name" + inputSubscription = "" + mongoDBUri = "" + database = "" + collection = "" + deadletterTable = "" # batchSize = "1000" # batchSizeBytes = "5242880" # maxConnectionIdleTime = "60000" @@ -266,7 +266,7 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_mongodb" { # ignoreSSLCertificate = "true" # withOrdered = "true" # withSSLInvalidHostNameAllowed = "true" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" } diff --git a/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB_Xlang.md b/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB_Xlang.md index d5a9977e44..21c811395e 100644 --- a/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB_Xlang.md +++ b/v2/pubsub-to-mongodb/README_Cloud_PubSub_to_MongoDB_Xlang.md @@ -23,23 +23,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : Name of the Pub/Sub subscription. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **mongoDBUri** : Comma separated list of MongoDB servers. (Example: host1:port,host2:port,host3:port). -* **database** : Database in MongoDB to store the collection. (Example: my-db). -* **collection** : Name of the collection in the MongoDB database. (Example: my-collection). 
-* **deadletterTable** : The BigQuery table that stores messages caused by failures, such as mismatched schema, malformed JSON, and so on. (Example: your-project-id:your-dataset.your-table-name). +* **inputSubscription**: Name of the Pub/Sub subscription. For example, `projects/your-project-id/subscriptions/your-subscription-name`. +* **mongoDBUri**: Comma separated list of MongoDB servers. For example, `host1:port,host2:port,host3:port`. +* **database**: Database in MongoDB to store the collection. For example, `my-db`. +* **collection**: Name of the collection in the MongoDB database. For example, `my-collection`. +* **deadletterTable**: The BigQuery table that stores messages caused by failures, such as mismatched schema, malformed JSON, and so on. For example, `your-project-id:your-dataset.your-table-name`. ### Optional parameters -* **batchSize** : Batch size used for batch insertion of documents into MongoDB. Defaults to: 1000. -* **batchSizeBytes** : Batch size in bytes. Defaults to: 5242880. -* **maxConnectionIdleTime** : Maximum idle time allowed in seconds before connection timeout occurs. Defaults to: 60000. -* **sslEnabled** : Boolean value indicating whether the connection to MongoDB is SSL enabled. Defaults to: true. -* **ignoreSSLCertificate** : Boolean value indicating whether to ignore the SSL certificate. Defaults to: true. -* **withOrdered** : Boolean value enabling ordered bulk insertions into MongoDB. Defaults to: true. -* **withSSLInvalidHostNameAllowed** : Boolean value indicating whether an invalid hostname is allowed for the SSL connection. Defaults to: true. -* **pythonExternalTextTransformGcsPath** : The Cloud Storage path pattern for the Python code containing your user-defined functions. (Example: gs://your-bucket/your-function.py). -* **pythonExternalTextTransformFunctionName** : The name of the function to call from your Python file. Use only letters, digits, and underscores. (Example: 'transform' or 'transform_udf1'). 
+* **batchSize**: Batch size used for batch insertion of documents into MongoDB. Defaults to: 1000. +* **batchSizeBytes**: Batch size in bytes. Defaults to: 5242880. +* **maxConnectionIdleTime**: Maximum idle time allowed in seconds before connection timeout occurs. Defaults to: 60000. +* **sslEnabled**: Boolean value indicating whether the connection to MongoDB is SSL enabled. Defaults to: true. +* **ignoreSSLCertificate**: Boolean value indicating whether to ignore the SSL certificate. Defaults to: true. +* **withOrdered**: Boolean value enabling ordered bulk insertions into MongoDB. Defaults to: true. +* **withSSLInvalidHostNameAllowed**: Boolean value indicating whether an invalid hostname is allowed for the SSL connection. Defaults to: true. +* **pythonExternalTextTransformGcsPath**: The Cloud Storage path pattern for the Python code containing your user-defined functions. For example, `gs://your-bucket/your-function.py`. +* **pythonExternalTextTransformFunctionName**: The name of the function to call from your Python file. Use only letters, digits, and underscores. For example, `'transform' or 'transform_udf1'`. 
@@ -240,11 +240,11 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_mongodb_xlang" { name = "cloud-pubsub-to-mongodb-xlang" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - mongoDBUri = "host1:port,host2:port,host3:port" - database = "my-db" - collection = "my-collection" - deadletterTable = "your-project-id:your-dataset.your-table-name" + inputSubscription = "" + mongoDBUri = "" + database = "" + collection = "" + deadletterTable = "" # batchSize = "1000" # batchSizeBytes = "5242880" # maxConnectionIdleTime = "60000" @@ -252,8 +252,8 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_mongodb_xlang" { # ignoreSSLCertificate = "true" # withOrdered = "true" # withSSLInvalidHostNameAllowed = "true" - # pythonExternalTextTransformGcsPath = "gs://your-bucket/your-function.py" - # pythonExternalTextTransformFunctionName = "'transform' or 'transform_udf1'" + # pythonExternalTextTransformGcsPath = "" + # pythonExternalTextTransformFunctionName = "" } } ``` diff --git a/v2/pubsub-to-redis/README_Cloud_PubSub_to_Redis.md b/v2/pubsub-to-redis/README_Cloud_PubSub_to_Redis.md index a654b6cef8..48d20cc9b0 100644 --- a/v2/pubsub-to-redis/README_Cloud_PubSub_to_Redis.md +++ b/v2/pubsub-to-redis/README_Cloud_PubSub_to_Redis.md @@ -26,20 +26,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **inputSubscription** : The Pub/Sub subscription to read the input from, in the format projects//subscriptions/. (Example: projects/your-project-id/subscriptions/your-subscription-name). -* **redisHost** : The Redis database host. (Example: your.cloud.db.redislabs.com). Defaults to: 127.0.0.1. -* **redisPort** : The Redis database port. (Example: 12345). Defaults to: 6379. -* **redisPassword** : The Redis database password. Defaults to empty. +* **inputSubscription**: The Pub/Sub subscription to read the input from. 
For example, `projects//subscriptions/`. +* **redisHost**: The Redis database host. For example, `your.cloud.db.redislabs.com`. Defaults to: 127.0.0.1. +* **redisPort**: The Redis database port. For example, `12345`. Defaults to: 6379. +* **redisPassword**: The Redis database password. Defaults to `empty`. ### Optional parameters -* **sslEnabled** : The Redis database SSL parameter. Defaults to: false. -* **redisSinkType** : The Redis sink. Supported values are `STRING_SINK, HASH_SINK, STREAMS_SINK, and LOGGING_SINK`. (Example: STRING_SINK). Defaults to: STRING_SINK. -* **connectionTimeout** : The Redis connection timeout in milliseconds. (Example: 2000). Defaults to: 2000. -* **ttl** : The key expiration time in seconds. The `ttl` default for `HASH_SINK` is -1, which means it never expires. -* **javascriptTextTransformGcsPath** : The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. (Example: gs://my-bucket/my-udfs/my_file.js). -* **javascriptTextTransformFunctionName** : The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). -* **javascriptTextTransformReloadIntervalMinutes** : Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is 0, UDF reloading is disabled. The default value is 0. +* **sslEnabled**: The Redis database SSL parameter. Defaults to: false. +* **redisSinkType**: The Redis sink. Supported values are `STRING_SINK, HASH_SINK, STREAMS_SINK, and LOGGING_SINK`. For example, `STRING_SINK`. 
Defaults to: STRING_SINK. +* **connectionTimeout**: The Redis connection timeout in milliseconds. For example, `2000`. Defaults to: 2000. +* **ttl**: The key expiration time in seconds. The `ttl` default for `HASH_SINK` is -1, which means it never expires. +* **javascriptTextTransformGcsPath**: The Cloud Storage URI of the .js file that defines the JavaScript user-defined function (UDF) to use. For example, `gs://my-bucket/my-udfs/my_file.js`. +* **javascriptTextTransformFunctionName**: The name of the JavaScript user-defined function (UDF) to use. For example, if your JavaScript function code is `myTransform(inJson) { /*...do stuff...*/ }`, then the function name is `myTransform`. For sample JavaScript UDFs, see UDF Examples (https://github.com/GoogleCloudPlatform/DataflowTemplates#udf-examples). +* **javascriptTextTransformReloadIntervalMinutes**: Specifies how frequently to reload the UDF, in minutes. If the value is greater than 0, Dataflow periodically checks the UDF file in Cloud Storage, and reloads the UDF if the file is modified. This parameter allows you to update the UDF while the pipeline is running, without needing to restart the job. If the value is `0`, UDF reloading is disabled. The default value is `0`. 
## User-Defined functions (UDFs) @@ -241,15 +241,15 @@ resource "google_dataflow_flex_template_job" "cloud_pubsub_to_redis" { name = "cloud-pubsub-to-redis" region = var.region parameters = { - inputSubscription = "projects/your-project-id/subscriptions/your-subscription-name" - redisHost = "your.cloud.db.redislabs.com" - redisPort = "12345" + inputSubscription = "" + redisHost = "127.0.0.1" + redisPort = "6379" redisPassword = "" # sslEnabled = "false" # redisSinkType = "STRING_SINK" # connectionTimeout = "2000" # ttl = "-1" - # javascriptTextTransformGcsPath = "gs://my-bucket/my-udfs/my_file.js" + # javascriptTextTransformGcsPath = "" # javascriptTextTransformFunctionName = "" # javascriptTextTransformReloadIntervalMinutes = "0" } diff --git a/v2/pubsub-to-redis/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToRedis.java b/v2/pubsub-to-redis/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToRedis.java index 25db3a35c1..c0e35e550c 100644 --- a/v2/pubsub-to-redis/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToRedis.java +++ b/v2/pubsub-to-redis/src/main/java/com/google/cloud/teleport/v2/templates/PubSubToRedis.java @@ -135,10 +135,8 @@ public interface PubSubToRedisOptions order = 1, groupName = "Source", description = "Pub/Sub input subscription", - helpText = - "The Pub/Sub subscription to read the input from, in the format" - + " projects//subscriptions/.", - example = "projects/your-project-id/subscriptions/your-subscription-name") + helpText = "The Pub/Sub subscription to read the input from.", + example = "projects//subscriptions/") String getInputSubscription(); void setInputSubscription(String value); @@ -171,7 +169,7 @@ public interface PubSubToRedisOptions order = 4, groupName = "Target", description = "Redis DB Password", - helpText = "The Redis database password. Defaults to empty.") + helpText = "The Redis database password. 
Defaults to `empty`.") @Default.String("") @Validation.Required String getRedisPassword(); diff --git a/v2/sourcedb-to-spanner/README_Cloud_Datastream_to_Spanner.md b/v2/sourcedb-to-spanner/README_Cloud_Datastream_to_Spanner.md index 4045eadecd..dbf26b9348 100644 --- a/v2/sourcedb-to-spanner/README_Cloud_Datastream_to_Spanner.md +++ b/v2/sourcedb-to-spanner/README_Cloud_Datastream_to_Spanner.md @@ -42,41 +42,41 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **instanceId** : The Spanner instance where the changes are replicated. -* **databaseId** : The Spanner database where the changes are replicated. +* **instanceId**: The Spanner instance where the changes are replicated. +* **databaseId**: The Spanner database where the changes are replicated. ### Optional parameters -* **inputFilePattern** : The Cloud Storage file location that contains the Datastream files to replicate. Typically, this is the root path for a stream. Support for this feature has been disabled. -* **inputFileFormat** : The format of the output file produced by Datastream. For example `avro,json`. Default, `avro`. -* **sessionFilePath** : Session file path in Cloud Storage that contains mapping information from HarbourBridge. -* **projectId** : The Spanner project ID. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **gcsPubSubSubscription** : The Pub/Sub subscription being used in a Cloud Storage notification policy. The name should be in the format of projects//subscriptions/. -* **streamName** : The name or template for the stream to poll for schema information and source type. -* **shadowTablePrefix** : The prefix used to name shadow tables. Default: `shadow_`. -* **shouldCreateShadowTables** : This flag indicates whether shadow tables must be created in Cloud Spanner database. Defaults to: true. 
-* **rfcStartDateTime** : The starting DateTime used to fetch from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. -* **fileReadConcurrency** : The number of concurrent DataStream files to read. Defaults to: 30. -* **deadLetterQueueDirectory** : The file path used when storing the error queue output. The default file path is a directory under the Dataflow job's temp location. -* **dlqRetryMinutes** : The number of minutes between dead letter queue retries. Defaults to 10. -* **dlqMaxRetryCount** : The max number of times temporary errors can be retried through DLQ. Defaults to 500. -* **dataStreamRootUrl** : Datastream API Root URL. Defaults to: https://datastream.googleapis.com/. -* **datastreamSourceType** : This is the type of source database that Datastream connects to. Example - mysql/oracle. Need to be set when testing without an actual running Datastream. -* **roundJsonDecimals** : This flag if set, rounds the decimal values in json columns to a number that can be stored without loss of precision. Defaults to: false. -* **runMode** : This is the run mode type, whether regular or with retryDLQ. Defaults to: regular. -* **transformationContextFilePath** : Transformation context file path in cloud storage used to populate data used in transformations performed during migrations Eg: The shard id to db name to identify the db from which a row was migrated. -* **directoryWatchDurationInMinutes** : The Duration for which the pipeline should keep polling a directory in GCS. Datastreamoutput files are arranged in a directory structure which depicts the timestamp of the event grouped by minutes. This parameter should be approximately equal tomaximum delay which could occur between event occurring in source database and the same event being written to GCS by Datastream. 99.9 percentile = 10 minutes. Defaults to: 10. -* **spannerPriority** : The request priority for Cloud Spanner calls. 
The value must be one of: [HIGH,MEDIUM,LOW]. Defaults to HIGH. -* **dlqGcsPubSubSubscription** : The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ retry directory when running in regular mode. The name should be in the format of projects//subscriptions/. When set, the deadLetterQueueDirectory and dlqRetryMinutes are ignored. -* **transformationJarPath** : Custom jar location in Cloud Storage that contains the custom transformation logic for processing records in forward migration. Defaults to empty. -* **transformationClassName** : Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. -* **transformationCustomParameters** : String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. -* **filteredEventsDirectory** : This is the file path to store the events filtered via custom transformation. Default is a directory under the Dataflow job's temp location. The default value is enough under most conditions. -* **shardingContextFilePath** : Sharding context file path in cloud storage is used to populate the shard id in spanner database for each source shard.It is of the format Map>. -* **tableOverrides** : These are the table name overrides from source to spanner. They are written in thefollowing format: [{SourceTableName1, SpannerTableName1}, {SourceTableName2, SpannerTableName2}]This example shows mapping Singers table to Vocalists and Albums table to Records. (Example: [{Singers, Vocalists}, {Albums, Records}]). Defaults to empty. -* **columnOverrides** : These are the column name overrides from source to spanner. They are written in thefollowing format: [{SourceTableName1.SourceColumnName1, SourceTableName1.SpannerColumnName1}, {SourceTableName2.SourceColumnName1, SourceTableName2.SpannerColumnName1}]Note that the SourceTableName should remain the same in both the source and spanner pair. 
To override table names, use tableOverrides.The example shows mapping SingerName to TalentName and AlbumName to RecordName in Singers and Albums table respectively. (Example: [{Singers.SingerName, Singers.TalentName}, {Albums.AlbumName, Albums.RecordName}]). Defaults to empty. -* **schemaOverridesFilePath** : A file which specifies the table and the column name overrides from source to spanner. Defaults to empty. +* **inputFilePattern**: The Cloud Storage file location that contains the Datastream files to replicate. Typically, this is the root path for a stream. Support for this feature has been disabled. +* **inputFileFormat**: The format of the output file produced by Datastream. For example `avro,json`. Defaults to `avro`. +* **sessionFilePath**: Session file path in Cloud Storage that contains mapping information from HarbourBridge. +* **projectId**: The Spanner project ID. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. +* **gcsPubSubSubscription**: The Pub/Sub subscription being used in a Cloud Storage notification policy. For the name, use the format `projects//subscriptions/`. +* **streamName**: The name or template for the stream to poll for schema information and source type. +* **shadowTablePrefix**: The prefix used to name shadow tables. Default: `shadow_`. +* **shouldCreateShadowTables**: This flag indicates whether shadow tables must be created in Cloud Spanner database. Defaults to: true. +* **rfcStartDateTime**: The starting DateTime used to fetch from Cloud Storage (https://tools.ietf.org/html/rfc3339). Defaults to: 1970-01-01T00:00:00.00Z. +* **fileReadConcurrency**: The number of concurrent DataStream files to read. Defaults to: 30. +* **deadLetterQueueDirectory**: The file path used when storing the error queue output. The default file path is a directory under the Dataflow job's temp location. 
+* **dlqRetryMinutes**: The number of minutes between dead letter queue retries. Defaults to `10`. +* **dlqMaxRetryCount**: The max number of times temporary errors can be retried through DLQ. Defaults to `500`. +* **dataStreamRootUrl**: Datastream API Root URL. Defaults to: https://datastream.googleapis.com/. +* **datastreamSourceType**: This is the type of source database that Datastream connects to. Example - mysql/oracle. Needs to be set when testing without an actual running Datastream. +* **roundJsonDecimals**: This flag if set, rounds the decimal values in json columns to a number that can be stored without loss of precision. Defaults to: false. +* **runMode**: This is the run mode type, whether regular or with retryDLQ. Defaults to: regular. +* **transformationContextFilePath**: Transformation context file path in cloud storage used to populate data used in transformations performed during migrations. Eg: The shard id to db name to identify the db from which a row was migrated. +* **directoryWatchDurationInMinutes**: The Duration for which the pipeline should keep polling a directory in GCS. Datastream output files are arranged in a directory structure which depicts the timestamp of the event grouped by minutes. This parameter should be approximately equal to the maximum delay which could occur between event occurring in source database and the same event being written to GCS by Datastream. 99.9 percentile = 10 minutes. Defaults to: 10. +* **spannerPriority**: The request priority for Cloud Spanner calls. The value must be one of: [`HIGH`,`MEDIUM`,`LOW`]. Defaults to `HIGH`. +* **dlqGcsPubSubSubscription**: The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ retry directory when running in regular mode. For the name, use the format `projects//subscriptions/`. When set, the deadLetterQueueDirectory and dlqRetryMinutes are ignored. 
+* **transformationJarPath**: Custom JAR file location in Cloud Storage for the file that contains the custom transformation logic for processing records in forward migration. Defaults to empty. +* **transformationClassName**: Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. +* **transformationCustomParameters**: String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. +* **filteredEventsDirectory**: This is the file path to store the events filtered via custom transformation. Default is a directory under the Dataflow job's temp location. The default value is enough under most conditions. +* **shardingContextFilePath**: Sharding context file path in cloud storage is used to populate the shard id in spanner database for each source shard. It is of the format Map>. +* **tableOverrides**: These are the table name overrides from source to spanner. They are written in the following format: [{SourceTableName1, SpannerTableName1}, {SourceTableName2, SpannerTableName2}]. This example shows mapping Singers table to Vocalists and Albums table to Records. For example, `[{Singers, Vocalists}, {Albums, Records}]`. Defaults to empty. +* **columnOverrides**: These are the column name overrides from source to spanner. They are written in the following format: [{SourceTableName1.SourceColumnName1, SourceTableName1.SpannerColumnName1}, {SourceTableName2.SourceColumnName1, SourceTableName2.SpannerColumnName1}]. Note that the SourceTableName should remain the same in both the source and spanner pair. To override table names, use tableOverrides. The example shows mapping SingerName to TalentName and AlbumName to RecordName in Singers and Albums table respectively. For example, `[{Singers.SingerName, Singers.TalentName}, {Albums.AlbumName, Albums.RecordName}]`. Defaults to empty. 
+* **schemaOverridesFilePath**: A file which specifies the table and the column name overrides from source to spanner. Defaults to empty. @@ -360,8 +360,8 @@ resource "google_dataflow_flex_template_job" "cloud_datastream_to_spanner" { # transformationCustomParameters = "" # filteredEventsDirectory = "" # shardingContextFilePath = "" - # tableOverrides = "[{Singers, Vocalists}, {Albums, Records}]" - # columnOverrides = "[{Singers.SingerName, Singers.TalentName}, {Albums.AlbumName, Albums.RecordName}]" + # tableOverrides = "" + # columnOverrides = "" # schemaOverridesFilePath = "" } } diff --git a/v2/sourcedb-to-spanner/README_Sourcedb_to_Spanner_Flex.md b/v2/sourcedb-to-spanner/README_Sourcedb_to_Spanner_Flex.md index b3d786e330..0a539a5c60 100644 --- a/v2/sourcedb-to-spanner/README_Sourcedb_to_Spanner_Flex.md +++ b/v2/sourcedb-to-spanner/README_Sourcedb_to_Spanner_Flex.md @@ -27,30 +27,31 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **sourceConfigURL** : The JDBC connection URL string. For example, `jdbc:mysql://127.4.5.30:3306/my-db?autoReconnect=true&maxReconnects=10&unicode=true&characterEncoding=UTF-8` or the shard config. -* **instanceId** : The destination Cloud Spanner instance. -* **databaseId** : The destination Cloud Spanner database. -* **projectId** : This is the name of the Cloud Spanner project. -* **outputDirectory** : This directory is used to dump the failed/skipped/filtered records in a migration. +* **sourceConfigURL**: The JDBC connection URL string. For example, `jdbc:mysql://127.4.5.30:3306/my-db?autoReconnect=true&maxReconnects=10&unicode=true&characterEncoding=UTF-8` or the shard config. +* **instanceId**: The destination Cloud Spanner instance. +* **databaseId**: The destination Cloud Spanner database. +* **projectId**: This is the name of the Cloud Spanner project. +* **outputDirectory**: This directory is used to dump the failed/skipped/filtered records in a migration. 
### Optional parameters -* **sourceDbDialect** : Possible values are `MYSQL` and `POSTGRESQL`. Defaults to: MYSQL. -* **jdbcDriverJars** : The comma-separated list of driver JAR files. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). Defaults to empty. -* **jdbcDriverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). Defaults to: com.mysql.jdbc.Driver. -* **username** : The username to be used for the JDBC connection. Defaults to empty. -* **password** : The password to be used for the JDBC connection. Defaults to empty. -* **tables** : Tables to migrate from source. Defaults to empty. -* **numPartitions** : The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. Defaults to: 0. -* **spannerHost** : The Cloud Spanner endpoint to call in the template. (Example: https://batch-spanner.googleapis.com). Defaults to: https://batch-spanner.googleapis.com. -* **maxConnections** : Configures the JDBC connection pool on each worker with maximum number of connections. Use a negative number for no limit. (Example: -1). Defaults to: 0. -* **sessionFilePath** : Session file path in Cloud Storage that contains mapping information from Spanner Migration Tool. Defaults to empty. -* **transformationJarPath** : Custom jar location in Cloud Storage that contains the custom transformation logic for processing records. Defaults to empty. -* **transformationClassName** : Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. -* **transformationCustomParameters** : String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. -* **namespace** : Namespace to exported. 
For PostgreSQL, if no namespace is provided, 'public' will be used. Defaults to empty. -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). +* **sourceDbDialect**: Possible values are `MYSQL` and `POSTGRESQL`. Defaults to: MYSQL. +* **jdbcDriverJars**: The comma-separated list of driver JAR files. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. Defaults to empty. +* **jdbcDriverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. Defaults to: com.mysql.jdbc.Driver. +* **username**: The username to be used for the JDBC connection. Defaults to empty. +* **password**: The password to be used for the JDBC connection. Defaults to empty. +* **tables**: Tables to migrate from source. Defaults to empty. +* **numPartitions**: The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. Defaults to: 0. +* **fetchSize**: The number of rows to fetch per page read for JDBC source. If not set, the default of JdbcIO of 50_000 rows gets used. If source dialect is Mysql, please see the note below. This ultimately translated to Statement.setFetchSize call at Jdbc layer. 
It should ONLY be used if the default value throws memory errors. Note for MySql Source: FetchSize is ignored by the Mysql connector unless `useCursorFetch=true` is also part of the connection properties. If the fetchSize parameter is explicitly set for the MySql dialect, the pipeline will add `useCursorFetch=true` to the connection properties by default. +* **spannerHost**: The Cloud Spanner endpoint to call in the template. For example, `https://batch-spanner.googleapis.com`. Defaults to: https://batch-spanner.googleapis.com. +* **maxConnections**: Configures the JDBC connection pool on each worker with maximum number of connections. Use a negative number for no limit. For example, `-1`. Defaults to: 0. +* **sessionFilePath**: Session file path in Cloud Storage that contains mapping information from Spanner Migration Tool. Defaults to empty. +* **transformationJarPath**: Custom jar location in Cloud Storage that contains the custom transformation logic for processing records. Defaults to empty. +* **transformationClassName**: Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. +* **transformationCustomParameters**: String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. +* **namespace**: Namespace to be exported. For PostgreSQL, if no namespace is provided, 'public' will be used. Defaults to empty. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. 
For example, `gs:///file.txt,projects//secrets//versions/`. @@ -143,6 +144,7 @@ export USERNAME="" export PASSWORD="" export TABLES="" export NUM_PARTITIONS=0 +export FETCH_SIZE= export SPANNER_HOST=https://batch-spanner.googleapis.com export MAX_CONNECTIONS=0 export SESSION_FILE_PATH="" @@ -165,6 +167,7 @@ gcloud dataflow flex-template run "sourcedb-to-spanner-flex-job" \ --parameters "password=$PASSWORD" \ --parameters "tables=$TABLES" \ --parameters "numPartitions=$NUM_PARTITIONS" \ + --parameters "fetchSize=$FETCH_SIZE" \ --parameters "instanceId=$INSTANCE_ID" \ --parameters "databaseId=$DATABASE_ID" \ --parameters "projectId=$PROJECT_ID" \ @@ -210,6 +213,7 @@ export USERNAME="" export PASSWORD="" export TABLES="" export NUM_PARTITIONS=0 +export FETCH_SIZE= export SPANNER_HOST=https://batch-spanner.googleapis.com export MAX_CONNECTIONS=0 export SESSION_FILE_PATH="" @@ -227,7 +231,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="sourcedb-to-spanner-flex-job" \ -DtemplateName="Sourcedb_to_Spanner_Flex" \ --Dparameters="sourceDbDialect=$SOURCE_DB_DIALECT,jdbcDriverJars=$JDBC_DRIVER_JARS,jdbcDriverClassName=$JDBC_DRIVER_CLASS_NAME,sourceConfigURL=$SOURCE_CONFIG_URL,username=$USERNAME,password=$PASSWORD,tables=$TABLES,numPartitions=$NUM_PARTITIONS,instanceId=$INSTANCE_ID,databaseId=$DATABASE_ID,projectId=$PROJECT_ID,spannerHost=$SPANNER_HOST,maxConnections=$MAX_CONNECTIONS,sessionFilePath=$SESSION_FILE_PATH,outputDirectory=$OUTPUT_DIRECTORY,transformationJarPath=$TRANSFORMATION_JAR_PATH,transformationClassName=$TRANSFORMATION_CLASS_NAME,transformationCustomParameters=$TRANSFORMATION_CUSTOM_PARAMETERS,namespace=$NAMESPACE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ 
+-Dparameters="sourceDbDialect=$SOURCE_DB_DIALECT,jdbcDriverJars=$JDBC_DRIVER_JARS,jdbcDriverClassName=$JDBC_DRIVER_CLASS_NAME,sourceConfigURL=$SOURCE_CONFIG_URL,username=$USERNAME,password=$PASSWORD,tables=$TABLES,numPartitions=$NUM_PARTITIONS,fetchSize=$FETCH_SIZE,instanceId=$INSTANCE_ID,databaseId=$DATABASE_ID,projectId=$PROJECT_ID,spannerHost=$SPANNER_HOST,maxConnections=$MAX_CONNECTIONS,sessionFilePath=$SESSION_FILE_PATH,outputDirectory=$OUTPUT_DIRECTORY,transformationJarPath=$TRANSFORMATION_JAR_PATH,transformationClassName=$TRANSFORMATION_CLASS_NAME,transformationCustomParameters=$TRANSFORMATION_CUSTOM_PARAMETERS,namespace=$NAMESPACE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ -f v2/sourcedb-to-spanner ``` @@ -278,21 +282,22 @@ resource "google_dataflow_flex_template_job" "sourcedb_to_spanner_flex" { projectId = "" outputDirectory = "" # sourceDbDialect = "MYSQL" - # jdbcDriverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" + # jdbcDriverJars = "" # jdbcDriverClassName = "com.mysql.jdbc.Driver" # username = "" # password = "" # tables = "" # numPartitions = "0" + # fetchSize = "" # spannerHost = "https://batch-spanner.googleapis.com" - # maxConnections = "-1" + # maxConnections = "0" # sessionFilePath = "" # transformationJarPath = "" # transformationClassName = "" # transformationCustomParameters = "" # namespace = "" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/sourcedb-to-spanner/pom.xml b/v2/sourcedb-to-spanner/pom.xml index ce040f6115..52b002a0a2 100644 --- a/v2/sourcedb-to-spanner/pom.xml +++ b/v2/sourcedb-to-spanner/pom.xml @@ -142,5 +142,15 @@ 5.0.0 test + + org.apache.beam + beam-sdks-java-io-cassandra + + + org.apache.commons + commons-collections4 + 4.1 + compile + diff --git 
a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/CassandraMappings.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/CassandraMappings.java new file mode 100644 index 0000000000..7385d04c3a --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/CassandraMappings.java @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.mappings; + +import com.google.auto.value.AutoValue; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraFieldMapper; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraRowValueExtractor; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraRowValueMapper; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.UnifiedTypeMapping; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.UnifiedMappingProvider; +import com.google.common.collect.ImmutableMap; + +/** Represent Unified type mapping, value extractor and value mappings for Cassandra. 
*/ +@AutoValue +public abstract class CassandraMappings { + public abstract ImmutableMap typeMapping(); + + public abstract ImmutableMap> fieldMapping(); + + public static Builder builder() { + return new AutoValue_CassandraMappings.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + abstract ImmutableMap.Builder typeMappingBuilder(); + + abstract ImmutableMap.Builder> fieldMappingBuilder(); + + public Builder put( + String cassandraType, + UnifiedMappingProvider.Type type, + CassandraRowValueExtractor rowValueExtractor, + CassandraRowValueMapper rowValueMapper) { + this.typeMappingBuilder() + .put(cassandraType.toUpperCase(), UnifiedMappingProvider.getMapping(type)); + this.fieldMappingBuilder() + .put( + cassandraType.toUpperCase(), + CassandraFieldMapper.create(rowValueExtractor, rowValueMapper)); + return this; + } + + public abstract CassandraMappings build(); + } +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/CassandraMappingsProvider.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/CassandraMappingsProvider.java new file mode 100644 index 0000000000..dd89bf16bf --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/CassandraMappingsProvider.java @@ -0,0 +1,135 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.mappings; + +import com.datastax.driver.core.Duration; +import com.datastax.driver.core.LocalDate; +import com.datastax.driver.core.Row; +import com.datastax.driver.core.TypeCodec; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraFieldMapper; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraRowValueExtractor; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraRowValueMapper; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.UnifiedTypeMapping; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.CustomSchema.IntervalNano; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.UnifiedMappingProvider; +import com.google.common.collect.ImmutableMap; +import java.nio.ByteBuffer; +import java.util.Date; +import org.apache.avro.LogicalTypes; +import org.apache.avro.generic.GenericRecordBuilder; +import org.apache.commons.codec.binary.Hex; + +public class CassandraMappingsProvider { + /** Pass the value as is to avro. */ + private static final CassandraRowValueMapper valuePassThrough = (value, schema) -> value; + + /** Pass the value as a string to avro. */ + private static final CassandraRowValueMapper toString = (value, schema) -> value.toString(); + + /** Pass the value as an integer to avro. */ + private static final CassandraRowValueMapper toInt = (value, schema) -> value.intValue(); + + /** Map {@link ByteBuffer} to a Hex encoded String. */ + private static final CassandraRowValueMapper ByteBufferToHexString = + (value, schema) -> new String(Hex.encodeHex(value.array())); + + /** + * Map {@link LocalDate} to {@link LogicalTypes.Date}. 
Cassandra Date type encodes number of days + * since epoch, without any time or time zone component. + * + *

See: types + * for additional information on date type. + */ + private static final CassandraRowValueMapper localDateToAvroLogicalDate = + (value, schema) -> value.getDaysSinceEpoch(); + + private static final CassandraRowValueExtractor getDuration = + (row, name) -> row.get(name, TypeCodec.duration()); + + private static final CassandraRowValueMapper durationToAvro = + (value, schema) -> + new GenericRecordBuilder(IntervalNano.SCHEMA) + .set(IntervalNano.MONTHS_FIELD_NAME, value.getMonths()) + .set(IntervalNano.DAYS_FIELD_NAME, value.getDays()) + .set(IntervalNano.NANOS_FIELD_NAME, value.getNanoseconds()) + .build(); + + /** + * Cassandra represents `Time` field as 64-bit signed integer representing number of nanoseconds + * since midnight. See types documentation + * for further details. + */ + private static final CassandraRowValueMapper cassandraTimeToIntervalNano = + (value, schema) -> + new GenericRecordBuilder(IntervalNano.SCHEMA) + .set(IntervalNano.NANOS_FIELD_NAME, value) + .build(); + + private static final CassandraRowValueMapper dateToAvro = + (value, schema) -> value.getTime() * 1000L; + + private static final CassandraMappings CASSANDRA_MAPPINGS = + CassandraMappings.builder() + .put("ASCII", UnifiedMappingProvider.Type.STRING, Row::getString, valuePassThrough) + .put("BIGINT", UnifiedMappingProvider.Type.LONG, Row::getLong, valuePassThrough) + .put("BLOB", UnifiedMappingProvider.Type.STRING, Row::getBytes, ByteBufferToHexString) + .put("BOOLEAN", UnifiedMappingProvider.Type.BOOLEAN, Row::getBool, valuePassThrough) + .put("COUNTER", UnifiedMappingProvider.Type.LONG, Row::getLong, valuePassThrough) + .put("DATE", UnifiedMappingProvider.Type.DATE, Row::getDate, localDateToAvroLogicalDate) + // The Cassandra decimal does not have precision and scale fixed in the + // schema which would be needed if we want to map it to Avro Decimal.
+ .put("DECIMAL", UnifiedMappingProvider.Type.STRING, Row::getDecimal, toString) + .put("DOUBLE", UnifiedMappingProvider.Type.DOUBLE, Row::getDouble, valuePassThrough) + .put("DURATION", UnifiedMappingProvider.Type.INTERVAL_NANO, getDuration, durationToAvro) + .put("FLOAT", UnifiedMappingProvider.Type.FLOAT, Row::getFloat, valuePassThrough) + .put("INET", UnifiedMappingProvider.Type.STRING, Row::getInet, toString) + .put("INT", UnifiedMappingProvider.Type.INTEGER, Row::getInt, valuePassThrough) + .put("SMALLINT", UnifiedMappingProvider.Type.INTEGER, Row::getShort, toInt) + .put("TEXT", UnifiedMappingProvider.Type.STRING, Row::getString, valuePassThrough) + .put( + "TIME", + UnifiedMappingProvider.Type.INTERVAL_NANO, + Row::getTime, + cassandraTimeToIntervalNano) + .put("TIMESTAMP", UnifiedMappingProvider.Type.TIMESTAMP, Row::getTimestamp, dateToAvro) + .put("TIMEUUID", UnifiedMappingProvider.Type.STRING, Row::getUUID, toString) + .put("TINYINT", UnifiedMappingProvider.Type.INTEGER, Row::getByte, toInt) + .put("UUID", UnifiedMappingProvider.Type.STRING, Row::getUUID, toString) + .put("VARCHAR", UnifiedMappingProvider.Type.STRING, Row::getString, valuePassThrough) + .put("VARINT", UnifiedMappingProvider.Type.NUMBER, Row::getVarint, toString) + .put( + "UNSUPPORTED", + UnifiedMappingProvider.Type.UNSUPPORTED, + (row, name) -> null, + (value, schema) -> null) + .build(); + + private CassandraMappingsProvider() {} + + /** Mappings for unified type interface. */ + public static ImmutableMap getMapping() { + return CASSANDRA_MAPPINGS.typeMapping(); + } + + /** + * Field Mappers for {@link + * com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper.CassandraSourceRowMapper}. 
+ */ + public static ImmutableMap> getFieldMapping() { + return CASSANDRA_MAPPINGS.fieldMapping(); + } +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/package-info.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/package-info.java new file mode 100644 index 0000000000..a690b8a56d --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/mappings/package-info.java @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2024 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +/** Schema and Value mapping for Cassandra. */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.mappings; diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraFieldMapper.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraFieldMapper.java new file mode 100644 index 0000000000..c8fb0cd393 --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraFieldMapper.java @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; + +import com.datastax.driver.core.Row; +import com.google.auto.value.AutoValue; +import java.io.Serializable; +import org.apache.avro.Schema; + +@AutoValue +public abstract class CassandraFieldMapper implements Serializable { + + public static CassandraFieldMapper create( + CassandraRowValueExtractor rowValueExtractor, CassandraRowValueMapper rowValueMapper) { + return new AutoValue_CassandraFieldMapper(rowValueExtractor, rowValueMapper); + } + + public Object mapValue(Row row, String fieldName, Schema fieldSchema) { + T extractedValue = rowValueExtractor().extract(row, fieldName); + if (extractedValue == null) { + return null; + } + Object avroValue = rowValueMapper().map(extractedValue, fieldSchema); + return avroValue; + } + + abstract CassandraRowValueExtractor rowValueExtractor(); + + abstract CassandraRowValueMapper rowValueMapper(); +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowMapper.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowMapper.java new file mode 100644 index 0000000000..7097fa9295 --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowMapper.java @@ -0,0 +1,85 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in 
compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; + +import com.datastax.driver.core.Row; +import com.google.auto.value.AutoValue; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.mappings.CassandraMappingsProvider; +import com.google.cloud.teleport.v2.source.reader.io.row.SourceRow; +import com.google.cloud.teleport.v2.source.reader.io.schema.SourceSchemaReference; +import com.google.cloud.teleport.v2.source.reader.io.schema.SourceTableSchema; +import com.google.common.collect.ImmutableMap; +import java.io.Serializable; +import java.time.Instant; +import java.util.concurrent.TimeUnit; +import org.apache.avro.Schema; +import org.apache.commons.collections4.Transformer; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; + +@AutoValue +abstract class CassandraRowMapper implements Transformer, Serializable { + public static final ImmutableMap> MAPPINGS = + CassandraMappingsProvider.getFieldMapping(); + + public static CassandraRowMapper create( + SourceSchemaReference sourceSchemaReference, SourceTableSchema sourceTableSchema) { + return new AutoValue_CassandraRowMapper(sourceSchemaReference, sourceTableSchema); + } + + abstract SourceSchemaReference sourceSchemaReference(); + + abstract SourceTableSchema sourceTableSchema(); + + long getCurrentTimeMicros() { + Instant now = Instant.now(); + long nanos = 
TimeUnit.SECONDS.toNanos(now.getEpochSecond()) + now.getNano(); + return TimeUnit.NANOSECONDS.toMicros(nanos); + } + + public @UnknownKeyFor @NonNull @Initialized SourceRow map( + @UnknownKeyFor @NonNull @Initialized Row row) { + /* Todo Decide if any of the element time like max time or min time is needed here. */ + long time = getCurrentTimeMicros(); + + SourceRow.Builder sourceRowBuilder = + SourceRow.builder(sourceSchemaReference(), sourceTableSchema(), "", time); + + sourceTableSchema() + .sourceColumnNameToSourceColumnType() + .forEach( + (key, value) -> { + Schema schema = sourceTableSchema().getAvroPayload().getField(key).schema(); + // The Unified avro mapping produces a union of the mapped type with null type + // except for "Unsupported" case. + if (schema.isUnion()) { + schema = schema.getTypes().get(1); + } + sourceRowBuilder.setField( + key, + MAPPINGS + .getOrDefault(value.getName().toUpperCase(), MAPPINGS.get("UNSUPPORTED")) + .mapValue(row, key, schema)); + }); + return sourceRowBuilder.build(); + } + + @Override + public SourceRow transform(Row row) { + return map(row); + } +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowValueExtractor.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowValueExtractor.java new file mode 100644 index 0000000000..135f79c429 --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowValueExtractor.java @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; + +import com.datastax.driver.core.Row; +import com.datastax.oss.driver.api.core.cql.ResultSet; +import java.io.Serializable; +import javax.annotation.Nullable; + +public interface CassandraRowValueExtractor extends Serializable { + + /** + * Extract the requested field from the result set. + * + * @param row row derived from {@link ResultSet}. + * @param fieldName name of the field to extract. + * @return extracted value. + * @throws IllegalArgumentException - thrown from Cassandra driver for invalid names. + */ + @Nullable + T extract(Row row, String fieldName) throws IllegalArgumentException; +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowValueMapper.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowValueMapper.java new file mode 100644 index 0000000000..9114128e9f --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraRowValueMapper.java @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; + +import java.io.Serializable; +import org.apache.avro.Schema; +import org.apache.avro.Schema.Field; +import org.checkerframework.checker.nullness.qual.NonNull; + +public interface CassandraRowValueMapper extends Serializable { + + /** + * Map the extracted value to an object accepted by {@link + * org.apache.avro.generic.GenericRecordBuilder#set(Field, Object)} as per the schema of the + * field. + * + * @param value extracted value. + * @param schema Avro Schema. + * @return mapped object. + */ + Object map(@NonNull T value, Schema schema); +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraSourceRowMapper.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraSourceRowMapper.java new file mode 100644 index 0000000000..a4e00663bd --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraSourceRowMapper.java @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; + +import com.datastax.driver.core.ResultSet; +import com.google.auto.value.AutoValue; +import com.google.cloud.teleport.v2.source.reader.io.row.SourceRow; +import com.google.cloud.teleport.v2.source.reader.io.schema.SourceSchemaReference; +import com.google.cloud.teleport.v2.source.reader.io.schema.SourceTableSchema; +import java.io.Serializable; +import java.util.Iterator; +import java.util.concurrent.Future; +import org.apache.beam.sdk.io.cassandra.Mapper; +import org.apache.commons.collections4.iterators.TransformIterator; +import org.checkerframework.checker.initialization.qual.Initialized; +import org.checkerframework.checker.nullness.qual.NonNull; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.checkerframework.checker.nullness.qual.UnknownKeyFor; + +@AutoValue +public abstract class CassandraSourceRowMapper implements Mapper, Serializable { + abstract SourceSchemaReference sourceSchemaReference(); + + abstract SourceTableSchema sourceTableSchema(); + + @Override + public @UnknownKeyFor @NonNull @Initialized Iterator map( + @UnknownKeyFor @NonNull @Initialized ResultSet resultSet) { + var ret = new TransformIterator(); + ret.setIterator(resultSet.iterator()); + ret.setTransformer(CassandraRowMapper.create(sourceSchemaReference(), sourceTableSchema())); + return ret; + } + + @Override + public @UnknownKeyFor @NonNull @Initialized Future<@UnknownKeyFor @Nullable @Initialized Void> + deleteAsync(SourceRow entity) { + throw new 
UnsupportedOperationException("Only Read from Cassandra is supported"); + } + + @Override + public @UnknownKeyFor @NonNull @Initialized Future<@UnknownKeyFor @Nullable @Initialized Void> + saveAsync(SourceRow entity) { + throw new UnsupportedOperationException("Only Read from Cassandra is supported"); + } + + public static Builder builder() { + return new AutoValue_CassandraSourceRowMapper.Builder(); + } + + @AutoValue.Builder + public abstract static class Builder { + + public abstract Builder setSourceSchemaReference(SourceSchemaReference value); + + public abstract Builder setSourceTableSchema(SourceTableSchema value); + + public abstract CassandraSourceRowMapper build(); + } +} diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/package-info.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/package-info.java new file mode 100644 index 0000000000..de36cd6fb4 --- /dev/null +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/package-info.java @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2024 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +/** Row mapper for mapping Cassandra Rows to Avro Generic Record. 
*/ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscovery.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscovery.java index 445beda528..5abbfd96a4 100644 --- a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscovery.java +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscovery.java @@ -113,7 +113,7 @@ public ImmutableMap> discoverTabl DataSource dataSource, SourceSchemaReference schemaReference, ImmutableList tables) throws SchemaDiscoveryException, RetriableSchemaDiscoveryException { Log.info( - "CassandraSchemaDiscovery discoverTableSchema started dataSource = {}, sourceSchemaReference = {}, talbes = {}", + "CassandraSchemaDiscovery discoverTableSchema started dataSource = {}, sourceSchemaReference = {}, tables = {}", dataSource, schemaReference, tables); @@ -123,7 +123,7 @@ public ImmutableMap> discoverTabl ImmutableMap> schema = this.discoverTableSchema(dataSource.cassandra(), schemaReference.cassandra(), tables); Log.info( - "CassandraSchemaDiscovery discoverTableSchema completed dataSource = {}, sourceSchemaReference = {}, talbes = {}, schema = {}", + "CassandraSchemaDiscovery discoverTableSchema completed dataSource = {}, sourceSchemaReference = {}, tables = {}, schema = {}", dataSource, schemaReference, tables, @@ -147,7 +147,7 @@ private ImmutableMap> discoverTab return builder.build(); } catch (DriverException e) { Log.error( - "CassandraSchemaDiscovery discoverTableSchema dataSource = {}, sourceSchemaReference = {}, talbes = {}", + "CassandraSchemaDiscovery discoverTableSchema dataSource = {}, sourceSchemaReference = {}, tables = {}", dataSource, schemaReference, tables, diff --git 
a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/UnifiedTypeMapper.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/UnifiedTypeMapper.java index 3a887ba9f0..b346ce86fa 100644 --- a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/UnifiedTypeMapper.java +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/UnifiedTypeMapper.java @@ -15,6 +15,7 @@ */ package com.google.cloud.teleport.v2.source.reader.io.schema.typemapping; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.mappings.CassandraMappingsProvider; import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.MysqlMappingProvider; import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.PostgreSQLMappingProvider; import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.Unsupported; @@ -33,13 +34,16 @@ public final class UnifiedTypeMapper { /** - * A static map of the type mappings for all source database types constructed at class load time. - * TODO(vardhanvthigle): Support other mappings beyond Mysql. + * A static map of the type mappings for all source database types constructed at class load time.
*/ private static final ImmutableMap> mappers = ImmutableMap.of( - MapperType.MYSQL, MysqlMappingProvider.getMapping(), - MapperType.POSTGRESQL, PostgreSQLMappingProvider.getMapping()); + MapperType.MYSQL, + MysqlMappingProvider.getMapping(), + MapperType.POSTGRESQL, + PostgreSQLMappingProvider.getMapping(), + MapperType.CASSANDRA, + CassandraMappingsProvider.getMapping()); private final MapperType mapperType; @@ -96,6 +100,7 @@ public enum MapperType { MYSQL, POSTGRESQL, ORACLE, - SQLSERVER + SQLSERVER, + CASSANDRA } } diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/CustomSchema.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/CustomSchema.java index 846e433017..a97031984e 100644 --- a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/CustomSchema.java +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/CustomSchema.java @@ -70,6 +70,62 @@ public static final class Interval { private Interval() {} } + /** Schema to represent Interval from years up to nanoseconds precision. */ + public static final class IntervalNano { + public static final String RECORD_NAME = "intervalNano"; + + /** Years in the duration. * */ + public static final String YEARS_FIELD_NAME = "years"; + + /** Months in the duration. * */ + public static final String MONTHS_FIELD_NAME = "months"; + + /** Days in the duration. * */ + public static final String DAYS_FIELD_NAME = "days"; + + /** Hours in the duration. * */ + public static final String HOURS_FIELD_NAME = "hours"; + + /** Minutes in the duration. * */ + public static final String MINUTES_FIELD_NAME = "minutes"; + + /** Seconds in the duration.
* */ + public static final String SECONDS_FIELD_NAME = "seconds"; + + /** Nano Seconds in the duration. * */ + public static final String NANOS_FIELD_NAME = "nanos"; + + public static final Schema SCHEMA = + SchemaBuilder.builder() + .record(RECORD_NAME) + .fields() + .name(YEARS_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name(MONTHS_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name(DAYS_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name(HOURS_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name(MINUTES_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name(SECONDS_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name(NANOS_FIELD_NAME) + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .endRecord(); + + /** Static final class wrapping only constants. * */ + private IntervalNano() {} + } + public static final class TimeStampTz { public static final String RECORD_NAME = "timestampTz"; public static final String TIMESTAMP_FIELD_NAME = "timestamp"; diff --git a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/UnifiedMappingProvider.java b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/UnifiedMappingProvider.java index 8b712048dc..d86c631b3f 100644 --- a/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/UnifiedMappingProvider.java +++ b/v2/sourcedb-to-spanner/src/main/java/com/google/cloud/teleport/v2/source/reader/io/schema/typemapping/provider/unified/UnifiedMappingProvider.java @@ -21,6 +21,7 @@ import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.CustomLogical.TimeIntervalMicros; import 
com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.CustomSchema.DateTime; import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.CustomSchema.Interval; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.provider.unified.CustomSchema.IntervalNano; import com.google.common.collect.ImmutableMap; import java.util.Map; import org.apache.avro.LogicalTypes; @@ -62,6 +63,7 @@ public enum Type { TIME_WITH_TIME_ZONE, VARCHAR, UNSUPPORTED, + INTERVAL_NANO, } // Implementation Detail, ImmutableMap.of(...) supports only upto 10 arguments. @@ -93,6 +95,7 @@ public enum Type { .addToSchema(SchemaBuilder.builder().longType())) .put(Type.TIMESTAMP_WITH_TIME_ZONE, CustomSchema.TimeStampTz.SCHEMA) .put(Type.TIME_WITH_TIME_ZONE, CustomSchema.TimeTz.SCHEMA) + .put(Type.INTERVAL_NANO, IntervalNano.SCHEMA) .build() .entrySet() .stream() diff --git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraSourceRowMapperTest.java b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraSourceRowMapperTest.java new file mode 100644 index 0000000000..9f457d2272 --- /dev/null +++ b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/rowmapper/CassandraSourceRowMapperTest.java @@ -0,0 +1,189 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.source.reader.io.cassandra.rowmapper; + +import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.PRIMITIVE_TYPES_TABLE; +import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.PRIMITIVE_TYPES_TABLE_AVRO_ROWS; +import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_CONFIG; +import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_CQLSH; +import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_KEYSPACE; +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; +import static org.mockito.Mockito.when; + +import com.datastax.driver.core.Cluster; +import com.datastax.driver.core.ResultSet; +import com.datastax.driver.core.Row; +import com.datastax.driver.core.policies.DCAwareRoundRobinPolicy; +import com.datastax.oss.driver.api.core.cql.SimpleStatement; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.iowrapper.CassandraConnector; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.iowrapper.CassandraDataSource; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.schema.CassandraSchemaDiscovery; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.schema.CassandraSchemaReference; +import com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.SharedEmbeddedCassandra; +import com.google.cloud.teleport.v2.source.reader.io.datasource.DataSource; +import com.google.cloud.teleport.v2.source.reader.io.exception.RetriableSchemaDiscoveryException; +import com.google.cloud.teleport.v2.source.reader.io.row.SourceRow; +import com.google.cloud.teleport.v2.source.reader.io.schema.SourceSchemaReference; +import 
com.google.cloud.teleport.v2.source.reader.io.schema.SourceTableSchema; +import com.google.cloud.teleport.v2.source.reader.io.schema.typemapping.UnifiedTypeMapper.MapperType; +import com.google.cloud.teleport.v2.spanner.migrations.schema.SourceColumnType; +import com.google.common.collect.ImmutableList; +import java.io.IOException; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.Mockito; +import org.mockito.junit.MockitoJUnitRunner; + +/** Test class for {@link CassandraSourceRowMapper}. */ +@RunWith(MockitoJUnitRunner.class) +public class CassandraSourceRowMapperTest { + + private static SharedEmbeddedCassandra sharedEmbeddedCassandra = null; + + @BeforeClass + public static void startEmbeddedCassandra() throws IOException { + if (sharedEmbeddedCassandra == null) { + sharedEmbeddedCassandra = new SharedEmbeddedCassandra(TEST_CONFIG, TEST_CQLSH); + } + } + + @AfterClass + public static void stopEmbeddedCassandra() throws Exception { + if (sharedEmbeddedCassandra != null) { + sharedEmbeddedCassandra.close(); + sharedEmbeddedCassandra = null; + } + } + + @Test + public void testCassandraSourceRowMapperBasic() throws RetriableSchemaDiscoveryException { + + SourceSchemaReference sourceSchemaReference = + SourceSchemaReference.ofCassandra( + CassandraSchemaReference.builder().setKeyspaceName(TEST_KEYSPACE).build()); + + DataSource dataSource = + DataSource.ofCassandra( + CassandraDataSource.builder() + .setClusterName(sharedEmbeddedCassandra.getInstance().getClusterName()) + .setContactPoints(sharedEmbeddedCassandra.getInstance().getContactPoints()) + .setLocalDataCenter(sharedEmbeddedCassandra.getInstance().getLocalDataCenter()) + .build()); + + SourceTableSchema.Builder sourceTableSchemaBuilder = + SourceTableSchema.builder(MapperType.CASSANDRA).setTableName(PRIMITIVE_TYPES_TABLE); + new CassandraSchemaDiscovery() + .discoverTableSchema( + dataSource, sourceSchemaReference, 
ImmutableList.of(PRIMITIVE_TYPES_TABLE)) + .get(PRIMITIVE_TYPES_TABLE) + .forEach(sourceTableSchemaBuilder::addSourceColumnNameToSourceColumnType); + + CassandraSourceRowMapper cassandraSourceRowMapper = + CassandraSourceRowMapper.builder() + .setSourceSchemaReference(sourceSchemaReference) + .setSourceTableSchema(sourceTableSchemaBuilder.build()) + .build(); + + ResultSet resultSet; + String query = "SELECT * FROM " + PRIMITIVE_TYPES_TABLE; + com.datastax.oss.driver.api.core.cql.SimpleStatement statement = + SimpleStatement.newInstance(query); + Cluster cluster = + Cluster.builder() + .addContactPointsWithPorts(dataSource.cassandra().contactPoints()) + .withClusterName(dataSource.cassandra().clusterName()) + .withoutJMXReporting() + .withLoadBalancingPolicy( + new DCAwareRoundRobinPolicy.Builder() + .withLocalDc(dataSource.cassandra().localDataCenter()) + .build()) + .build(); + try (CassandraConnector cassandraConnectorWithSchemaReference = + new CassandraConnector(dataSource.cassandra(), sourceSchemaReference.cassandra())) { + resultSet = cluster.connect(TEST_KEYSPACE).execute(query); + ImmutableList.Builder readRowsBuilder = ImmutableList.builder(); + cassandraSourceRowMapper.map(resultSet).forEachRemaining(row -> readRowsBuilder.add(row)); + ImmutableList readRows = readRowsBuilder.build(); + + readRows.forEach(r -> assertThat(r.tableName()).isEqualTo(PRIMITIVE_TYPES_TABLE)); + readRows.forEach(r -> assertThat(r.sourceSchemaReference()).isEqualTo(sourceSchemaReference)); + assertThat( + readRows.stream() + .map(r -> r.getPayload().toString()) + .sorted() + .collect(ImmutableList.toImmutableList())) + .isEqualTo( + PRIMITIVE_TYPES_TABLE_AVRO_ROWS.stream() + .sorted() + .collect(ImmutableList.toImmutableList())); + + // Since we will use CassandraIO only for reads, we don't need to support the `deleteAsync` + // and `saveAsync` functions of the CassandraIO mapper interface. 
+ assertThrows( + UnsupportedOperationException.class, + () -> cassandraSourceRowMapper.deleteAsync(readRows.get(1))); + assertThrows( + UnsupportedOperationException.class, + () -> cassandraSourceRowMapper.saveAsync(readRows.get(1))); + } + } + + @Test + public void testCassandraSourceRowForUnsupportedType() { + ResultSet mockResultSet = Mockito.mock(ResultSet.class); + Row mockRow = Mockito.mock(Row.class); + final String testIntCol = "testIntCol"; + when(mockRow.getInt(testIntCol)).thenReturn(42); + when(mockResultSet.iterator()).thenReturn(ImmutableList.of(mockRow).stream().iterator()); + + SourceSchemaReference sourceSchemaReference = + SourceSchemaReference.ofCassandra( + CassandraSchemaReference.builder().setKeyspaceName(TEST_KEYSPACE).build()); + + SourceTableSchema sourceTableSchema = + SourceTableSchema.builder(MapperType.CASSANDRA) + .setTableName("testTable") + .addSourceColumnNameToSourceColumnType( + testIntCol, new SourceColumnType("int", null, null)) + .addSourceColumnNameToSourceColumnType( + "UnSupportedCol1", new SourceColumnType("UnseenColumnType", null, null)) + .addSourceColumnNameToSourceColumnType( + "UnSupportedCol2", new SourceColumnType("UNSUPPORTED", null, null)) + .build(); + + CassandraSourceRowMapper cassandraSourceRowMapper = + CassandraSourceRowMapper.builder() + .setSourceSchemaReference(sourceSchemaReference) + .setSourceTableSchema(sourceTableSchema) + .build(); + + ImmutableList.Builder readRowsBuilder = ImmutableList.builder(); + cassandraSourceRowMapper.map(mockResultSet).forEachRemaining(row -> readRowsBuilder.add(row)); + ImmutableList readRows = readRowsBuilder.build(); + + assertThat( + readRows.stream() + .map(r -> r.getPayload().toString()) + .sorted() + .collect(ImmutableList.toImmutableList())) + .isEqualTo( + ImmutableList.of( + "{\"testIntCol\": 42, \"UnSupportedCol1\": null, \"UnSupportedCol2\": null}")); + } +} diff --git 
a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscoveryTest.java b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscoveryTest.java index be7c1be510..bee3ab67cb 100644 --- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscoveryTest.java +++ b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/schema/CassandraSchemaDiscoveryTest.java @@ -15,11 +15,11 @@ */ package com.google.cloud.teleport.v2.source.reader.io.cassandra.schema; +import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.BASIC_TEST_TABLE_SCHEMA; import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_CONFIG; import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_CQLSH; import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_KEYSPACE; import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_TABLES; -import static com.google.cloud.teleport.v2.source.reader.io.cassandra.testutils.BasicTestSchema.TEST_TABLE_SCHEMA; import static com.google.common.truth.Truth.assertThat; import static org.junit.Assert.assertThrows; @@ -102,8 +102,10 @@ public void testDiscoverTableSchemaBasic() throws IOException, RetriableSchemaDi CassandraSchemaDiscovery cassandraSchemaDiscovery = new CassandraSchemaDiscovery(); ImmutableMap> schema = cassandraSchemaDiscovery.discoverTableSchema( - cassandraDataSource, cassandraSchemaReference, TEST_TABLES); - assertThat(schema).isEqualTo(TEST_TABLE_SCHEMA); + cassandraDataSource, + cassandraSchemaReference, + BASIC_TEST_TABLE_SCHEMA.keySet().asList()); + assertThat(schema).isEqualTo(BASIC_TEST_TABLE_SCHEMA); } @Test diff 
--git a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/testutils/BasicTestSchema.java b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/testutils/BasicTestSchema.java index b8de00a3b6..8bbb572b49 100644 --- a/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/testutils/BasicTestSchema.java +++ b/v2/sourcedb-to-spanner/src/test/java/com/google/cloud/teleport/v2/source/reader/io/cassandra/testutils/BasicTestSchema.java @@ -29,15 +29,26 @@ public class BasicTestSchema { public static final String TEST_KEYSPACE = "test_keyspace"; public static final String TEST_CONFIG = TEST_RESOURCE_ROOT + "basicConfig.yaml"; public static final String TEST_CQLSH = TEST_RESOURCE_ROOT + "basicTest.cql"; + public static final String BASIC_TEST_TABLE = "basic_test_table"; + public static final String PRIMITIVE_TYPES_TABLE = "primitive_types_table"; public static final ImmutableMap> - TEST_TABLE_SCHEMA = + BASIC_TEST_TABLE_SCHEMA = ImmutableMap.of( - "basic_test_table", + BASIC_TEST_TABLE, ImmutableMap.of( "id", new SourceColumnType("TEXT", new Long[] {}, new Long[] {}), "name", new SourceColumnType("TEXT", new Long[] {}, new Long[] {}))); public static final ImmutableList TEST_TABLES = - ImmutableList.copyOf(TEST_TABLE_SCHEMA.keySet()); + ImmutableList.of(BASIC_TEST_TABLE, PRIMITIVE_TYPES_TABLE); + + public static final ImmutableList PRIMITIVE_TYPES_TABLE_AVRO_ROWS = + ImmutableList.of( + "{\"primary_key\": \"dfcad8f3-3cdc-49c7-bce9-575f307c0637\", \"ascii_col\": \"ascii1\", \"bigint_col\": 1234567890, \"blob_col\": \"cafebabe\", \"boolean_col\": true, \"date_col\": 19694, \"decimal_col\": \"123.456\", \"double_col\": 123.456789, \"duration_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 45296000000000}, \"float_col\": 123.45, \"inet_col\": \"/127.0.0.1\", \"int_col\": 12345, \"smallint_col\": 123, 
\"text_col\": \"text1\", \"time_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 45296789000000}, \"timestamp_col\": 1733296987117000000, \"timeuuid_col\": \"9b9419da-b210-11ef-890e-9d9a41af9e54\", \"tinyint_col\": 123, \"uuid_col\": \"c3de3455-6b4e-4a81-a6d7-ab61610f08c6\", \"varchar_col\": \"varchar1\", \"varint_col\": \"1234567890123456789\"}", + "{\"primary_key\": \"fe3263a0-1577-4851-95f8-3af47628baa4\", \"ascii_col\": \"ascii2\", \"bigint_col\": 9876543210, \"blob_col\": \"deadbeef\", \"boolean_col\": false, \"date_col\": 19298, \"decimal_col\": \"987.654\", \"double_col\": 987.654321, \"duration_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": -45296000000000}, \"float_col\": 987.65, \"inet_col\": \"/0:0:0:0:0:0:0:1\", \"int_col\": 98765, \"smallint_col\": 987, \"text_col\": \"text2\", \"time_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 86399999000000}, \"timestamp_col\": 1733296987122000000, \"timeuuid_col\": \"9b94dd2a-b210-11ef-890e-9d9a41af9e54\", \"tinyint_col\": -123, \"uuid_col\": \"6324e301-94fb-44fe-95ac-91d2f7236e2e\", \"varchar_col\": \"varchar2\", \"varint_col\": \"-9876543210987654321\"}", + "{\"primary_key\": \"9a0acb7d-674c-4ee1-9644-9da24b7a72f4\", \"ascii_col\": \"ascii3\", \"bigint_col\": 1010101010, \"blob_col\": \"facefeed\", \"boolean_col\": true, \"date_col\": 19723, \"decimal_col\": \"10.101\", \"double_col\": 10.10101, \"duration_col\": {\"years\": 0, \"months\": 14, \"days\": 3, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 14706000000000}, \"float_col\": 10.1, \"inet_col\": \"/192.168.1.1\", \"int_col\": 10101, \"smallint_col\": 101, \"text_col\": \"text3\", \"time_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 0}, \"timestamp_col\": 1733296987127000000, \"timeuuid_col\": 
\"9b95a07a-b210-11ef-890e-9d9a41af9e54\", \"tinyint_col\": 101, \"uuid_col\": \"f0e1d922-06b5-4f07-a7a6-ec0c9f23e172\", \"varchar_col\": \"varchar3\", \"varint_col\": \"10101010101010101010\"}", + "{\"primary_key\": \"e6bc8562-2575-420f-9344-9fedc4945f61\", \"ascii_col\": null, \"bigint_col\": 0, \"blob_col\": null, \"boolean_col\": false, \"date_col\": null, \"decimal_col\": null, \"double_col\": 0.0, \"duration_col\": null, \"float_col\": 0.0, \"inet_col\": null, \"int_col\": 0, \"smallint_col\": 0, \"text_col\": null, \"time_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 0}, \"timestamp_col\": null, \"timeuuid_col\": null, \"tinyint_col\": 0, \"uuid_col\": null, \"varchar_col\": null, \"varint_col\": null}", + "{\"primary_key\": \"a389de30-f01f-4395-a0c6-c407bfbe81d0\", \"ascii_col\": \"zzzzzzzzzz\", \"bigint_col\": 9223372036854775807, \"blob_col\": \"ffffffff\", \"boolean_col\": true, \"date_col\": 2932896, \"decimal_col\": \"10000000000000000000000000000000000000\", \"double_col\": 1.7976931348623157E308, \"duration_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 320949000000000}, \"float_col\": 3.4028235E38, \"inet_col\": \"/255.255.255.255\", \"int_col\": 2147483647, \"smallint_col\": 32767, \"text_col\": \"abcdef\", \"time_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 86399999000000}, \"timestamp_col\": -1000, \"timeuuid_col\": null, \"tinyint_col\": 127, \"uuid_col\": \"00e4afef-52f8-4e1f-9afa-0632c8ccf790\", \"varchar_col\": \"abcdef\", \"varint_col\": \"9223372036854775807\"}", + "{\"primary_key\": \"29e38561-6376-4b45-b1a0-1709e11cfc8c\", \"ascii_col\": \"\", \"bigint_col\": -9223372036854775808, \"blob_col\": \"00\", \"boolean_col\": false, \"date_col\": -354285, \"decimal_col\": \"-10000000000000000000000000000000000000\", \"double_col\": -1.7976931348623157E308, 
\"duration_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 320949000000000}, \"float_col\": -3.4028235E38, \"inet_col\": \"/0.0.0.0\", \"int_col\": -2147483648, \"smallint_col\": -32768, \"text_col\": \"\", \"time_col\": {\"years\": 0, \"months\": 0, \"days\": 0, \"hours\": 0, \"minutes\": 0, \"seconds\": 0, \"nanos\": 0}, \"timestamp_col\": 0, \"timeuuid_col\": null, \"tinyint_col\": -128, \"uuid_col\": \"fff6d876-560f-48bc-8088-90c69e5a0c40\", \"varchar_col\": \"\", \"varint_col\": \"-9223372036854775808\"}"); private BasicTestSchema() {} ; diff --git a/v2/sourcedb-to-spanner/src/test/resources/CassandraUT/basicTest.cql b/v2/sourcedb-to-spanner/src/test/resources/CassandraUT/basicTest.cql index 20f381fc5f..35882d73c0 100644 --- a/v2/sourcedb-to-spanner/src/test/resources/CassandraUT/basicTest.cql +++ b/v2/sourcedb-to-spanner/src/test/resources/CassandraUT/basicTest.cql @@ -10,4 +10,159 @@ CREATE TABLE basic_test_table( PRIMARY KEY(id)); INSERT INTO basic_test_table(id, name) values('1234','Albert'); -INSERT INTO basic_test_table(id, name) values('5678','Einstein'); \ No newline at end of file +INSERT INTO basic_test_table(id, name) values('5678','Einstein'); + +// Primitive types +CREATE TABLE primitive_types_table ( + primary_key UUID PRIMARY KEY, + ascii_col ASCII, + bigint_col BIGINT, + blob_col BLOB, + boolean_col BOOLEAN, + date_col DATE, + decimal_col DECIMAL, + double_col DOUBLE, + duration_col DURATION, + float_col FLOAT, + inet_col INET, + int_col INT, + smallint_col SMALLINT, + text_col TEXT, + time_col TIME, + timestamp_col TIMESTAMP, + timeuuid_col TIMEUUID, + tinyint_col TINYINT, + uuid_col UUID, + varchar_col VARCHAR, + varint_col VARINT); + +-- Inserting 3 Randomly generated rows. 
+INSERT INTO primitive_types_table (primary_key, ascii_col, bigint_col, blob_col, boolean_col, date_col, decimal_col, double_col, duration_col, float_col, inet_col, int_col, smallint_col, text_col, time_col, timestamp_col, timeuuid_col, tinyint_col, uuid_col, varchar_col, varint_col) +VALUES ( + dfcad8f3-3cdc-49c7-bce9-575f307c0637, + 'ascii1', + 1234567890, + 0xCAFEBABE, + true, + '2023-12-03', + 123.456, + 123.456789, + 12h34m56s, + 123.45, + '127.0.0.1', + 12345, + 123, + 'text1', + '12:34:56.789', + 1733296987117000, + 9b9419da-b210-11ef-890e-9d9a41af9e54, + 123, + c3de3455-6b4e-4a81-a6d7-ab61610f08c6, + 'varchar1', + 1234567890123456789 + ); + + +INSERT INTO primitive_types_table (primary_key, ascii_col, bigint_col, blob_col, boolean_col, date_col, decimal_col, double_col, duration_col, float_col, inet_col, int_col, smallint_col, text_col, time_col, timestamp_col, timeuuid_col, tinyint_col, uuid_col, varchar_col, varint_col) +VALUES ( + fe3263a0-1577-4851-95f8-3af47628baa4, + 'ascii2', + 9876543210, + 0xDEADBEEF, + false, + '2022-11-02', + 987.654, + 987.654321, + -12h34m56s, + 987.65, + '::1', + 98765, + 987, + 'text2', + '23:59:59.999', + 1733296987122000, + 9b94dd2a-b210-11ef-890e-9d9a41af9e54, + -123, + 6324e301-94fb-44fe-95ac-91d2f7236e2e, + 'varchar2', + -9876543210987654321 + ); +INSERT INTO primitive_types_table (primary_key, ascii_col, bigint_col, blob_col, boolean_col, date_col, decimal_col, double_col, duration_col, float_col, inet_col, int_col, smallint_col, text_col, time_col, timestamp_col, timeuuid_col, tinyint_col, uuid_col, varchar_col, varint_col) +VALUES ( + 9a0acb7d-674c-4ee1-9644-9da24b7a72f4, + 'ascii3', + 1010101010, + 0xFACEFEED, + true, + '2024-01-01', + 10.101, + 10.101010, + 1y2mo3d4h5m6s, + 10.10, + '192.168.1.1', + 10101, + 101, + 'text3', + '00:00:00.000', + 1733296987127000, + 9b95a07a-b210-11ef-890e-9d9a41af9e54, + 101, + f0e1d922-06b5-4f07-a7a6-ec0c9f23e172, + 'varchar3', + 10101010101010101010 + ); + +-- Inserting data with 
all columns null (except primary key) +INSERT INTO primitive_types_table (primary_key) VALUES (e6bc8562-2575-420f-9344-9fedc4945f61); + +-- Inserting data with minimum values for each column (where applicable) +INSERT INTO primitive_types_table (primary_key, ascii_col, bigint_col, blob_col, boolean_col, date_col, decimal_col, double_col, duration_col, float_col, inet_col, int_col, smallint_col, text_col, time_col, timestamp_col, timeuuid_col, tinyint_col, uuid_col, varchar_col, varint_col) +VALUES ( + 29e38561-6376-4b45-b1a0-1709e11cfc8c, + '', -- Minimum ASCII (empty string) + -9223372036854775808, -- Minimum BIGINT + 0x00, -- Minimum BLOB (a single zero byte, not empty) + false, -- Minimum BOOLEAN + '1000-01-01', -- Minimum DATE + -10000000000000000000000000000000000000, + -1.7976931348623157E+308, -- Minimum DOUBLE + P0000-00-00T89:09:09, -- TODO Min + -3.4028234663852886E+38, -- Minimum FLOAT + '0.0.0.0', -- Minimum INET + -2147483648, -- Minimum INT + -32768, -- Minimum SMALLINT + '', -- Minimum TEXT (empty string) + '00:00:00.000', -- Minimum TIME + 0, -- Minimum TIMESTAMP (epoch) + null, -- TODO time uuid + -128, -- Minimum TINYINT + fff6d876-560f-48bc-8088-90c69e5a0c40, + '', -- Minimum VARCHAR (empty string) + -9223372036854775808 -- Minimum VARINT + ); + +-- Inserting data with maximum values for each column (where applicable) +INSERT INTO primitive_types_table (primary_key, ascii_col, bigint_col, blob_col, boolean_col, date_col, decimal_col, double_col, duration_col, float_col, inet_col, int_col, smallint_col, text_col, time_col, timestamp_col, timeuuid_col, tinyint_col, uuid_col, varchar_col, varint_col) +VALUES ( + a389de30-f01f-4395-a0c6-c407bfbe81d0, + 'zzzzzzzzzz', -- + 9223372036854775807, -- Maximum BIGINT + 0xFFFFFFFF, -- + true, -- Maximum BOOLEAN + '9999-12-31', -- Maximum DATE + 10000000000000000000000000000000000000, -- + 1.7976931348623157E+308, -- Maximum DOUBLE + P0000-00-00T89:09:09, -- TODO Max + 3.4028234663852886E+38, -- Maximum FLOAT + '255.255.255.255',-- 
Maximum INET + 2147483647, -- Maximum INT + 32767, -- Maximum SMALLINT + 'abcdef', -- + '23:59:59.999', -- Maximum TIME + 9223372036854775807, + null, -- TODO time uuid + 127, -- Maximum TINYINT + 00e4afef-52f8-4e1f-9afa-0632c8ccf790, + 'abcdef', + 9223372036854775807 -- Maximum VARINT + ); diff --git a/v2/spanner-change-streams-to-sharded-file-sink/README_Spanner_Change_Streams_to_Sharded_File_Sink.md b/v2/spanner-change-streams-to-sharded-file-sink/README_Spanner_Change_Streams_to_Sharded_File_Sink.md index 76b2832750..837f474939 100644 --- a/v2/spanner-change-streams-to-sharded-file-sink/README_Spanner_Change_Streams_to_Sharded_File_Sink.md +++ b/v2/spanner-change-streams-to-sharded-file-sink/README_Spanner_Change_Streams_to_Sharded_File_Sink.md @@ -17,29 +17,29 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **changeStreamName** : This is the name of the Spanner change stream that the pipeline will read from. -* **instanceId** : This is the name of the Cloud Spanner instance where the changestream is present. -* **databaseId** : This is the name of the Cloud Spanner database that the changestream is monitoring. -* **spannerProjectId** : This is the name of the Cloud Spanner project. -* **metadataInstance** : This is the instance to store the metadata used by the connector to control the consumption of the change stream API data. -* **metadataDatabase** : This is the database to store the metadata used by the connector to control the consumption of the change stream API data. -* **gcsOutputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. (Example: gs://your-bucket/your-path/). -* **sourceShardsFilePath** : Source shard details file path in Cloud Storage that contains connection profile of source shards. Atleast one shard information is expected. 
-* **runIdentifier** : The identifier to distinguish between different runs of reverse replication flows. +* **changeStreamName**: This is the name of the Spanner change stream that the pipeline will read from. +* **instanceId**: This is the name of the Cloud Spanner instance where the change stream is present. +* **databaseId**: This is the name of the Cloud Spanner database that the change stream is monitoring. +* **spannerProjectId**: This is the name of the Cloud Spanner project. +* **metadataInstance**: This is the instance to store the metadata used by the connector to control the consumption of the change stream API data. +* **metadataDatabase**: This is the database to store the metadata used by the connector to control the consumption of the change stream API data. +* **gcsOutputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. For example, `gs://your-bucket/your-path/`. +* **sourceShardsFilePath**: Source shard details file path in Cloud Storage that contains connection profile of source shards. At least one shard's information is expected. +* **runIdentifier**: The identifier to distinguish between different runs of reverse replication flows. ### Optional parameters -* **startTimestamp** : Read changes from the given timestamp. Defaults to empty. -* **endTimestamp** : Read changes until the given timestamp. If no timestamp provided, reads indefinitely. Defaults to empty. -* **sessionFilePath** : Session file path in Cloud Storage that contains mapping information from HarbourBridge. Needed when doing sharded reverse replication. -* **windowDuration** : The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 5m). Defaults to: 10s. 
-* **filtrationMode** : Mode of Filtration, decides how to drop certain records based on a criteria. Currently supported modes are: none (filter nothing), forward_migration (filter records written via the forward migration pipeline). Defaults to forward_migration. -* **metadataTableSuffix** : Suffix appended to the spanner_to_gcs_metadata and shard_file_create_progress metadata tables.Useful when doing multiple runs.Only alpha numeric and underscores are allowed. Defaults to empty. -* **skipDirectoryName** : Records skipped from reverse replication are written to this directory. Default directory name is skip. -* **runMode** : Regular starts from input start time, resume start from last processed time. Defaults to: regular. -* **shardingCustomJarPath** : Custom jar location in Cloud Storage that contains the customization logic for fetching shard id. Defaults to empty. -* **shardingCustomClassName** : Fully qualified class name having the custom shard id implementation. It is a mandatory field in case shardingCustomJarPath is specified. Defaults to empty. -* **shardingCustomParameters** : String containing any custom parameters to be passed to the custom sharding class. Defaults to empty. +* **startTimestamp**: Read changes from the given timestamp. Defaults to empty. +* **endTimestamp**: Read changes until the given timestamp. If no timestamp provided, reads indefinitely. Defaults to empty. +* **sessionFilePath**: Session file path in Cloud Storage that contains mapping information from HarbourBridge. Needed when doing sharded reverse replication. +* **windowDuration**: The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `5m`. Defaults to: 10s. +* **filtrationMode**: Mode of Filtration, decides how to drop certain records based on a criteria. 
Currently supported modes are: none (filter nothing), forward_migration (filter records written via the forward migration pipeline). Defaults to forward_migration. +* **metadataTableSuffix**: Suffix appended to the spanner_to_gcs_metadata and shard_file_create_progress metadata tables. Useful when doing multiple runs. Only alphanumeric characters and underscores are allowed. Defaults to empty. +* **skipDirectoryName**: Records skipped from reverse replication are written to this directory. Default directory name is skip. +* **runMode**: Regular starts from the input start time; resume starts from the last processed time. Defaults to: regular. +* **shardingCustomJarPath**: Custom jar location in Cloud Storage that contains the customization logic for fetching shard id. Defaults to empty. +* **shardingCustomClassName**: Fully qualified class name having the custom shard id implementation. It is a mandatory field in case shardingCustomJarPath is specified. Defaults to empty. +* **shardingCustomParameters**: String containing any custom parameters to be passed to the custom sharding class. Defaults to empty. 
@@ -264,13 +264,13 @@ resource "google_dataflow_flex_template_job" "spanner_change_streams_to_sharded_ spannerProjectId = "" metadataInstance = "" metadataDatabase = "" - gcsOutputDirectory = "gs://your-bucket/your-path/" + gcsOutputDirectory = "" sourceShardsFilePath = "" runIdentifier = "" # startTimestamp = "" # endTimestamp = "" # sessionFilePath = "" - # windowDuration = "5m" + # windowDuration = "10s" # filtrationMode = "forward_migration" # metadataTableSuffix = "" # skipDirectoryName = "skip" diff --git a/v2/spanner-common/src/main/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertor.java b/v2/spanner-common/src/main/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertor.java index 4c7ea2cdc6..3cfd9c8fcc 100644 --- a/v2/spanner-common/src/main/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertor.java +++ b/v2/spanner-common/src/main/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertor.java @@ -28,6 +28,7 @@ import java.time.Instant; import java.time.LocalDate; import java.time.LocalTime; +import java.time.Period; import java.time.ZoneId; import java.time.ZoneOffset; import java.time.ZonedDateTime; @@ -47,6 +48,7 @@ import org.apache.avro.generic.GenericRecord; import org.apache.beam.sdk.metrics.Distribution; import org.apache.beam.sdk.metrics.Metrics; +import org.apache.commons.lang3.StringUtils; import org.apache.kerby.util.Hex; import org.joda.time.Duration; import org.slf4j.Logger; @@ -529,6 +531,35 @@ static String handleRecordFieldType(String fieldName, GenericRecord element, Sch // Handle hours separately since that can also be negative. We convert micros to localTime // format (HH:MM:SS), then strip of HH:, which will always be "00:". 
return String.format("%s:%s", hours, localTime.substring(3)); + } else if (fieldSchema.getName().equals("intervalNano")) { + Period period = + Period.ZERO + .plusYears(getOrDefault(element, "years", 0L)) + .plusMonths(getOrDefault(element, "months", 0L)) + .plusDays(getOrDefault(element, "days", 0L)); + /* + * Convert the period to a ISO-8601 period formatted String, such as P6Y3M1D. + * A zero period will be represented as zero days, 'P0D'. + * Refer to javadoc for Period#toString. + */ + String periodIso8061 = period.toString(); + java.time.Duration duration = + java.time.Duration.ZERO + .plusHours(getOrDefault(element, "hours", 0L)) + .plusMinutes(getOrDefault(element, "minutes", 0L)) + .plusSeconds(getOrDefault(element, "seconds", 0L)) + .plusNanos(getOrDefault(element, "nanos", 0L)); + /* + * Convert the duration to a ISO-8601 period formatted String, such as PT8H6M12.345S + * refer to javadoc for Duration#toString. + */ + String durationIso8610 = duration.toString(); + // Convert to ISO-8601 period format. 
+ if (duration.isZero()) { + return periodIso8061; + } else { + return periodIso8061 + StringUtils.removeStartIgnoreCase(durationIso8610, "P"); + } } else { throw new UnsupportedOperationException( String.format( @@ -536,4 +567,11 @@ static String handleRecordFieldType(String fieldName, GenericRecord element, Sch fieldSchema.getName(), element, fieldName)); } } + + private static T getOrDefault(GenericRecord element, String name, T def) { + if (element.get(name) == null) { + return def; + } + return (T) element.get(name); + } } diff --git a/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/AvroTestingHelper.java b/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/AvroTestingHelper.java index 08a2c6babd..bed7558b00 100644 --- a/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/AvroTestingHelper.java +++ b/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/AvroTestingHelper.java @@ -61,6 +61,33 @@ public class AvroTestingHelper { .noDefault() .endRecord(); + public static final Schema INTERVAL_NANOS_SCHEMA = + SchemaBuilder.builder() + .record("intervalNano") + .fields() + .name("years") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("months") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("days") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("hours") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("minutes") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("seconds") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .name("nanos") + .type(SchemaBuilder.builder().longType()) + .withDefault(0L) + .endRecord(); + public static final Schema UNSUPPORTED_SCHEMA = SchemaBuilder.record("unsupportedName") .fields() @@ -91,4 +118,17 @@ public static GenericRecord createIntervalRecord(Integer months, Integer 
hours, genericRecord.put("micros", micros); return genericRecord; } + + public static GenericRecord createIntervalNanosRecord( + Long years, Long months, Long days, Long hours, Long minutes, Long seconds, Long nanos) { + GenericRecord genericRecord = new GenericData.Record(INTERVAL_NANOS_SCHEMA); + genericRecord.put("years", years); + genericRecord.put("months", months); + genericRecord.put("days", days); + genericRecord.put("hours", hours); + genericRecord.put("minutes", minutes); + genericRecord.put("seconds", seconds); + genericRecord.put("nanos", nanos); + return genericRecord; + } } diff --git a/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertorTest.java b/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertorTest.java index 68b94f69d3..e5c9ff4a94 100644 --- a/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertorTest.java +++ b/v2/spanner-common/src/test/java/com/google/cloud/teleport/v2/spanner/migrations/avro/GenericRecordTypeConvertorTest.java @@ -337,6 +337,101 @@ public void testHandleRecordFieldType() { AvroTestingHelper.UNSUPPORTED_SCHEMA)); } + /* + * Test conversion of Interval Nano to String for various cases. + */ + @Test + public void testIntervalNanos() { + String result; + + /* Basic Test. */ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord(1000L, 1000L, 3890L, 25L, 331L, 12L, 9L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals( + "Test #1 interval nano conversion:", "P1000Y1000M3890DT30H31M12.000000009S", result); + + /* Test with any field set as null gets treated as 0. 
*/ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord(1000L, 1000L, 3890L, 25L, null, 12L, 9L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals( + "Test #2 interval nano conversion with null minutes:", + "P1000Y1000M3890DT25H12.000000009S", + result); + + /* Basic test for negative field. */ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord(1000L, -1000L, 3890L, 25L, 31L, 12L, 9L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals( + "Test #3 interval nano conversion with negative months:", + "P1000Y-1000M3890DT25H31M12.000000009S", + result); + + /* Test that negative nanos subtract from the fractional seconds, for example 12 Seconds -9 Nanos becomes 11.999999991s. */ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord(1000L, 31L, 3890L, 25L, 31L, 12L, -9L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals( + "Test #4 interval nano conversion with negative nanos:", + "P1000Y31M3890DT25H31M11.999999991S", + result); + + /* Test 0 interval. */ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord(0L, 0L, 0L, 0L, 0L, 0L, 0L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals("Test #5 interval nano conversion with all zeros", "P0D", result); + + /* Test almost zero interval with only nanos set. */ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord(0L, 0L, 0L, 0L, 0L, 0L, 1L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals("Test #6 interval nano conversion with only nanos", "P0DT0.000000001S", result); + /* Test with large values.
*/ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord( + 2147483647L, 11L, 2147483647L, 2147483647L, 2147483647L, 2147483647L, 999999999L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals( + "Test #6 interval nano conversion with INT.MAX values", + "P2147483647Y11M2147483647DT2183871564H21M7.999999999S", + result); + + /* Test with large negative values. */ + result = + GenericRecordTypeConvertor.handleRecordFieldType( + "interval_nanos_column", + AvroTestingHelper.createIntervalNanosRecord( + -2147483647L, + -11L, + -2147483647L, + -2147483647L, + -2147483647L, + -2147483647L, + -999999999L), + AvroTestingHelper.INTERVAL_NANOS_SCHEMA); + assertEquals( + "Test #6 interval nano conversion with -INT.MAX values", + "P-2147483647Y-11M-2147483647DT-2183871564H-21M-7.999999999S", + result); + } + @Test public void testHandleRecordFieldType_nullInput() { assertNull( diff --git a/v2/spanner-custom-shard/src/main/java/com/custom/CustomTransformationWithShardForLiveIT.java b/v2/spanner-custom-shard/src/main/java/com/custom/CustomTransformationWithShardForLiveIT.java index f6b65ba41b..bf9c890bd0 100644 --- a/v2/spanner-custom-shard/src/main/java/com/custom/CustomTransformationWithShardForLiveIT.java +++ b/v2/spanner-custom-shard/src/main/java/com/custom/CustomTransformationWithShardForLiveIT.java @@ -131,7 +131,6 @@ public MigrationTransformationResponse toSourceRow(MigrationTransformationReques Long tinyIntColumn = Long.parseLong((String) requestRow.get("tinyint_column")) + 1; Long intColumn = Long.parseLong((String) requestRow.get("int_column")) + 1; Long bigIntColumn = Long.parseLong((String) requestRow.get("bigint_column")) + 1; - Long timeColumn = Long.parseLong((String) requestRow.get("time_column")) + 1000; Long yearColumn = Long.parseLong((String) requestRow.get("year_column")) + 1; BigDecimal floatColumn = (BigDecimal) requestRow.get("float_column"); BigDecimal 
doubleColumn = (BigDecimal) requestRow.get("double_column"); @@ -143,7 +142,6 @@ public MigrationTransformationResponse toSourceRow(MigrationTransformationReques responseRow.put("double_column", doubleColumn.add(BigDecimal.ONE).toString()); Double value = Double.parseDouble((String) requestRow.get("decimal_column")); responseRow.put("decimal_column", String.valueOf(value - 1)); - responseRow.put("time_column", "\'" + timeColumn + "\'"); responseRow.put("bool_column", "false"); responseRow.put("enum_column", "\'3\'"); responseRow.put( @@ -191,11 +189,26 @@ public MigrationTransformationResponse toSourceRow(MigrationTransformationReques "CONVERT_TZ(\'" + timestampColumn.substring(0, timestampColumn.length() - 1) + "\','+00:00','+00:00')"); + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("HH:mm:ss"); + LocalTime time = LocalTime.parse((String) requestRow.get("time_column"), formatter); + + LocalTime newTime = time.plusMinutes(10); + responseRow.put("time_column", "\'" + newTime.format(formatter) + "\'"); } catch (Exception e) { throw new InvalidTransformationException(e); } + MigrationTransformationResponse response = + new MigrationTransformationResponse(responseRow, false); + return response; + } else if (request.getTableName().equals("Users1")) { + Map responseRow = new HashMap<>(); + Map requestRow = request.getRequestRow(); + String name = requestRow.get("name").toString(); + String[] nameArray = name.split(" "); + responseRow.put("first_name", "\'" + nameArray[0] + "\'"); + responseRow.put("last_name", "\'" + nameArray[1] + "\'"); MigrationTransformationResponse response = new MigrationTransformationResponse(responseRow, false); return response; diff --git a/v2/spanner-to-sourcedb/README_Spanner_to_SourceDb.md b/v2/spanner-to-sourcedb/README_Spanner_to_SourceDb.md index f84df5c26c..0c28c6e891 100644 --- a/v2/spanner-to-sourcedb/README_Spanner_to_SourceDb.md +++ b/v2/spanner-to-sourcedb/README_Spanner_to_SourceDb.md @@ -14,33 +14,38 @@ on [Metadata 
Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **changeStreamName** : This is the name of the Spanner change stream that the pipeline will read from. -* **instanceId** : This is the name of the Cloud Spanner instance where the changestream is present. -* **databaseId** : This is the name of the Cloud Spanner database that the changestream is monitoring. -* **spannerProjectId** : This is the name of the Cloud Spanner project. -* **metadataInstance** : This is the instance to store the metadata used by the connector to control the consumption of the change stream API data. -* **metadataDatabase** : This is the database to store the metadata used by the connector to control the consumption of the change stream API data. -* **sourceShardsFilePath** : Path to GCS file containing connection profile info for source shards. +* **changeStreamName**: This is the name of the Spanner change stream that the pipeline will read from. +* **instanceId**: This is the name of the Cloud Spanner instance where the changestream is present. +* **databaseId**: This is the name of the Cloud Spanner database that the changestream is monitoring. +* **spannerProjectId**: This is the name of the Cloud Spanner project. +* **metadataInstance**: This is the instance to store the metadata used by the connector to control the consumption of the change stream API data. +* **metadataDatabase**: This is the database to store the metadata used by the connector to control the consumption of the change stream API data. +* **sourceShardsFilePath**: Path to GCS file containing connection profile info for source shards. ### Optional parameters -* **startTimestamp** : Read changes from the given timestamp. Defaults to empty. -* **endTimestamp** : Read changes until the given timestamp. If no timestamp provided, reads indefinitely. Defaults to empty. -* **shadowTablePrefix** : The prefix used to name shadow tables. Default: `shadow_`. 
-* **sessionFilePath** : Session file path in Cloud Storage that contains mapping information from HarbourBridge. -* **filtrationMode** : Mode of Filtration, decides how to drop certain records based on a criteria. Currently supported modes are: none (filter nothing), forward_migration (filter records written via the forward migration pipeline). Defaults to forward_migration. -* **shardingCustomJarPath** : Custom jar location in Cloud Storage that contains the customization logic for fetching shard id. Defaults to empty. -* **shardingCustomClassName** : Fully qualified class name having the custom shard id implementation. It is a mandatory field in case shardingCustomJarPath is specified. Defaults to empty. -* **shardingCustomParameters** : String containing any custom parameters to be passed to the custom sharding class. Defaults to empty. -* **sourceDbTimezoneOffset** : This is the timezone offset from UTC for the source database. Example value: +10:00. Defaults to: +00:00. -* **dlqGcsPubSubSubscription** : The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ retry directory when running in regular mode. The name should be in the format of projects//subscriptions/. When set, the deadLetterQueueDirectory and dlqRetryMinutes are ignored. -* **skipDirectoryName** : Records skipped from reverse replication are written to this directory. Default directory name is skip. -* **maxShardConnections** : This will come from shard file eventually. Defaults to: 10000. -* **deadLetterQueueDirectory** : The file path used when storing the error queue output. The default file path is a directory under the Dataflow job's temp location. -* **dlqMaxRetryCount** : The max number of times temporary errors can be retried through DLQ. Defaults to 500. -* **runMode** : This is the run mode type, whether regular or with retryDLQ.Default is regular. retryDLQ is used to retry the severe DLQ records only. 
-* **dlqRetryMinutes** : The number of minutes between dead letter queue retries. Defaults to 10. -* **sourceType** : The type of source database to reverse replicate to. Defaults to mysql. +* **startTimestamp**: Read changes from the given timestamp. Defaults to empty. +* **endTimestamp**: Read changes until the given timestamp. If no timestamp provided, reads indefinitely. Defaults to empty. +* **shadowTablePrefix**: The prefix used to name shadow tables. Default: `shadow_`. +* **sessionFilePath**: Session file path in Cloud Storage that contains mapping information from HarbourBridge. +* **filtrationMode**: Mode of Filtration, decides how to drop certain records based on a criteria. Currently supported modes are: none (filter nothing), forward_migration (filter records written via the forward migration pipeline). Defaults to forward_migration. +* **shardingCustomJarPath**: Custom jar location in Cloud Storage that contains the customization logic for fetching shard id. Defaults to empty. +* **shardingCustomClassName**: Fully qualified class name having the custom shard id implementation. It is a mandatory field in case shardingCustomJarPath is specified. Defaults to empty. +* **shardingCustomParameters**: String containing any custom parameters to be passed to the custom sharding class. Defaults to empty. +* **sourceDbTimezoneOffset**: This is the timezone offset from UTC for the source database. Example value: +10:00. Defaults to: +00:00. +* **dlqGcsPubSubSubscription**: The Pub/Sub subscription being used in a Cloud Storage notification policy for DLQ retry directory when running in regular mode. The name should be in the format of projects//subscriptions/. When set, the deadLetterQueueDirectory and dlqRetryMinutes are ignored. +* **skipDirectoryName**: Records skipped from reverse replication are written to this directory. Default directory name is skip. +* **maxShardConnections**: This will come from shard file eventually. Defaults to: 10000. 
+* **deadLetterQueueDirectory**: The file path used when storing the error queue output. The default file path is a directory under the Dataflow job's temp location. +* **dlqMaxRetryCount**: The max number of times temporary errors can be retried through DLQ. Defaults to 500. +* **runMode**: This is the run mode type, whether regular or with retryDLQ. Default is regular. retryDLQ is used to retry the severe DLQ records only. +* **dlqRetryMinutes**: The number of minutes between dead letter queue retries. Defaults to 10. +* **sourceType**: The type of source database to reverse replicate to. Defaults to: mysql. +* **transformationJarPath**: Custom jar location in Cloud Storage that contains the custom transformation logic for processing records in reverse replication. Defaults to empty. +* **transformationClassName**: Fully qualified class name having the custom transformation logic. It is a mandatory field in case transformationJarPath is specified. Defaults to empty. +* **transformationCustomParameters**: String containing any custom parameters to be passed to the custom transformation class. Defaults to empty. +* **filterEventsDirectoryName**: Records filtered out from reverse replication are written to this directory. Default directory name is filteredEvents. 
+ ## Getting Started @@ -143,6 +148,11 @@ export DEAD_LETTER_QUEUE_DIRECTORY="" export DLQ_MAX_RETRY_COUNT=500 export RUN_MODE=regular export DLQ_RETRY_MINUTES=10 +export SOURCE_TYPE=mysql +export TRANSFORMATION_JAR_PATH="" +export TRANSFORMATION_CLASS_NAME="" +export TRANSFORMATION_CUSTOM_PARAMETERS="" +export FILTER_EVENTS_DIRECTORY_NAME=filteredEvents gcloud dataflow flex-template run "spanner-to-sourcedb-job" \ --project "$PROJECT" \ @@ -170,7 +180,12 @@ gcloud dataflow flex-template run "spanner-to-sourcedb-job" \ --parameters "deadLetterQueueDirectory=$DEAD_LETTER_QUEUE_DIRECTORY" \ --parameters "dlqMaxRetryCount=$DLQ_MAX_RETRY_COUNT" \ --parameters "runMode=$RUN_MODE" \ - --parameters "dlqRetryMinutes=$DLQ_RETRY_MINUTES" + --parameters "dlqRetryMinutes=$DLQ_RETRY_MINUTES" \ + --parameters "sourceType=$SOURCE_TYPE" \ + --parameters "transformationJarPath=$TRANSFORMATION_JAR_PATH" \ + --parameters "transformationClassName=$TRANSFORMATION_CLASS_NAME" \ + --parameters "transformationCustomParameters=$TRANSFORMATION_CUSTOM_PARAMETERS" \ + --parameters "filterEventsDirectoryName=$FILTER_EVENTS_DIRECTORY_NAME" ``` For more information about the command, please check: @@ -214,6 +229,11 @@ export DEAD_LETTER_QUEUE_DIRECTORY="" export DLQ_MAX_RETRY_COUNT=500 export RUN_MODE=regular export DLQ_RETRY_MINUTES=10 +export SOURCE_TYPE=mysql +export TRANSFORMATION_JAR_PATH="" +export TRANSFORMATION_CLASS_NAME="" +export TRANSFORMATION_CUSTOM_PARAMETERS="" +export FILTER_EVENTS_DIRECTORY_NAME=filteredEvents mvn clean package -PtemplatesRun \ -DskipTests \ @@ -222,7 +242,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="spanner-to-sourcedb-job" \ -DtemplateName="Spanner_to_SourceDb" \ 
--Dparameters="changeStreamName=$CHANGE_STREAM_NAME,instanceId=$INSTANCE_ID,databaseId=$DATABASE_ID,spannerProjectId=$SPANNER_PROJECT_ID,metadataInstance=$METADATA_INSTANCE,metadataDatabase=$METADATA_DATABASE,startTimestamp=$START_TIMESTAMP,endTimestamp=$END_TIMESTAMP,shadowTablePrefix=$SHADOW_TABLE_PREFIX,sourceShardsFilePath=$SOURCE_SHARDS_FILE_PATH,sessionFilePath=$SESSION_FILE_PATH,filtrationMode=$FILTRATION_MODE,shardingCustomJarPath=$SHARDING_CUSTOM_JAR_PATH,shardingCustomClassName=$SHARDING_CUSTOM_CLASS_NAME,shardingCustomParameters=$SHARDING_CUSTOM_PARAMETERS,sourceDbTimezoneOffset=$SOURCE_DB_TIMEZONE_OFFSET,dlqGcsPubSubSubscription=$DLQ_GCS_PUB_SUB_SUBSCRIPTION,skipDirectoryName=$SKIP_DIRECTORY_NAME,maxShardConnections=$MAX_SHARD_CONNECTIONS,deadLetterQueueDirectory=$DEAD_LETTER_QUEUE_DIRECTORY,dlqMaxRetryCount=$DLQ_MAX_RETRY_COUNT,runMode=$RUN_MODE,dlqRetryMinutes=$DLQ_RETRY_MINUTES" \ +-Dparameters="changeStreamName=$CHANGE_STREAM_NAME,instanceId=$INSTANCE_ID,databaseId=$DATABASE_ID,spannerProjectId=$SPANNER_PROJECT_ID,metadataInstance=$METADATA_INSTANCE,metadataDatabase=$METADATA_DATABASE,startTimestamp=$START_TIMESTAMP,endTimestamp=$END_TIMESTAMP,shadowTablePrefix=$SHADOW_TABLE_PREFIX,sourceShardsFilePath=$SOURCE_SHARDS_FILE_PATH,sessionFilePath=$SESSION_FILE_PATH,filtrationMode=$FILTRATION_MODE,shardingCustomJarPath=$SHARDING_CUSTOM_JAR_PATH,shardingCustomClassName=$SHARDING_CUSTOM_CLASS_NAME,shardingCustomParameters=$SHARDING_CUSTOM_PARAMETERS,sourceDbTimezoneOffset=$SOURCE_DB_TIMEZONE_OFFSET,dlqGcsPubSubSubscription=$DLQ_GCS_PUB_SUB_SUBSCRIPTION,skipDirectoryName=$SKIP_DIRECTORY_NAME,maxShardConnections=$MAX_SHARD_CONNECTIONS,deadLetterQueueDirectory=$DEAD_LETTER_QUEUE_DIRECTORY,dlqMaxRetryCount=$DLQ_MAX_RETRY_COUNT,runMode=$RUN_MODE,dlqRetryMinutes=$DLQ_RETRY_MINUTES,sourceType=$SOURCE_TYPE,transformationJarPath=$TRANSFORMATION_JAR_PATH,transformationClassName=$TRANSFORMATION_CLASS_NAME,transformationCustomParameters=$TRANSFORMATION_CUSTOM_PARAMETER
S,filterEventsDirectoryName=$FILTER_EVENTS_DIRECTORY_NAME" \ -f v2/spanner-to-sourcedb ``` @@ -290,6 +310,11 @@ resource "google_dataflow_flex_template_job" "spanner_to_sourcedb" { # dlqMaxRetryCount = "500" # runMode = "regular" # dlqRetryMinutes = "10" + # sourceType = "mysql" + # transformationJarPath = "" + # transformationClassName = "" + # transformationCustomParameters = "" + # filterEventsDirectoryName = "filteredEvents" } } ``` diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomShardIT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomShardIT.java index 6f56432066..e58488283d 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomShardIT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomShardIT.java @@ -122,6 +122,7 @@ public void setUp() throws IOException, InterruptedException { getClass().getSimpleName(), "input/customShard.jar", "com.custom.CustomShardIdFetcherForIT", + null, null); } } diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomTransformationIT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomTransformationIT.java new file mode 100644 index 0000000000..5f755e8ea7 --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbCustomTransformationIT.java @@ -0,0 +1,412 @@ +/* + * Copyright (C) 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. 
You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ +package com.google.cloud.teleport.v2.templates; + +import static com.google.common.truth.Truth.assertThat; +import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatPipeline; +import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatResult; + +import com.google.cloud.ByteArray; +import com.google.cloud.Date; +import com.google.cloud.Timestamp; +import com.google.cloud.spanner.Key; +import com.google.cloud.spanner.Mutation; +import com.google.cloud.spanner.Value; +import com.google.cloud.teleport.metadata.SkipDirectRunnerTest; +import com.google.cloud.teleport.metadata.TemplateIntegrationTest; +import com.google.cloud.teleport.v2.spanner.migrations.transformation.CustomTransformation; +import com.google.common.io.Resources; +import com.google.pubsub.v1.SubscriptionName; +import java.io.IOException; +import java.math.BigDecimal; +import java.nio.charset.StandardCharsets; +import java.time.Duration; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import org.apache.beam.it.common.PipelineLauncher; +import org.apache.beam.it.common.PipelineOperator; +import org.apache.beam.it.common.utils.ResourceManagerUtils; +import org.apache.beam.it.gcp.pubsub.PubsubResourceManager; +import org.apache.beam.it.gcp.spanner.SpannerResourceManager; +import org.apache.beam.it.gcp.storage.GcsResourceManager; +import org.apache.beam.it.jdbc.MySQLResourceManager; +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import 
org.junit.runner.RunWith; +import org.junit.runners.JUnit4; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Integration test for {@link SpannerToSourceDb} Flex template with custom transformation jar + * supplied. + */ +@Category({TemplateIntegrationTest.class, SkipDirectRunnerTest.class}) +@TemplateIntegrationTest(SpannerToSourceDb.class) +@RunWith(JUnit4.class) +public class SpannerToSourceDbCustomTransformationIT extends SpannerToSourceDbITBase { + private static final Logger LOG = + LoggerFactory.getLogger(SpannerToSourceDbCustomTransformationIT.class); + + private static final String SPANNER_DDL_RESOURCE = + "SpannerToSourceDbCustomTransformationIT/spanner-schema.sql"; + private static final String SESSION_FILE_RESOURCE = + "SpannerToSourceDbCustomTransformationIT/session.json"; + private static final String MYSQL_SCHEMA_FILE_RESOURCE = + "SpannerToSourceDbCustomTransformationIT/mysql-schema.sql"; + + private static final String TABLE = "Users1"; + + private static final String TABLE2 = "AllDatatypeTransformation"; + private static final HashSet testInstances = + new HashSet<>(); + private static PipelineLauncher.LaunchInfo jobInfo; + public static SpannerResourceManager spannerResourceManager; + private static SpannerResourceManager spannerMetadataResourceManager; + private static MySQLResourceManager jdbcResourceManager; + private static GcsResourceManager gcsResourceManager; + private static PubsubResourceManager pubsubResourceManager; + private SubscriptionName subscriptionName; + + /** + * Setup resource managers and Launch dataflow job once during the execution of this test class. 
+ * + * @throws IOException + */ + @Before + public void setUp() throws IOException, InterruptedException { + skipBaseCleanup = true; + synchronized (SpannerToSourceDbCustomTransformationIT.class) { + testInstances.add(this); + if (jobInfo == null) { + spannerResourceManager = + createSpannerDatabase(SpannerToSourceDbCustomTransformationIT.SPANNER_DDL_RESOURCE); + spannerMetadataResourceManager = createSpannerMetadataDatabase(); + + jdbcResourceManager = MySQLResourceManager.builder(testName).build(); + + createMySQLSchema( + jdbcResourceManager, + SpannerToSourceDbCustomTransformationIT.MYSQL_SCHEMA_FILE_RESOURCE); + + gcsResourceManager = + GcsResourceManager.builder(artifactBucketName, getClass().getSimpleName(), credentials) + .build(); + createAndUploadShardConfigToGcs(gcsResourceManager, jdbcResourceManager); + gcsResourceManager.uploadArtifact( + "input/session.json", Resources.getResource(SESSION_FILE_RESOURCE).getPath()); + pubsubResourceManager = setUpPubSubResourceManager(); + subscriptionName = + createPubsubResources( + getClass().getSimpleName(), + pubsubResourceManager, + getGcsPath("dlq", gcsResourceManager).replace("gs://" + artifactBucketName, "")); + CustomTransformation customTransformation = + CustomTransformation.builder( + "input/customShard.jar", "com.custom.CustomTransformationWithShardForLiveIT") + .build(); + createAndUploadJarToGcs(gcsResourceManager); + jobInfo = + launchDataflowJob( + gcsResourceManager, + spannerResourceManager, + spannerMetadataResourceManager, + subscriptionName.toString(), + null, + null, + null, + null, + customTransformation); + } + } + } + + /** + * Cleanup dataflow job and all the resources and resource managers. 
+ * + * @throws IOException + */ + @AfterClass + public static void cleanUp() throws IOException { + for (SpannerToSourceDbCustomTransformationIT instance : testInstances) { + instance.tearDownBase(); + } + ResourceManagerUtils.cleanResources( + spannerResourceManager, + jdbcResourceManager, + spannerMetadataResourceManager, + gcsResourceManager, + pubsubResourceManager); + } + + @Test + public void spannerToSourceDbWithCustomTransformation() throws InterruptedException { + assertThatPipeline(jobInfo).isRunning(); + // Write row in Spanner + writeRowInSpanner(); + // Assert events on Mysql + assertRowInMySQL(); + } + + private void writeRowInSpanner() { + Mutation m = + Mutation.newInsertOrUpdateBuilder("Users1").set("id").to(1).set("name").to("AA BB").build(); + spannerResourceManager.write(m); + m = + Mutation.newInsertOrUpdateBuilder("AllDatatypeTransformation") + .set("varchar_column") + .to("example2") + .set("bigint_column") + .to(1000) + .set("binary_column") + .to(Value.bytes(ByteArray.copyFrom("bin_column"))) + .set("bit_column") + .to(Value.bytes(ByteArray.copyFrom("1"))) + .set("blob_column") + .to(Value.bytes(ByteArray.copyFrom("blob_column"))) + .set("bool_column") + .to(Value.bool(Boolean.TRUE)) + .set("date_column") + .to(Value.date(Date.fromYearMonthDay(2024, 01, 01))) + .set("datetime_column") + .to(Value.timestamp(Timestamp.parseTimestamp("2024-01-01T12:34:56Z"))) + .set("decimal_column") + .to(new BigDecimal("99999.99")) + .set("double_column") + .to(123456.123) + .set("enum_column") + .to("1") + .set("float_column") + .to(12345.67) + .set("int_column") + .to(100) + .set("text_column") + .to("Sample text for entry 2") + .set("time_column") + .to("14:30:00") + .set("timestamp_column") + .to(Value.timestamp(Timestamp.parseTimestamp("2024-01-01T12:34:56Z"))) + .set("tinyint_column") + .to(2) + .set("year_column") + .to("2024") + .build(); + spannerResourceManager.write(m); + m = + Mutation.newUpdateBuilder("AllDatatypeTransformation") + 
.set("varchar_column") + .to("example2") + .set("bigint_column") + .to(1000) + .set("binary_column") + .to(Value.bytes(ByteArray.copyFrom("bin_column"))) + .set("bit_column") + .to(Value.bytes(ByteArray.copyFrom("1"))) + .set("blob_column") + .to(Value.bytes(ByteArray.copyFrom("blob_column"))) + .set("bool_column") + .to(Value.bool(Boolean.TRUE)) + .set("date_column") + .to(Value.date(Date.fromYearMonthDay(2024, 01, 01))) + .set("datetime_column") + .to(Value.timestamp(Timestamp.parseTimestamp("2024-01-01T12:34:56Z"))) + .set("decimal_column") + .to(new BigDecimal("99999.99")) + .set("double_column") + .to(123456.123) + .set("enum_column") + .to("1") + .set("float_column") + .to(12345.67) + .set("int_column") + .to(100) + .set("text_column") + .to("Sample text for entry 2") + .set("time_column") + .to("14:30:00") + .set("timestamp_column") + .to(Value.timestamp(Timestamp.parseTimestamp("2024-01-01T12:34:56Z"))) + .set("tinyint_column") + .to(2) + .set("year_column") + .to("2024") + .build(); + spannerResourceManager.write(m); + m = Mutation.delete("AllDatatypeTransformation", Key.of("example2")); + spannerResourceManager.write(m); + m = + Mutation.newInsertBuilder("AllDatatypeTransformation") + .set("varchar_column") + .to("example1") + .set("bigint_column") + .to(1000) + .set("binary_column") + .to(Value.bytes(ByteArray.copyFrom("examplebinary1"))) + .set("bit_column") + .to(Value.bytes(ByteArray.copyFrom("1"))) + .set("blob_column") + .to(Value.bytes(ByteArray.copyFrom("exampleblob1"))) + .set("bool_column") + .to(Value.bool(Boolean.TRUE)) + .set("date_column") + .to(Value.date(Date.fromYearMonthDay(2024, 01, 01))) + .set("datetime_column") + .to(Timestamp.parseTimestamp("2024-01-01T12:34:56Z")) + .set("decimal_column") + .to(new BigDecimal("99999.99")) + .set("double_column") + .to(123456.123) + .set("enum_column") + .to("1") + .set("float_column") + .to(12345.67) + .set("int_column") + .to(100) + .set("text_column") + .to("Sample text for entry 1") + 
.set("time_column") + .to("14:30:00") + .set("timestamp_column") + .to(Timestamp.parseTimestamp("2024-01-01T12:34:56Z")) + .set("tinyint_column") + .to(1) + .set("year_column") + .to("2024") + .build(); + spannerResourceManager.write(m); + m = + Mutation.newInsertBuilder("AllDatatypeTransformation") + .set("varchar_column") + .to("example") + .set("bigint_column") + .to(12345) + .set("binary_column") + .to(Value.bytes(ByteArray.copyFrom("Some binary data"))) + .set("bit_column") + .to(Value.bytes(ByteArray.copyFrom("1"))) + .set("blob_column") + .to(Value.bytes(ByteArray.copyFrom("Some blob data"))) + .set("bool_column") + .to(Value.bool(Boolean.TRUE)) + .set("date_column") + .to(Value.date(Date.fromYearMonthDay(2024, 01, 01))) + .set("datetime_column") + .to(Value.timestamp(Timestamp.parseTimestamp("2024-01-01T12:34:56Z"))) + .set("decimal_column") + .to(new BigDecimal("12345.67")) + .set("double_column") + .to(123.456) + .set("enum_column") + .to("1") + .set("float_column") + .to(123.45) + .set("int_column") + .to(123) + .set("text_column") + .to("Sample text") + .set("time_column") + .to("14:30:00") + .set("timestamp_column") + .to(Value.timestamp(Timestamp.parseTimestamp("2024-01-01T12:34:56Z"))) + .set("tinyint_column") + .to(1) + .set("year_column") + .to("2024") + .build(); + spannerResourceManager.write(m); + } + + private void assertRowInMySQL() { + PipelineOperator.Result result = + pipelineOperator() + .waitForCondition( + createConfig(jobInfo, Duration.ofMinutes(10)), + () -> jdbcResourceManager.getRowCount(TABLE) == 1); + assertThatResult(result).meetsConditions(); + + result = + pipelineOperator() + .waitForCondition( + createConfig(jobInfo, Duration.ofMinutes(10)), + () -> jdbcResourceManager.getRowCount(TABLE2) == 2); + assertThatResult(result).meetsConditions(); + + List> rows = jdbcResourceManager.readTable(TABLE); + assertThat(rows).hasSize(1); + assertThat(rows.get(0).get("id")).isEqualTo(1); + 
assertThat(rows.get(0).get("first_name")).isEqualTo("AA"); + assertThat(rows.get(0).get("last_name")).isEqualTo("BB"); + + rows = + jdbcResourceManager.runSQLQuery( + String.format("select * from %s order by %s", TABLE2, "varchar_column")); + assertThat(rows).hasSize(2); + assertThat(rows.get(1).get("varchar_column")).isEqualTo("example2"); + assertThat(rows.get(1).get("bigint_column")).isEqualTo(1000); + assertThat(rows.get(1).get("binary_column")) + .isEqualTo("bin_column".getBytes(StandardCharsets.UTF_8)); + assertThat(rows.get(1).get("bit_column")).isEqualTo("1".getBytes(StandardCharsets.UTF_8)); + assertThat(rows.get(1).get("blob_column")) + .isEqualTo("blob_column".getBytes(StandardCharsets.UTF_8)); + assertThat(rows.get(1).get("bool_column")).isEqualTo(true); + assertThat(rows.get(1).get("date_column")).isEqualTo(java.sql.Date.valueOf("2024-01-01")); + assertThat(rows.get(1).get("datetime_column")) + .isEqualTo(java.time.LocalDateTime.of(2024, 1, 1, 12, 34, 56)); + assertThat(rows.get(1).get("decimal_column")).isEqualTo(new BigDecimal("99999.99")); + assertThat(rows.get(1).get("double_column")).isEqualTo(123456.123); + assertThat(rows.get(1).get("enum_column")).isEqualTo("1"); + assertThat(rows.get(1).get("float_column")).isEqualTo(12345.67f); + assertThat(rows.get(1).get("int_column")).isEqualTo(100); + assertThat(rows.get(1).get("text_column")).isEqualTo("Sample text for entry 2"); + assertThat(rows.get(1).get("time_column")).isEqualTo(java.sql.Time.valueOf("14:30:00")); + assertThat(rows.get(1).get("timestamp_column")) + .isEqualTo(java.sql.Timestamp.valueOf("2024-01-01 12:34:56.0")); + assertThat(rows.get(1).get("tinyint_column")).isEqualTo(2); + assertThat(rows.get(1).get("year_column")).isEqualTo(java.sql.Date.valueOf("2024-01-01")); + + assertThat(rows.get(0).get("varchar_column")).isEqualTo("example"); + assertThat(rows.get(0).get("bigint_column")).isEqualTo(12346); + assertThat(rows.get(0).get("binary_column")) + 
.isEqualTo("binary_column_appended".getBytes(StandardCharsets.UTF_8)); + assertThat(rows.get(0).get("bit_column")).isEqualTo("5".getBytes(StandardCharsets.UTF_8)); + assertThat(rows.get(0).get("blob_column")) + .isEqualTo("blob_column_appended".getBytes(StandardCharsets.UTF_8)); + assertThat(rows.get(0).get("bool_column")).isEqualTo(false); + assertThat(rows.get(0).get("date_column")).isEqualTo(java.sql.Date.valueOf("2024-01-02")); + assertThat(rows.get(0).get("datetime_column")) + .isEqualTo(java.time.LocalDateTime.of(2024, 1, 1, 12, 34, 55)); + assertThat(rows.get(0).get("decimal_column")).isEqualTo(new BigDecimal("12344.67")); + assertThat(rows.get(0).get("double_column")).isEqualTo(124.456); + assertThat(rows.get(0).get("enum_column")).isEqualTo("3"); + assertThat(rows.get(0).get("float_column")).isEqualTo(124.45f); + assertThat(rows.get(0).get("int_column")).isEqualTo(124); + assertThat(rows.get(0).get("text_column")).isEqualTo("Sample text append"); + assertThat(rows.get(0).get("time_column")).isEqualTo(java.sql.Time.valueOf("14:40:00")); + assertThat(rows.get(0).get("timestamp_column")) + .isEqualTo(java.sql.Timestamp.valueOf("2024-01-01 12:34:55.0")); + assertThat(rows.get(0).get("tinyint_column")).isEqualTo(2); + assertThat(rows.get(0).get("year_column")).isEqualTo(java.sql.Date.valueOf("2025-01-01")); + + rows = + jdbcResourceManager.runSQLQuery( + String.format( + "select * from %s where %s like '%s'", TABLE2, "varchar_column", "example1")); + assertThat(rows).hasSize(0); + } +} diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbDatatypeIT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbDatatypeIT.java index 710526fb5a..dbd023cdef 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbDatatypeIT.java +++ 
b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbDatatypeIT.java @@ -117,6 +117,7 @@ public void setUp() throws IOException { null, null, null, + null, null); } } diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbIT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbIT.java index 0a97e21d00..7c3ad39760 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbIT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbIT.java @@ -113,6 +113,7 @@ public void setUp() throws IOException { null, null, null, + null, null); } } diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbITBase.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbITBase.java index 781b4c4a2e..64d15895cd 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbITBase.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbITBase.java @@ -18,6 +18,7 @@ import static org.apache.beam.it.truthmatchers.PipelineAsserts.assertThatPipeline; import com.google.cloud.teleport.v2.spanner.migrations.shard.Shard; +import com.google.cloud.teleport.v2.spanner.migrations.transformation.CustomTransformation; import com.google.common.io.Resources; import com.google.gson.Gson; import com.google.gson.JsonArray; @@ -123,7 +124,8 @@ public PipelineLauncher.LaunchInfo launchDataflowJob( String identifierSuffix, String shardingCustomJarPath, String shardingCustomClassName, - String sourceDbTimezoneOffset) + String sourceDbTimezoneOffset, + CustomTransformation customTransformation) throws IOException { // default parameters @@ -159,6 +161,12 @@ public PipelineLauncher.LaunchInfo 
launchDataflowJob( params.put("sourceDbTimezoneOffset", sourceDbTimezoneOffset); } + if (customTransformation != null) { + params.put( + "transformationJarPath", getGcsPath(customTransformation.jarPath(), gcsResourceManager)); + params.put("transformationClassName", customTransformation.classPath()); + } + // Construct template String jobName = PipelineUtils.createJobName("rrev-it" + testName); // /-DunifiedWorker=true when using runner v2 diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbInterleaveMultiShardIT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbInterleaveMultiShardIT.java index 25a1b1b991..1f5acdc952 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbInterleaveMultiShardIT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbInterleaveMultiShardIT.java @@ -123,6 +123,7 @@ public void setUp() throws IOException { null, null, null, + null, null); } } diff --git a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbTimezoneIT.java b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbTimezoneIT.java index c8c3ce5945..1ab2d78b49 100644 --- a/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbTimezoneIT.java +++ b/v2/spanner-to-sourcedb/src/test/java/com/google/cloud/teleport/v2/templates/SpannerToSourceDbTimezoneIT.java @@ -112,7 +112,8 @@ public void setUp() throws IOException { null, null, null, - "+10:00"); + "+10:00", + null); } } } diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/mysql-schema.sql b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/mysql-schema.sql new file mode 100644 index 0000000000..6e68b9af51 --- /dev/null +++ 
b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/mysql-schema.sql @@ -0,0 +1,27 @@ +CREATE TABLE Users1 ( + id INT NOT NULL, + first_name VARCHAR(25), + last_name VARCHAR(25), + PRIMARY KEY(id)); + +CREATE TABLE AllDatatypeTransformation ( + varchar_column VARCHAR(20) NOT NULL, + tinyint_column TINYINT, + text_column TEXT, + date_column DATE, + int_column INT, + bigint_column BIGINT, + float_column FLOAT(10,2), + double_column DOUBLE, + decimal_column DECIMAL(10,2), + datetime_column DATETIME, + timestamp_column TIMESTAMP, + time_column TIME, + year_column YEAR, + blob_column BLOB, + enum_column ENUM('1','2','3'), + bool_column TINYINT(1), + binary_column VARBINARY(150), + bit_column BIT(8), + PRIMARY KEY (varchar_column) +); \ No newline at end of file diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/session.json b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/session.json new file mode 100644 index 0000000000..e1a764ec4b --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/session.json @@ -0,0 +1,885 @@ +{ + "SessionName": "NewSession", + "EditorName": "", + "DatabaseType": "mysql", + "DatabaseName": "rr_write", + "Dialect": "google_standard_sql", + "Notes": null, + "Tags": null, + "SpSchema": { + "t113": { + "Name": "AllDatatypeTransformation", + "ColIds": [ + "c115", + "c116", + "c117", + "c118", + "c119", + "c120", + "c121", + "c122", + "c123", + "c124", + "c125", + "c126", + "c127", + "c128", + "c129", + "c130", + "c131", + "c132" + ], + "ShardIdColumn": "", + "ColDefs": { + "c115": { + "Name": "varchar_column", + "T": { + "Name": "STRING", + "Len": 20, + "IsArray": false + }, + "NotNull": true, + "Comment": "From: varchar_column varchar(20)", + "Id": "c115", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c116": { + "Name": "tinyint_column", + "T": { + "Name": "INT64", + "Len": 
0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: tinyint_column tinyint(3)", + "Id": "c116", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c117": { + "Name": "text_column", + "T": { + "Name": "STRING", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: text_column text(65535)", + "Id": "c117", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c118": { + "Name": "date_column", + "T": { + "Name": "DATE", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: date_column date", + "Id": "c118", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c119": { + "Name": "int_column", + "T": { + "Name": "INT64", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: int_column int(10)", + "Id": "c119", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c120": { + "Name": "bigint_column", + "T": { + "Name": "INT64", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: bigint_column bigint(19)", + "Id": "c120", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c121": { + "Name": "float_column", + "T": { + "Name": "FLOAT64", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: float_column float(10,2)", + "Id": "c121", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c122": { + "Name": "double_column", + "T": { + "Name": "FLOAT64", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: double_column double(22)", + "Id": "c122", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c123": { + "Name": "decimal_column", + "T": { + "Name": "NUMERIC", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: decimal_column decimal(10,2)", + "Id": "c123", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c124": { + "Name": "datetime_column", + "T": { + "Name": "TIMESTAMP", + "Len": 0, + 
"IsArray": false + }, + "NotNull": false, + "Comment": "From: datetime_column datetime", + "Id": "c124", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c125": { + "Name": "timestamp_column", + "T": { + "Name": "TIMESTAMP", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: timestamp_column timestamp", + "Id": "c125", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c126": { + "Name": "time_column", + "T": { + "Name": "STRING", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: time_column time", + "Id": "c126", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c127": { + "Name": "year_column", + "T": { + "Name": "STRING", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: year_column year", + "Id": "c127", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c128": { + "Name": "blob_column", + "T": { + "Name": "BYTES", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: blob_column blob(65535)", + "Id": "c128", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c129": { + "Name": "enum_column", + "T": { + "Name": "STRING", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: enum_column enum(1)", + "Id": "c129", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c130": { + "Name": "bool_column", + "T": { + "Name": "BOOL", + "Len": 0, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: bool_column tinyint(1)", + "Id": "c130", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c131": { + "Name": "binary_column", + "T": { + "Name": "BYTES", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: binary_column binary(20)", + "Id": "c131", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c132": { + "Name": "bit_column", + 
"T": { + "Name": "BYTES", + "Len": 9223372036854775807, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: bit_column bit(7)", + "Id": "c132", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + } + }, + "PrimaryKeys": [ + { + "ColId": "c115", + "Desc": false, + "Order": 1 + } + ], + "ForeignKeys": null, + "Indexes": null, + "ParentId": "", + "Comment": "Spanner schema for source table AllDatatypeTransformation", + "Id": "t113" + }, + "t114": { + "Name": "Users1", + "ColIds": [ + "c133", + "c134" + ], + "ShardIdColumn": "", + "ColDefs": { + "c133": { + "Name": "id", + "T": { + "Name": "INT64", + "Len": 0, + "IsArray": false + }, + "NotNull": true, + "Comment": "From: id int(10)", + "Id": "c133", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + }, + "c134": { + "Name": "name", + "T": { + "Name": "STRING", + "Len": 25, + "IsArray": false + }, + "NotNull": false, + "Comment": "From: name varchar(25)", + "Id": "c134", + "AutoGen": { + "Name": "", + "GenerationType": "" + } + } + }, + "PrimaryKeys": [ + { + "ColId": "c133", + "Desc": false, + "Order": 1 + } + ], + "ForeignKeys": null, + "Indexes": null, + "ParentId": "", + "Comment": "Spanner schema for source table Users", + "Id": "t114" + } + }, + "SyntheticPKeys": {}, + "SrcSchema": { + "t113": { + "Name": "AllDatatypeTransformation", + "Schema": "rr_write", + "ColIds": [ + "c115", + "c116", + "c117", + "c118", + "c119", + "c120", + "c121", + "c122", + "c123", + "c124", + "c125", + "c126", + "c127", + "c128", + "c129", + "c130", + "c131", + "c132" + ], + "ColDefs": { + "c115": { + "Name": "varchar_column", + "Type": { + "Name": "varchar", + "Mods": [ + 20 + ], + "ArrayBounds": null + }, + "NotNull": true, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c115" + }, + "c116": { + "Name": "tinyint_column", + "Type": { + "Name": "tinyint", + "Mods": [ + 3 + ], + "ArrayBounds": null + }, 
+ "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c116" + }, + "c117": { + "Name": "text_column", + "Type": { + "Name": "text", + "Mods": [ + 65535 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c117" + }, + "c118": { + "Name": "date_column", + "Type": { + "Name": "date", + "Mods": null, + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c118" + }, + "c119": { + "Name": "int_column", + "Type": { + "Name": "int", + "Mods": [ + 10 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c119" + }, + "c120": { + "Name": "bigint_column", + "Type": { + "Name": "bigint", + "Mods": [ + 19 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c120" + }, + "c121": { + "Name": "float_column", + "Type": { + "Name": "float", + "Mods": [ + 10, + 2 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c121" + }, + "c122": { + "Name": "double_column", + "Type": { + "Name": "double", + "Mods": [ + 22 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": 
false + }, + "Id": "c122" + }, + "c123": { + "Name": "decimal_column", + "Type": { + "Name": "decimal", + "Mods": [ + 10, + 2 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c123" + }, + "c124": { + "Name": "datetime_column", + "Type": { + "Name": "datetime", + "Mods": null, + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c124" + }, + "c125": { + "Name": "timestamp_column", + "Type": { + "Name": "timestamp", + "Mods": null, + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c125" + }, + "c126": { + "Name": "time_column", + "Type": { + "Name": "time", + "Mods": null, + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c126" + }, + "c127": { + "Name": "year_column", + "Type": { + "Name": "year", + "Mods": null, + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c127" + }, + "c128": { + "Name": "blob_column", + "Type": { + "Name": "blob", + "Mods": [ + 65535 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c128" + }, + "c129": { + "Name": "enum_column", + "Type": { + "Name": "enum", + "Mods": [ + 1 + ], + "ArrayBounds": null + }, + "NotNull": 
false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c129" + }, + "c130": { + "Name": "bool_column", + "Type": { + "Name": "tinyint", + "Mods": [ + 1 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c130" + }, + "c131": { + "Name": "binary_column", + "Type": { + "Name": "binary", + "Mods": [ + 150 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c131" + }, + "c132": { + "Name": "bit_column", + "Type": { + "Name": "bit", + "Mods": [ + 20 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c132" + } + }, + "PrimaryKeys": [ + { + "ColId": "c115", + "Desc": false, + "Order": 1 + } + ], + "ForeignKeys": null, + "Indexes": null, + "Id": "t113" + }, + "t114": { + "Name": "Users1", + "Schema": "rr_write", + "ColIds": [ + "c133", + "c134", + "c135" + ], + "ColDefs": { + "c133": { + "Name": "id", + "Type": { + "Name": "int", + "Mods": [ + 10 + ], + "ArrayBounds": null + }, + "NotNull": true, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c133" + }, + "c134": { + "Name": "first_name", + "Type": { + "Name": "varchar", + "Mods": [ + 25 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c134" + }, + "c135": { + "Name": "last_name", + 
"Type": { + "Name": "varchar", + "Mods": [ + 25 + ], + "ArrayBounds": null + }, + "NotNull": false, + "Ignored": { + "Check": false, + "Identity": false, + "Default": false, + "Exclusion": false, + "ForeignKey": false, + "AutoIncrement": false + }, + "Id": "c135" + } + }, + "PrimaryKeys": [ + { + "ColId": "c133", + "Desc": false, + "Order": 1 + } + ], + "ForeignKeys": null, + "Indexes": null, + "Id": "t114" + } + }, + "SchemaIssues": { + "t113": { + "ColumnLevelIssues": { + "c116": [ + 14 + ], + "c119": [ + 14 + ], + "c121": [ + 14 + ], + "c124": [ + 13 + ], + "c126": [ + 15 + ], + "c127": [ + 15 + ] + }, + "TableLevelIssues": null + }, + "t114": { + "ColumnLevelIssues": { + "c133": [ + 14 + ] + }, + "TableLevelIssues": null + } + }, + "Location": {}, + "TimezoneOffset": "+00:00", + "SpDialect": "google_standard_sql", + "UniquePKey": {}, + "Rules": [], + "IsSharded": false, + "SpRegion": "", + "ResourceValidation": false, + "UI": false +} \ No newline at end of file diff --git a/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/spanner-schema.sql b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/spanner-schema.sql new file mode 100644 index 0000000000..a5a1f125ff --- /dev/null +++ b/v2/spanner-to-sourcedb/src/test/resources/SpannerToSourceDbCustomTransformationIT/spanner-schema.sql @@ -0,0 +1,31 @@ +CREATE TABLE IF NOT EXISTS Users1 ( + id INT64 NOT NULL, + name STRING(25), +) PRIMARY KEY(id); + +CREATE TABLE AllDatatypeTransformation ( + varchar_column STRING(20) NOT NULL, + tinyint_column INT64, + text_column STRING(MAX), + date_column DATE, + int_column INT64, + bigint_column INT64, + float_column FLOAT64, + double_column FLOAT64, + decimal_column NUMERIC, + datetime_column TIMESTAMP, + timestamp_column TIMESTAMP, + time_column STRING(MAX), + year_column STRING(MAX), + blob_column BYTES(MAX), + enum_column STRING(MAX), + bool_column BOOL, + binary_column BYTES(MAX), + bit_column BYTES(MAX), +) 
PRIMARY KEY (varchar_column); + +CREATE CHANGE STREAM allstream + FOR ALL OPTIONS ( + value_capture_type = 'NEW_ROW', + retention_period = '7d' +); \ No newline at end of file diff --git a/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub.md b/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub.md index 0e39a3bd2e..63ce6272a0 100644 --- a/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub.md +++ b/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub.md @@ -18,20 +18,20 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : The JDBC driver class name. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma-separated Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : The query to run on the source to extract the data. (Example: select * from sampledb.sample_table). -* **outputTopic** : The Pub/Sub topic to publish to, in the format projects//topics/. (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: The JDBC driver class name. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: The JDBC connection URL string. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example: 'echo -n "jdbc:mysql://some-host:3306/sampledb" | gcloud kms encrypt --location= --keyring= --key= --plaintext-file=- --ciphertext-file=- | base64' For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. 
For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: The query to run on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The Pub/Sub topic to publish to. For example, `projects//topics/`. ### Optional parameters -* **username** : The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **password** : The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | glcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. 
(Example: gs:///file.txt,projects//secrets//versions/). +* **username**: The username to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_username' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **password**: The password to use for the JDBC connection. You can pass in this value as a string that's encrypted with a Cloud KMS key and then Base64-encoded. For example, `echo -n 'some_password' | gcloud kms encrypt --location=my_location --keyring=mykeyring --key=mykey --plaintext-file=- --ciphertext-file=- | base64`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: The Cloud KMS Encryption Key to use to decrypt the username, password, and connection string. If a Cloud KMS key is passed in, the username, password, and connection string must all be passed in encrypted and base64 encoded. For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **disabledAlgorithms**: Comma-separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma-separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. 
@@ -223,17 +223,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub" { name = "jdbc-to-pubsub" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # connectionProperties = "" + # KMSEncryptionKey = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" } } ``` diff --git a/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub_Auto.md b/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub_Auto.md index 458d522dc3..8441e93d01 100644 --- a/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub_Auto.md +++ b/v2/sqlserver-to-googlecloud/README_Jdbc_to_PubSub_Auto.md @@ -15,23 +15,23 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : Url connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. (Example: jdbc:mysql://some-host:3306/sampledb). -* **driverJars** : Comma separate Cloud Storage paths for JDBC drivers. (Example: gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar). -* **query** : Query to be executed on the source to extract the data. (Example: select * from sampledb.sample_table). 
-* **outputTopic** : The name of the topic to which data should published, in the format of 'projects/your-project-id/topics/your-topic-name' (Example: projects/your-project-id/topics/your-topic-name). +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: URL connection string to connect to the JDBC source. Connection string can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **driverJars**: Comma-separated Cloud Storage paths for JDBC drivers. For example, `gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar`. +* **query**: Query to be executed on the source to extract the data. For example, `select * from sampledb.sample_table`. +* **outputTopic**: The name of the topic to publish data to. For example, `projects//topics/`. ### Optional parameters -* **username** : User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **password** : Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **KMSEncryptionKey** : If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). 
-* **partitionColumn** : If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. -* **table** : Table to read from using partitions. This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. -* **lowerBound** : Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). -* **upperBound** : Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **username**: User name to be used for the JDBC connection. User name can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **password**: Password to be used for the JDBC connection. Password can be passed in as plaintext or as a base64 encoded string encrypted by Google Cloud KMS. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **KMSEncryptionKey**: If this parameter is provided, password, user name and connection string should all be passed in encrypted. Encrypt parameters using the KMS API encrypt endpoint. See: https://cloud.google.com/kms/docs/reference/rest/v1/projects.locations.keyRings.cryptoKeys/encrypt For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. 
+* **partitionColumn**: If this parameter is provided (along with `table`), JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only Long partition columns are supported. +* **table**: Table to read from using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. This, along with the lower and upper bound, form partitions strides for generated WHERE clause expressions used to split the partition column evenly. When the input is less than 1, the number is set to 1. +* **lowerBound**: Lower bound used in the partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). +* **upperBound**: Upper bound used in partition scheme. If not provided, it is automatically inferred by Beam (for the supported types). @@ -232,17 +232,17 @@ resource "google_dataflow_flex_template_job" "jdbc_to_pubsub_auto" { name = "jdbc-to-pubsub-auto" region = var.region parameters = { - driverClassName = "com.mysql.jdbc.Driver" - connectionUrl = "jdbc:mysql://some-host:3306/sampledb" - driverJars = "gs://your-bucket/driver_jar1.jar,gs://your-bucket/driver_jar2.jar" - query = "select * from sampledb.sample_table" - outputTopic = "projects/your-project-id/topics/your-topic-name" + driverClassName = "" + connectionUrl = "" + driverJars = "" + query = "" + outputTopic = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # connectionProperties = "" + # KMSEncryptionKey = "" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "

" # numPartitions = "" # lowerBound = "" # upperBound = "" diff --git a/v2/sqlserver-to-googlecloud/README_SQLServer_to_BigQuery.md b/v2/sqlserver-to-googlecloud/README_SQLServer_to_BigQuery.md index 6278b5f3b3..ffdd788896 100644 --- a/v2/sqlserver-to-googlecloud/README_SQLServer_to_BigQuery.md +++ b/v2/sqlserver-to-googlecloud/README_SQLServer_to_BigQuery.md @@ -23,34 +23,32 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **connectionURL** : The JDBC connection URL string. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. (Example: jdbc:sqlserver://localhost;databaseName=sampledb). -* **outputTable** : The BigQuery output table location. (Example: :.). -* **bigQueryLoadingTemporaryDirectory** : The temporary directory for the BigQuery loading process. (Example: gs://your-bucket/your-files/temp_dir). +* **connectionURL**: The JDBC connection URL string. Can be passed in as a string that's Base64-encoded and then encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. For example, `jdbc:sqlserver://localhost;databaseName=sampledb`. +* **outputTable**: The BigQuery output table location. For example, `:.`. +* **bigQueryLoadingTemporaryDirectory**: The temporary directory for the BigQuery loading process. For example, `gs://your-bucket/your-files/temp_dir`. ### Optional parameters -* **connectionProperties** : The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`.For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. (Example: unicode=true;characterEncoding=UTF-8). 
-* **username** : The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **password** : The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. -* **query** : The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are: -DATETIME --> TIMESTAMP - -Type casting may be required if your schemas do not match. This parameter can be set to a gs:// path pointing to a file in Cloud Storage to load the query from. The file encoding should be UTF-8. (Example: select * from sampledb.sample_table). -* **KMSEncryptionKey** : The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string. (Example: projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key). -* **useColumnAlias** : If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. -* **isTruncate** : If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. -* **partitionColumn** : If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. -* **table** : The table to read from when using partitions. 
This parameter also accepts a subquery in parentheses. (Example: (select id, name from Person) as subq). -* **numPartitions** : The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. -* **lowerBound** : The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **upperBound** : The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. -* **fetchSize** : The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. -* **createDisposition** : The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. -* **bigQuerySchemaPath** : The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to CREATE_IF_NEEDED, this parameter must be specified. (Example: gs://your-bucket/your-schema.json). -* **disabledAlgorithms** : Comma separated algorithms to disable. If this value is set to none, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. (Example: SSLv3, RC4). -* **extraFilesToStage** : Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. (Example: gs:///file.txt,projects//secrets//versions/). -* **useStorageWriteApi** : If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. 
For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). -* **useStorageWriteApiAtLeastOnce** : When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`. +* **connectionProperties**: The properties string to use for the JDBC connection. The format of the string must be `[propertyName=property;]*`. For more information, see Configuration Properties (https://dev.mysql.com/doc/connector-j/en/connector-j-reference-configuration-properties.html) in the MySQL documentation. For example, `unicode=true;characterEncoding=UTF-8`. +* **username**: The username to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **password**: The password to use for the JDBC connection. Can be passed in as a string that's encrypted with a Cloud KMS key, or can be a Secret Manager secret in the form projects/{project}/secrets/{secret}/versions/{secret_version}. +* **query**: The query to run on the source to extract the data. Note that some JDBC SQL and BigQuery types, although sharing the same name, have some differences. Some important SQL -> BigQuery type mappings to keep in mind are `DATETIME --> TIMESTAMP`. Type casting may be required if your schemas do not match. For example, `select * from sampledb.sample_table`. +* **KMSEncryptionKey**: The Cloud KMS encryption key to use to decrypt the username, password, and connection string. If you pass in a Cloud KMS key, you must also encrypt the username, password, and connection string.
For example, `projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key`. +* **useColumnAlias**: If set to `true`, the pipeline uses the column alias (`AS`) instead of the column name to map the rows to BigQuery. Defaults to `false`. +* **isTruncate**: If set to `true`, the pipeline truncates before loading data into BigQuery. Defaults to `false`, which causes the pipeline to append data. +* **partitionColumn**: If this parameter is provided with the name of the `table` defined as an optional parameter, JdbcIO reads the table in parallel by executing multiple instances of the query on the same table (subquery) using ranges. Currently, only supports `Long` partition columns. +* **table**: The table to read from when using partitions. This parameter also accepts a subquery in parentheses. For example, `(select id, name from Person) as subq`. +* **numPartitions**: The number of partitions. With the lower and upper bound, this value forms partition strides for generated `WHERE` clause expressions that are used to split the partition column evenly. When the input is less than `1`, the number is set to `1`. +* **lowerBound**: The lower bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **upperBound**: The upper bound to use in the partition scheme. If not provided, this value is automatically inferred by Apache Beam for the supported types. +* **fetchSize**: The number of rows to be fetched from database at a time. Not used for partitioned reads. Defaults to: 50000. +* **createDisposition**: The BigQuery CreateDisposition to use. For example, `CREATE_IF_NEEDED` or `CREATE_NEVER`. Defaults to: CREATE_NEVER. +* **bigQuerySchemaPath**: The Cloud Storage path for the BigQuery JSON schema. If `createDisposition` is set to `CREATE_IF_NEEDED`, this parameter must be specified. For example, `gs://your-bucket/your-schema.json`. 
+* **outputDeadletterTable**: The BigQuery table to use for messages that failed to reach the output table, formatted as `"PROJECT_ID:DATASET_NAME.TABLE_NAME"`. If the table doesn't exist, it is created when the pipeline runs. If this parameter is not specified, the pipeline will fail on write errors. This parameter can only be specified if `useStorageWriteApi` or `useStorageWriteApiAtLeastOnce` is set to true. +* **disabledAlgorithms**: Comma separated algorithms to disable. If this value is set to `none`, no algorithm is disabled. Use this parameter with caution, because the algorithms disabled by default might have vulnerabilities or performance issues. For example, `SSLv3, RC4`. +* **extraFilesToStage**: Comma separated Cloud Storage paths or Secret Manager secrets for files to stage in the worker. These files are saved in the /extra_files directory in each worker. For example, `gs:///file.txt,projects//secrets//versions/`. +* **useStorageWriteApi**: If `true`, the pipeline uses the BigQuery Storage Write API (https://cloud.google.com/bigquery/docs/write-api). The default value is `false`. For more information, see Using the Storage Write API (https://beam.apache.org/documentation/io/built-in/google-bigquery/#storage-write-api). +* **useStorageWriteApiAtLeastOnce**: When using the Storage Write API, specifies the write semantics. To use at-least-once semantics (https://beam.apache.org/documentation/io/built-in/google-bigquery/#at-least-once-semantics), set this parameter to `true`. To use exactly-once semantics, set the parameter to `false`. This parameter applies only when `useStorageWriteApi` is `true`. The default value is `false`.
@@ -149,6 +147,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -176,6 +175,7 @@ gcloud dataflow flex-template run "sqlserver-to-bigquery-job" \ --parameters "fetchSize=$FETCH_SIZE" \ --parameters "createDisposition=$CREATE_DISPOSITION" \ --parameters "bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH" \ + --parameters "outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE" \ --parameters "disabledAlgorithms=$DISABLED_ALGORITHMS" \ --parameters "extraFilesToStage=$EXTRA_FILES_TO_STAGE" \ --parameters "useStorageWriteApi=$USE_STORAGE_WRITE_API" \ @@ -218,6 +218,7 @@ export UPPER_BOUND= export FETCH_SIZE=50000 export CREATE_DISPOSITION=CREATE_NEVER export BIG_QUERY_SCHEMA_PATH= +export OUTPUT_DEADLETTER_TABLE= export DISABLED_ALGORITHMS= export EXTRA_FILES_TO_STAGE= export USE_STORAGE_WRITE_API=false @@ -230,7 +231,7 @@ mvn clean package -PtemplatesRun \ -Dregion="$REGION" \ -DjobName="sqlserver-to-bigquery-job" \ -DtemplateName="SQLServer_to_BigQuery" \ --Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ 
+-Dparameters="connectionURL=$CONNECTION_URL,connectionProperties=$CONNECTION_PROPERTIES,username=$USERNAME,password=$PASSWORD,query=$QUERY,outputTable=$OUTPUT_TABLE,bigQueryLoadingTemporaryDirectory=$BIG_QUERY_LOADING_TEMPORARY_DIRECTORY,KMSEncryptionKey=$KMSENCRYPTION_KEY,useColumnAlias=$USE_COLUMN_ALIAS,isTruncate=$IS_TRUNCATE,partitionColumn=$PARTITION_COLUMN,table=$TABLE,numPartitions=$NUM_PARTITIONS,lowerBound=$LOWER_BOUND,upperBound=$UPPER_BOUND,fetchSize=$FETCH_SIZE,createDisposition=$CREATE_DISPOSITION,bigQuerySchemaPath=$BIG_QUERY_SCHEMA_PATH,outputDeadletterTable=$OUTPUT_DEADLETTER_TABLE,disabledAlgorithms=$DISABLED_ALGORITHMS,extraFilesToStage=$EXTRA_FILES_TO_STAGE,useStorageWriteApi=$USE_STORAGE_WRITE_API,useStorageWriteApiAtLeastOnce=$USE_STORAGE_WRITE_API_AT_LEAST_ONCE" \ -f v2/sqlserver-to-googlecloud ``` @@ -275,26 +276,27 @@ resource "google_dataflow_flex_template_job" "sqlserver_to_bigquery" { name = "sqlserver-to-bigquery" region = var.region parameters = { - connectionURL = "jdbc:sqlserver://localhost;databaseName=sampledb" - outputTable = ":." - bigQueryLoadingTemporaryDirectory = "gs://your-bucket/your-files/temp_dir" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" + connectionURL = "" + outputTable = "" + bigQueryLoadingTemporaryDirectory = "" + # connectionProperties = "" # username = "" # password = "" - # query = "select * from sampledb.sample_table" - # KMSEncryptionKey = "projects/your-project/locations/global/keyRings/your-keyring/cryptoKeys/your-key" + # query = "" + # KMSEncryptionKey = "" # useColumnAlias = "false" # isTruncate = "false" # partitionColumn = "" - # table = "(select id, name from Person) as subq" + # table = "
" # numPartitions = "" # lowerBound = "" # upperBound = "" # fetchSize = "50000" # createDisposition = "CREATE_NEVER" - # bigQuerySchemaPath = "gs://your-bucket/your-schema.json" - # disabledAlgorithms = "SSLv3, RC4" - # extraFilesToStage = "gs:///file.txt,projects//secrets//versions/" + # bigQuerySchemaPath = "" + # outputDeadletterTable = "" + # disabledAlgorithms = "" + # extraFilesToStage = "" # useStorageWriteApi = "false" # useStorageWriteApiAtLeastOnce = "false" } diff --git a/v2/streaming-data-generator/README_Streaming_Data_Generator.md b/v2/streaming-data-generator/README_Streaming_Data_Generator.md index 0d4e40e71b..0ac35a9c05 100644 --- a/v2/streaming-data-generator/README_Streaming_Data_Generator.md +++ b/v2/streaming-data-generator/README_Streaming_Data_Generator.md @@ -17,40 +17,40 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **qps** : Indicates rate of messages per second to be published to Pub/Sub. +* **qps**: Indicates rate of messages per second to be published to Pub/Sub. ### Optional parameters -* **schemaTemplate** : Pre-existing schema template to use. The value must be one of: [GAME_EVENT]. -* **schemaLocation** : Cloud Storage path of schema location. (Example: gs:///prefix). -* **topic** : The name of the topic to which the pipeline should publish data. (Example: projects//topics/). -* **messagesLimit** : Indicates maximum number of output messages to be generated. 0 means unlimited. Defaults to: 0. -* **outputType** : The message Output type. Default is JSON. -* **avroSchemaLocation** : Cloud Storage path of Avro schema location. Mandatory when output type is AVRO or PARQUET. (Example: gs://your-bucket/your-path/schema.avsc). -* **sinkType** : The message Sink type. Default is PUBSUB. -* **outputTableSpec** : Output BigQuery table. Mandatory when sinkType is BIGQUERY (Example: :.). -* **writeDisposition** : BigQuery WriteDisposition. 
For example, WRITE_APPEND, WRITE_EMPTY or WRITE_TRUNCATE. Defaults to: WRITE_APPEND. -* **outputDeadletterTable** : Messages failed to reach the output table for all kind of reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. (Example: your-project-id:your-dataset.your-table-name). -* **windowDuration** : The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). (Example: 1m). Defaults to: 1m. -* **outputDirectory** : The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. (Example: gs://your-bucket/your-path/). -* **outputFilenamePrefix** : The prefix to place on each windowed file. (Example: output-). Defaults to: output-. -* **numShards** : The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Default value is decided by Dataflow. -* **driverClassName** : JDBC driver class name to use. (Example: com.mysql.jdbc.Driver). -* **connectionUrl** : Url connection string to connect to the JDBC source. (Example: jdbc:mysql://some-host:3306/sampledb). -* **username** : User name to be used for the JDBC connection. -* **password** : Password to be used for the JDBC connection. -* **connectionProperties** : Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. (Example: unicode=true;characterEncoding=UTF-8). -* **statement** : SQL statement which will be executed to write to the database. The statement must specify the column names of the table in any order. 
Only the values of the specified column names will be read from the json and added to the statement. (Example: INSERT INTO tableName (column1, column2) VALUES (?,?)). -* **projectId** : GCP Project Id of where the Spanner table lives. -* **spannerInstanceName** : Cloud Spanner instance name. -* **spannerDatabaseName** : Cloud Spanner database name. -* **spannerTableName** : Cloud Spanner table name. -* **maxNumMutations** : Specifies the cell mutation limit (maximum number of mutated cells per batch). Default value is 5000. -* **maxNumRows** : Specifies the row mutation limit (maximum number of mutated rows per batch). Default value is 1000. -* **batchSizeBytes** : Specifies the batch size limit (max number of bytes mutated per batch). Default value is 1MB. -* **commitDeadlineSeconds** : Specifies the deadline in seconds for the Commit API call. -* **bootstrapServer** : Kafka Bootstrap Server (Example: localhost:9092). -* **kafkaTopic** : Kafka topic to write to. (Example: topic). +* **schemaTemplate**: Pre-existing schema template to use. The value must be one of: [GAME_EVENT]. +* **schemaLocation**: Cloud Storage path of schema location. For example, `gs:///prefix`. +* **topic**: The name of the topic to which the pipeline should publish data. For example, `projects//topics/`. +* **messagesLimit**: Indicates maximum number of output messages to be generated. 0 means unlimited. Defaults to: 0. +* **outputType**: The message Output type. Default is JSON. +* **avroSchemaLocation**: Cloud Storage path of Avro schema location. Mandatory when output type is AVRO or PARQUET. For example, `gs://your-bucket/your-path/schema.avsc`. +* **sinkType**: The message Sink type. Default is PUBSUB. +* **outputTableSpec**: Output BigQuery table. Mandatory when sinkType is BIGQUERY. For example, `:.`. +* **writeDisposition**: BigQuery WriteDisposition. For example, WRITE_APPEND, WRITE_EMPTY or WRITE_TRUNCATE. Defaults to: WRITE_APPEND.
+* **outputDeadletterTable**: Messages that failed to reach the output table for any reason (e.g., mismatched schema, malformed JSON) are written to this table. If it doesn't exist, it will be created during pipeline execution. For example, `your-project-id:your-dataset.your-table-name`. +* **windowDuration**: The window duration/size in which data will be written to Cloud Storage. Allowed formats are: Ns (for seconds, example: 5s), Nm (for minutes, example: 12m), Nh (for hours, example: 2h). For example, `1m`. Defaults to: 1m. +* **outputDirectory**: The path and filename prefix for writing output files. Must end with a slash. DateTime formatting is used to parse directory path for date & time formatters. For example, `gs://your-bucket/your-path/`. +* **outputFilenamePrefix**: The prefix to place on each windowed file. For example, `output-`. Defaults to: output-. +* **numShards**: The maximum number of output shards produced when writing. A higher number of shards means higher throughput for writing to Cloud Storage, but potentially higher data aggregation cost across shards when processing output Cloud Storage files. Default value is decided by Dataflow. +* **driverClassName**: JDBC driver class name to use. For example, `com.mysql.jdbc.Driver`. +* **connectionUrl**: URL connection string to connect to the JDBC source. For example, `jdbc:mysql://some-host:3306/sampledb`. +* **username**: User name to be used for the JDBC connection. +* **password**: Password to be used for the JDBC connection. +* **connectionProperties**: Properties string to use for the JDBC connection. Format of the string must be [propertyName=property;]*. For example, `unicode=true;characterEncoding=UTF-8`. +* **statement**: SQL statement which will be executed to write to the database. The statement must specify the column names of the table in any order. Only the values of the specified column names will be read from the json and added to the statement.
For example, `INSERT INTO tableName (column1, column2) VALUES (?,?)`. +* **projectId**: GCP project ID where the Spanner table lives. +* **spannerInstanceName**: Cloud Spanner instance name. +* **spannerDatabaseName**: Cloud Spanner database name. +* **spannerTableName**: Cloud Spanner table name. +* **maxNumMutations**: Specifies the cell mutation limit (maximum number of mutated cells per batch). Default value is 5000. +* **maxNumRows**: Specifies the row mutation limit (maximum number of mutated rows per batch). Default value is 1000. +* **batchSizeBytes**: Specifies the batch size limit (max number of bytes mutated per batch). Default value is 1MB. +* **commitDeadlineSeconds**: Specifies the deadline in seconds for the Commit API call. +* **bootstrapServer**: Kafka Bootstrap Server. For example, `localhost:9092`. +* **kafkaTopic**: Kafka topic to write to. For example, `topic`. @@ -304,25 +304,25 @@ resource "google_dataflow_flex_template_job" "streaming_data_generator" { parameters = { qps = "" # schemaTemplate = "" - # schemaLocation = "gs:///prefix" - # topic = "projects//topics/" + # schemaLocation = "" + # topic = "" # messagesLimit = "0" # outputType = "JSON" - # avroSchemaLocation = "gs://your-bucket/your-path/schema.avsc" + # avroSchemaLocation = "" # sinkType = "PUBSUB" - # outputTableSpec = ":."
+ # outputTableSpec = "" # writeDisposition = "WRITE_APPEND" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" + # outputDeadletterTable = "" # windowDuration = "1m" - # outputDirectory = "gs://your-bucket/your-path/" + # outputDirectory = "" # outputFilenamePrefix = "output-" # numShards = "0" - # driverClassName = "com.mysql.jdbc.Driver" - # connectionUrl = "jdbc:mysql://some-host:3306/sampledb" + # driverClassName = "" + # connectionUrl = "" # username = "" # password = "" - # connectionProperties = "unicode=true;characterEncoding=UTF-8" - # statement = "INSERT INTO tableName (column1, column2) VALUES (?,?)" + # connectionProperties = "" + # statement = "" # projectId = "" # spannerInstanceName = "" # spannerDatabaseName = "" @@ -331,8 +331,8 @@ resource "google_dataflow_flex_template_job" "streaming_data_generator" { # maxNumRows = "" # batchSizeBytes = "" # commitDeadlineSeconds = "" - # bootstrapServer = "localhost:9092" - # kafkaTopic = "topic" + # bootstrapServer = "" + # kafkaTopic = "" } } ``` diff --git a/yaml/README_Kafka_to_BigQuery_Yaml.md b/yaml/README_Kafka_to_BigQuery_Yaml.md index c1f4fcec75..dc38e0752c 100644 --- a/yaml/README_Kafka_to_BigQuery_Yaml.md +++ b/yaml/README_Kafka_to_BigQuery_Yaml.md @@ -21,17 +21,17 @@ on [Metadata Annotations](https://github.com/GoogleCloudPlatform/DataflowTemplat ### Required parameters -* **outputTableSpec** : BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. +* **outputTableSpec**: BigQuery table location to write the output to. The name should be in the format `:.`. The table's schema must match input objects. ### Optional parameters -* **readBootstrapServers** : Kafka Bootstrap Server list, separated by commas. (Example: localhost:9092,127.0.0.1:9093). -* **kafkaReadTopics** : Kafka topic(s) to read input from. (Example: topic1,topic2). -* **outputDeadletterTable** : BigQuery table for failed messages. 
Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. (Example: your-project-id:your-dataset.your-table-name). -* **messageFormat** : The message format. Can be AVRO or JSON. Defaults to: JSON. -* **schema** : Kafka schema. A schema is required if data format is JSON, AVRO or PROTO. -* **numStorageWriteApiStreams** : Number of streams defines the parallelism of the BigQueryIO’s Write transform and roughly corresponds to the number of Storage Write API’s streams which will be used by the pipeline. See https://cloud.google.com/blog/products/data-analytics/streaming-data-into-bigquery-using-storage-write-api for the recommended values. Defaults to: 1. -* **storageWriteApiTriggeringFrequencySec** : Triggering frequency will determine how soon the data will be visible for querying in BigQuery. See https://cloud.google.com/blog/products/data-analytics/streaming-data-into-bigquery-using-storage-write-api for the recommended values. Defaults to: 1. +* **readBootstrapServers**: Kafka Bootstrap Server list, separated by commas. For example, `localhost:9092,127.0.0.1:9093`. +* **kafkaReadTopics**: Kafka topic(s) to read input from. For example, `topic1,topic2`. +* **outputDeadletterTable**: BigQuery table for failed messages. Messages failed to reach the output table for different reasons (e.g., mismatched schema, malformed json) are written to this table. If it doesn't exist, it will be created during pipeline execution. If not specified, "outputTableSpec_error_records" is used instead. For example, `your-project-id:your-dataset.your-table-name`. +* **messageFormat**: The message format. Can be AVRO or JSON. Defaults to: JSON. +* **schema**: Kafka schema. A schema is required if data format is JSON, AVRO or PROTO. 
+* **numStorageWriteApiStreams**: Number of streams defines the parallelism of the BigQueryIO’s Write transform and roughly corresponds to the number of Storage Write API’s streams which will be used by the pipeline. See https://cloud.google.com/blog/products/data-analytics/streaming-data-into-bigquery-using-storage-write-api for the recommended values. Defaults to: 1. +* **storageWriteApiTriggeringFrequencySec**: Triggering frequency will determine how soon the data will be visible for querying in BigQuery. See https://cloud.google.com/blog/products/data-analytics/streaming-data-into-bigquery-using-storage-write-api for the recommended values. Defaults to: 1. @@ -215,9 +215,9 @@ resource "google_dataflow_flex_template_job" "kafka_to_bigquery_yaml" { region = var.region parameters = { outputTableSpec = "" - # readBootstrapServers = "localhost:9092,127.0.0.1:9093" - # kafkaReadTopics = "topic1,topic2" - # outputDeadletterTable = "your-project-id:your-dataset.your-table-name" + # readBootstrapServers = "" + # kafkaReadTopics = "" + # outputDeadletterTable = "" # messageFormat = "JSON" # schema = "" # numStorageWriteApiStreams = "1"