Upgrade to Official Apache Spark SQL Connector for Google BigQuery #21

Draft: wants to merge 44 commits into base: develop

Changes shown from 42 of 44 commits.

Commits (44)
704dc0c - Upgrade to official Google Spark BigQuery Connector (martinstuder, Sep 30, 2024)
e310da3 - Upgrade to Google Spark BigQuery connector; support direct writes to … (martinstuder, Oct 1, 2024)
98ec1b8 - Add ci tests github workflow (martinstuder, Oct 7, 2024)
2dfc0c6 - Fix (martinstuder, Oct 7, 2024)
ed4ce2d - Fix (martinstuder, Oct 7, 2024)
59c1e4f - Fix (martinstuder, Oct 7, 2024)
00e69eb - Fix (martinstuder, Oct 7, 2024)
4330a1e - Attempt to use repository secrets (martinstuder, Oct 7, 2024)
91b8670 - Fix (martinstuder, Oct 7, 2024)
19c02c0 - Fix (martinstuder, Oct 7, 2024)
7e92863 - Spark 2.3 is not supported by sparklyr (martinstuder, Oct 7, 2024)
2a1d02e - Fix and update combinations to tests (martinstuder, Oct 7, 2024)
c451235 - install vcredist on Windows (martinstuder, Oct 7, 2024)
c6a4782 - Install winutils on Windows (martinstuder, Oct 21, 2024)
e8de38b - Fix (martinstuder, Oct 21, 2024)
3a7daf5 - Attempt new approach (martinstuder, Oct 21, 2024)
ca3f75e - Attempt new approach (martinstuder, Oct 21, 2024)
a1feb82 - Next attempt (martinstuder, Oct 21, 2024)
c10b2a1 - Fix according to ChatGPT (martinstuder, Oct 21, 2024)
1d7a9fb - debug (martinstuder, Oct 21, 2024)
934f240 - debug (martinstuder, Oct 21, 2024)
39ba465 - fix (martinstuder, Oct 21, 2024)
6b59102 - Use normalizePath for service account key file (martinstuder, Oct 21, 2024)
7defa63 - debug (martinstuder, Oct 21, 2024)
e18e4cd - debug (martinstuder, Oct 21, 2024)
74e5967 - Refactoring (martinstuder, Oct 21, 2024)
e4602bb - Fix (martinstuder, Oct 21, 2024)
a956544 - test (martinstuder, Oct 21, 2024)
655282f - test (martinstuder, Oct 21, 2024)
0322fb9 - test (martinstuder, Oct 21, 2024)
f220dce - debug (martinstuder, Oct 21, 2024)
cd8bc09 - test (martinstuder, Oct 28, 2024)
f512e1b - test (martinstuder, Oct 28, 2024)
4094dbd - test (martinstuder, Oct 28, 2024)
5bd49e9 - test (martinstuder, Oct 28, 2024)
e6615d8 - test (martinstuder, Oct 28, 2024)
5d5b18e - test (martinstuder, Oct 28, 2024)
68e177b - test (martinstuder, Oct 28, 2024)
37233f6 - Fix (martinstuder, Oct 28, 2024)
43baa4b - debug windows (martinstuder, Oct 28, 2024)
bd03037 - test (martinstuder, Oct 28, 2024)
d6e7bc7 - fix (martinstuder, Oct 28, 2024)
7a3e0bb - Update README.md (martinstuder, Oct 29, 2024)
3c3d1c9 - Add dependabot config (martinstuder, Oct 29, 2024)
5 changes: 4 additions & 1 deletion .Rbuildignore
@@ -10,4 +10,7 @@
^logs$
^test\.R$
^spark-warehouse$
cran-comments.md
cran-comments.md
^\.github$
^adc.json$
^spark_versions.json$
177 changes: 177 additions & 0 deletions .github/workflows/ci-tests.yml
spoltier marked this conversation as resolved.
@@ -0,0 +1,177 @@
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
#
# See https://github.com/r-lib/actions/tree/master/examples#readme for
# additional example workflows available for the R community.

name: ci-tests

on:
push:
branches: [ "feature/google_bigquery_connector" ]
pull_request:
branches: [ "feature/google_bigquery_connector" ]

jobs:
build:
runs-on: ${{ matrix.config.os-name }}-${{ matrix.config.os-version }}
name: >-
${{ matrix.config.os-name }}-${{ matrix.config.os-version }}
R ${{ matrix.config.r-version }} - Java ${{ matrix.config.java }}
Spark ${{ matrix.config.spark }}
strategy:
fail-fast: false
matrix:
config:
# - os-name: ubuntu
# os-version: latest
# java: 17
# spark: "3.5"
# r-version: release
# - os-name: ubuntu
# os-version: latest
# java: 8
# spark: "2.4"
# r-version: devel
# - os-name: ubuntu
# os-version: "22.04"
# java: 11
# spark: "3.0"
# r-version: oldrel
# - os-name: macos
# os-version: latest
# java: 8
# spark: "3.2"
# r-version: release
# - os-name: macos
# os-version: latest
# java: 17
# spark: "3.4"
# r-version: devel
# - os-name: windows
# os-version: latest
# java: 8
# spark: "3.1"
# r-version: oldrel
- os-name: windows
os-version: "2022"
java: 17
spark: "3.3"
r-version: release
env:
SPARK_VERSION: ${{ matrix.config.spark }}
BIGQUERY_PROJECT_ID: mirai-sbb
BIGQUERY_MATERIALIZATION_DATASET: test
BIGQUERY_APPLICATION_CREDENTIALS: ${{ github.workspace }}/adc.json
R_DEFAULT_INTERNET_TIMEOUT: 1800
# Override where sparklyr is getting Spark version information from
R_SPARKINSTALL_INSTALL_INFO_PATH: ${{ github.workspace }}/spark_versions.json

steps:
- uses: actions/checkout@v4

- name: Setup Java
uses: actions/[email protected]
with:
distribution: 'zulu'
java-version: ${{ matrix.config.java }}
java-package: jdk
architecture: x64

- name: Print effective Java version
run: java -version

- name: Set up R ${{ matrix.config.r-version }}
uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r-version }}
use-public-rspm: true

- name: Print effective R version
run: version
shell: Rscript {0}

- name: R CMD javareconf
if: runner.os != 'Windows'
run: |
java -version
echo java_home:$JAVA_HOME
echo library paths: $LD_LIBRARY_PATH
sudo R CMD javareconf JAVA_HOME=$JAVA_HOME

- name: Install and cache dependencies
uses: r-lib/actions/setup-r-dependencies@v2
with:
cache-version: ${{ matrix.config.java }}
extra-packages: |
any::rcmdcheck

- name: Install Microsoft Visual C++ 2010 SP1 Redistributable Package via Chocolatey
if: runner.os == 'Windows'
run: |
# Install the package using Chocolatey
choco install vcredist2010 -y --allow-empty-checksums

- name: Install Spark
run: |
info <- sparklyr::spark_install(version="${{ matrix.config.spark }}")
if (.Platform$OS.type == "windows") {
hadoop_path <- file.path(info$sparkVersionDir, "tmp", "hadoop")
hadoop_bin_path <- file.path(hadoop_path, "bin")
dir.create(hadoop_bin_path, recursive = TRUE)

cat(
c(
paste0("HADOOP_VERSION=", info$hadoopVersion),
paste0("HADOOP_HOME=", hadoop_path),
paste0("PATH=", Sys.getenv("PATH"), ":", hadoop_bin_path)
),
file = Sys.getenv("GITHUB_ENV"),
sep = "\n",
append = TRUE
)
}
shell: Rscript {0}

- name: Install Winutils
if: runner.os == 'Windows'
run: |
git clone https://github.com/cdarlint/winutils.git C:/tmp/winutils
LATEST_HADOOP_DIR=$(ls -d C:/tmp/winutils/hadoop-${HADOOP_VERSION}* | sort -V | tail -n 1)
echo "Latest Hadoop Directory: ${LATEST_HADOOP_DIR}"
cp -R ${LATEST_HADOOP_DIR}/bin ${HADOOP_HOME}
shell: bash

- name: Set Google application default credentials
env:
ADC: ${{ secrets.GCLOUD_APPLICATION_CREDENTIALS }}
shell: bash
run: |
echo "$ADC" > $BIGQUERY_APPLICATION_CREDENTIALS

- name: Run R CMD check
uses: r-lib/actions/check-r-package@v2
with:
error-on: '"error"'
check-dir: '"check"'

- name: Upload check results
if: failure()
uses: actions/upload-artifact@main
with:
name: ${{ matrix.config.os-name }}${{ matrix.config.os-version }}-Java${{ matrix.config.java }}-R${{ matrix.config.r-version }}-Spark${{ matrix.config.spark }}-results
path: |
check/sparkbq.Rcheck/
!check/sparkbq.Rcheck/00_pkg_src/
!adc.json

- name: Upload successfully built package
if: success()
uses: actions/upload-artifact@main
with:
name: ${{ matrix.config.os-name }}${{ matrix.config.os-version }}-Java${{ matrix.config.java }}-R${{ matrix.config.r-version }}-Spark${{ matrix.config.spark }}-results
path: |
check/sparkbq_*.tar.gz
!adc.json
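
Not part of the diff: a minimal local sketch of what this job effectively runs for the active matrix entry, assuming that r-lib/actions/check-r-package boils down to an rcmdcheck::rcmdcheck() call with the options shown above. The project ID, dataset and key file path are placeholders, not values from this repository.

# Sketch only; run from the package root with the assumptions noted above.
sparklyr::spark_install(version = "3.3")               # same Spark version as the active matrix entry

Sys.setenv(
  SPARK_VERSION                    = "3.3",
  BIGQUERY_PROJECT_ID              = "my-gcp-project",  # placeholder project ID
  BIGQUERY_MATERIALIZATION_DATASET = "test",
  BIGQUERY_APPLICATION_CREDENTIALS = "~/adc.json"       # placeholder path to a service account key
)

# Equivalent of the check step, assuming its error-on/check-dir inputs map to these arguments
rcmdcheck::rcmdcheck(error_on = "error", check_dir = "check")
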
4 changes: 4 additions & 0 deletions .gitignore
@@ -32,7 +32,11 @@ vignettes/*.pdf
*.utf8.md
*.knit.md
.Rproj.user
adc.json

# Spark-specific files
*.log
logs

# Ignore generated credentials from google-github-actions/auth
gha-creds-*.json
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: sparkbq
Type: Package
Title: Google 'BigQuery' Support for 'sparklyr'
Version: 0.1.1
Version: 0.2.0
Authors@R: c(person(family = "Mirai Solutions GmbH", role = "aut",
email = "[email protected]"),
person("Martin", "Studer", role = "cre",
@@ -11,13 +11,13 @@ Authors@R: c(person(family = "Mirai Solutions GmbH", role = "aut",
URL: http://www.mirai-solutions.com, https://github.com/miraisolutions/sparkbq
BugReports: https://github.com/miraisolutions/sparkbq/issues
Description: A 'sparklyr' extension package providing an integration with Google 'BigQuery'.
It supports direct import/export where records are directly streamed from/to 'BigQuery'.
In addition, data may be imported/exported via intermediate data extracts on Google 'Cloud Storage'.
Depends: R (>= 3.3.2)
Imports: sparklyr (>= 0.7.0)
Suggests: dplyr
Suggests:
dplyr,
testthat (>= 3.0.0)
License: GPL-3 | file LICENSE
SystemRequirements: Spark (>= 2.2.x)
SystemRequirements: Spark (>= 2.3)
Encoding: UTF-8
LazyData: yes
RoxygenNote: 6.1.1
RoxygenNote: 7.3.2
Config/testthat/edition: 3
7 changes: 3 additions & 4 deletions NAMESPACE
@@ -1,10 +1,9 @@
# Generated by roxygen2: do not edit by hand

export(bigquery_defaults)
export(default_bigquery_type)
export(default_billing_project_id)
export(default_dataset_location)
export(default_gcs_bucket)
export(default_materialization_dataset)
export(default_materialization_project)
export(default_project_id)
export(default_service_account_key_file)
export(spark_read_bigquery)
export(spark_write_bigquery)
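
Not part of the diff: a purely hypothetical sketch of how the revised exports might be combined. The argument names and values below are guesses inferred from the default_* helper names above, not confirmed signatures from this PR.

library(sparklyr)
library(sparkbq)

# Hypothetical usage only; every argument name here is an assumption.
bigquery_defaults(
  projectId              = "my-gcp-project",   # assumed: project to query/bill against
  materializationDataset = "test",             # assumed: dataset for materialized query results
  serviceAccountKeyFile  = "~/adc.json"        # assumed: path to a service account key
)

sc <- spark_connect(master = "local")
tbl <- spark_read_bigquery(
  sc, name = "shakespeare",
  projectId = "bigquery-public-data",          # assumed argument names
  datasetId = "samples",
  tableId   = "shakespeare"
)
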
3 changes: 3 additions & 0 deletions NEWS
@@ -1,5 +1,8 @@
# News

## 0.2.0 2024-10-??
* Upgrade to the official Google Spark BigQuery Connector (https://github.com/GoogleCloudDataproc/spark-bigquery-connector)

## 0.1.1 2019-12-18
* Use newer spark-bigquery spark package, updates gcloud dependencies (#14)
* Fix service account key file parameter passing (#13, #15)